├── CG ├── img │ ├── .gitkeep │ ├── Strong Scaling │ │ ├── cg_runtime_8A100.pdf │ │ └── cg_runtime_8A100.png │ ├── Constant Number of GPUs │ │ ├── matrix_speedup_table_1 GPU.pdf │ │ ├── matrix_speedup_table_2 GPUs.pdf │ │ ├── matrix_speedup_table_3 GPUs.pdf │ │ ├── matrix_speedup_table_4 GPUs.pdf │ │ ├── matrix_speedup_table_5 GPUs.pdf │ │ ├── matrix_speedup_table_6 GPUs.pdf │ │ ├── matrix_speedup_table_7 GPUs.pdf │ │ └── matrix_speedup_table_8 GPUs.pdf │ └── Operation Breakdown │ │ └── Discrete Pipelined Operation Breakdown.png ├── results │ ├── .gitkeep │ ├── cg_runtime_single_gpu-A100.csv │ ├── standard_speedup │ │ ├── pipelined_cg_speedup_1 GPU.txt │ │ ├── pipelined_cg_speedup_2 GPUs.txt │ │ ├── pipelined_cg_speedup_3 GPUs.txt │ │ ├── pipelined_cg_speedup_6 GPUs.txt │ │ ├── pipelined_cg_speedup_8 GPUs.txt │ │ ├── pipelined_cg_speedup_4 GPUs.txt │ │ ├── pipelined_cg_speedup_5 GPUs.txt │ │ └── pipelined_cg_speedup_7 GPUs.txt │ ├── pipelined_speedup │ │ ├── pipelined_cg_speedup_1 GPU.txt │ │ ├── pipelined_cg_speedup_2 GPUs.txt │ │ ├── pipelined_cg_speedup_3 GPUs.txt │ │ ├── pipelined_cg_speedup_4 GPUs.txt │ │ ├── pipelined_cg_speedup_5 GPUs.txt │ │ ├── pipelined_cg_speedup_6 GPUs.txt │ │ ├── pipelined_cg_speedup_7 GPUs.txt │ │ └── pipelined_cg_speedup_8 GPUs.txt │ ├── cg_operation_breakdown_8A100_discrete_pipelined.csv │ ├── cg_operation_breakdown_8A100_discrete_standard.csv │ ├── cg_operation_breakdown_4A100.txt │ └── cg_operation_breakdown_8A100.txt ├── nsys_reports │ └── .gitkeep ├── scripts │ ├── .gitignore │ ├── requirements.txt │ ├── calculate_nnz_num_rows_ratio.py │ ├── plots │ │ ├── plot_operation_breakdown.py │ │ └── common.py │ ├── download_matrices.py │ └── calculate_speedup.py ├── batch │ ├── Karolina │ │ ├── _load_karolina_modules.sh │ │ ├── _measure_single_gpu_runtime.sh │ │ └── _measure_total_runtime.sh │ ├── Simula │ │ ├── _load_simula_modules.sh │ │ ├── _measure_operation_breakdown.sh │ │ └── _measure_total_runtime.sh │ ├── A100-machine │ │ └── 
_load_A100-machine_modules.sh │ └── Truba │ │ ├── _load_truba_modules.sh │ │ ├── _measure_operation_breakdown.sh │ │ └── _measure_total_runtime.sh ├── include │ ├── single-stream │ │ ├── standard.cuh │ │ ├── pipelined.cuh │ │ ├── pipelined-gather.cuh │ │ ├── standard-saxpy-overlap.cuh │ │ └── pipelined-multi-overlap.cuh │ ├── baseline │ │ ├── discrete-standard.cuh │ │ └── discrete-pipelined.cuh │ └── profiling │ │ ├── discrete-standard.cuh │ │ └── discrete-pipelined.cuh ├── CMakeLists.txt ├── Makefile2 └── src │ └── single-gpu │ └── discrete-standard.cu ├── Plots ├── Images │ ├── .gitkeep │ ├── 8 GPUs (2048x4096).png │ ├── 2D_Weak_Scaling_256x256.png │ ├── 3D_Weak_Scaling_256x256x256.png │ ├── matrix_speedup_table_8 GPUs.png │ └── 2D_Weak_Scaling_No_Compute_16384x16384__.png ├── data │ ├── .gitkeep │ ├── comp.csv │ ├── no-comp.csv │ ├── fig5 │ │ ├── 2D_Weak_Scaling_8192x4096.csv │ │ ├── 2D_Weak_Scaling_2048x1024.csv │ │ └── 2D_Weak_Scaling_256x256.csv │ ├── fig6 │ │ ├── 3D_Weak_Scaling_256x256x256.csv │ │ ├── 3D_Strong_Scaling_No_Compute_512x512x512.csv │ │ └── 3D_Strong_Scaling_256x256x256.csv │ ├── 2d-weak-scaling-small.csv │ ├── 2d-weak-scaling-medium.csv │ ├── 2d-weak-scaling-large.csv │ └── 2d-comp.csv ├── .gitignore ├── requirements.txt ├── README.md ├── common.py ├── scaling-bar.py ├── comp-vs-comm.py ├── weak-scaling.py └── weak-scaling-2.py ├── Stencil ├── jacobi2D │ ├── scripts │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── bench.sh │ │ ├── constant_num_gpus_bench.sh │ │ ├── plot.py │ │ ├── strong_scale_bench_truba.sh │ │ ├── strong_scale_bench.sh │ │ ├── weak_scale_comm_bench_truba.sh │ │ ├── weak_scale_comp_bench_truba.sh │ │ └── weak_scale_bench.sh │ ├── PERKS │ │ ├── genconfig.cuh │ │ ├── common │ │ │ ├── jacobi_reference.hpp │ │ │ ├── jacobi_cuda.cuh │ │ │ └── types.hpp │ │ ├── config.cuh │ │ └── jacobi-general-wrapper.cu │ ├── include │ │ ├── PERKS │ │ │ └── multi-stream-perks.cuh │ │ ├── multi-stream │ │ │ └── multi-gpu-peer-tiling.cuh │ │ ├── 
baseline │ │ │ ├── multi-threaded-p2p.cuh │ │ │ ├── multi-threaded-copy.cuh │ │ │ ├── single-threaded-copy.cuh │ │ │ └── multi-threaded-copy-overlap.cuh │ │ ├── single-stream │ │ │ ├── multi-threaded-one-block-comm.cuh │ │ │ ├── multi-threaded-two-block-comm.cuh │ │ │ └── multi-threaded-one-block-comm-layer.cuh │ │ └── no-compute │ │ │ ├── multi-gpu-peer-tiling-no-compute.cuh │ │ │ ├── multi-threaded-p2p-no-compute.cuh │ │ │ ├── multi-threaded-copy-no-compute.cuh │ │ │ ├── multi-threaded-two-block-comm-no-compute.cuh │ │ │ ├── multi-threaded-one-block-comm-no-compute.cuh │ │ │ ├── multi-threaded-copy-overlap-no-compute.cuh │ │ │ └── multi-threaded-one-block-comm-layer-no-compute.cuh │ ├── include_nvshmem │ │ ├── PERKS │ │ │ └── multi-stream-perks.cuh │ │ ├── multi-stream │ │ │ ├── multi-gpu-peer-tiling.cuh │ │ │ └── multi-gpu-multi-block-tiling.cuh │ │ ├── baseline │ │ │ ├── multi-threaded-nvshmem.cuh │ │ │ └── multi-threaded-nvshmem-opt.cuh │ │ ├── no-compute │ │ │ ├── design-1-multi-block-no-compute.cuh │ │ │ ├── multi-gpu-peer-tiling-no-compute.cuh │ │ │ ├── multi-threaded-nvshmem-no-compute.cuh │ │ │ ├── multi-threaded-nvshmem-opt-no-compute.cuh │ │ │ ├── multi-threaded-one-block-comm-no-compute.cuh │ │ │ └── multi-threaded-two-block-comm-no-compute.cuh │ │ └── single-stream │ │ │ ├── multi-threaded-two-block-comm.cuh │ │ │ ├── multi-threaded-one-block-comm.cuh │ │ │ └── multi-threaded-multi-block-comm.cuh │ ├── CMakeLists.txt │ ├── src │ │ └── main.cu │ └── src_nvshmem │ │ └── main.cu ├── jacobi3D │ ├── scripts │ │ ├── .gitignore │ │ ├── requirements.txt │ │ ├── run-bench.sh │ │ ├── bench.sh │ │ ├── constant_num_gpus_bench.sh │ │ ├── plot.py │ │ ├── multi-node.sh │ │ └── strong_scale_bench.sh │ ├── src │ │ ├── PERKS │ │ │ ├── genconfig.cuh │ │ │ ├── common │ │ │ │ ├── jacobi_cuda.cuh │ │ │ │ ├── jacobi_reference.hpp │ │ │ │ └── types.hpp │ │ │ └── config.cuh │ │ └── main.cu │ ├── src_nvshmem │ │ ├── PERKS-nvshmem │ │ │ ├── genconfig.cuh │ │ │ ├── common │ │ │ 
│ ├── jacobi_cuda.cuh │ │ │ │ ├── jacobi_reference.hpp │ │ │ │ └── types.hpp │ │ │ └── config.cuh │ │ └── main.cu │ ├── include │ │ ├── PERKS │ │ │ └── multi-stream-perks.cuh │ │ ├── PERKS-nvshmem │ │ │ └── multi-stream-perks-nvshmem.h │ │ ├── multi-stream │ │ │ └── multi-gpu-peer-tiling.cuh │ │ ├── baseline │ │ │ ├── multi-threaded-p2p.cuh │ │ │ ├── multi-threaded-copy.cuh │ │ │ ├── single-threaded-copy.cuh │ │ │ └── multi-threaded-copy-overlap.cuh │ │ ├── single-stream │ │ │ ├── multi-threaded-one-block-comm.cuh │ │ │ ├── multi-threaded-two-block-comm.cuh │ │ │ └── multi-threaded-multi-block-comm.cuh │ │ └── no-compute │ │ │ ├── multi-gpu-peer-tiling-no-compute.cuh │ │ │ ├── multi-threaded-p2p-no-compute.cuh │ │ │ ├── multi-threaded-copy-no-compute.cuh │ │ │ ├── multi-threaded-two-block-comm-no-compute.cuh │ │ │ ├── multi-threaded-one-block-comm-no-compute.cuh │ │ │ ├── multi-threaded-multi-block-comm-no-compute.cuh │ │ │ └── multi-threaded-copy-overlap-no-compute.cuh │ ├── include_nvshmem │ │ ├── PERKS-nvshmem │ │ │ ├── multi-stream-perks-nvshmem.h │ │ │ └── multi-stream-perks-nvshmem-block.h │ │ ├── multi-stream │ │ │ ├── multi-gpu-peer-tiling.cuh │ │ │ └── multi-gpu-multi-block-tiling.cuh │ │ ├── baseline │ │ │ ├── multi-threaded-nvshmem.cuh │ │ │ └── multi-threaded-nvshmem-opt.cuh │ │ ├── single-stream │ │ │ ├── multi-threaded-one-block-comm.cuh │ │ │ ├── multi-threaded-two-block-comm.cuh │ │ │ └── multi-threaded-multi-block-comm.cuh │ │ └── no-compute │ │ │ ├── multi-gpu-peer-tiling-no-compute.cuh │ │ │ ├── multi-threaded-nvshmem-no-compute.cuh │ │ │ ├── multi-threaded-nvshmem-opt-no-compute.cuh │ │ │ ├── multi-threaded-one-block-comm-no-compute.cuh │ │ │ ├── multi-threaded-two-block-comm-no-compute.cuh │ │ │ └── multi-threaded-multi-block-comm-no-compute.cuh │ └── CMakeLists.txt ├── CMakeLists.txt └── Makefile2 ├── .gitignore ├── Makefile2 ├── CMakeLists.txt ├── LICENSE ├── common.mk ├── Scripts └── full_bench.py └── .clang-format /CG/img/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CG/results/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Plots/Images/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Plots/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /CG/nsys_reports/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Plots/.gitignore: -------------------------------------------------------------------------------- 1 | Images/ 2 | data/ 3 | -------------------------------------------------------------------------------- /CG/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | __pycache__ -------------------------------------------------------------------------------- /Stencil/jacobi2D/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | venv -------------------------------------------------------------------------------- /Stencil/jacobi3D/scripts/.gitignore: -------------------------------------------------------------------------------- 1 | venv -------------------------------------------------------------------------------- /Plots/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | matplotlib 4 | -------------------------------------------------------------------------------- 
/CG/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | pandas 3 | matplotlib 4 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/PERKS/genconfig.cuh: -------------------------------------------------------------------------------- 1 | 2 | 3 | // #define REG_FOLDER_Y (0) 4 | #define REG_FOLDER_Y (0) -------------------------------------------------------------------------------- /Plots/Images/8 GPUs (2048x4096).png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/8 GPUs (2048x4096).png -------------------------------------------------------------------------------- /Stencil/jacobi3D/src/PERKS/genconfig.cuh: -------------------------------------------------------------------------------- 1 | // #define REG_FOLDER_Z (8) 2 | // #define REG_FOLDER_Z (12) 3 | #define REG_FOLDER_Z (0) -------------------------------------------------------------------------------- /Plots/Images/2D_Weak_Scaling_256x256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/2D_Weak_Scaling_256x256.png -------------------------------------------------------------------------------- /CG/img/Strong Scaling/cg_runtime_8A100.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Strong Scaling/cg_runtime_8A100.pdf -------------------------------------------------------------------------------- /CG/img/Strong Scaling/cg_runtime_8A100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Strong Scaling/cg_runtime_8A100.png 
-------------------------------------------------------------------------------- /Stencil/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(jacobi2D) 2 | add_subdirectory(jacobi3D) 3 | 4 | add_custom_target(jacobi jacobi2D jacobi2D_nvshmem) 5 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/genconfig.cuh: -------------------------------------------------------------------------------- 1 | // #define REG_FOLDER_Z (8) 2 | // #define REG_FOLDER_Z (12) 3 | #define REG_FOLDER_Z (0) -------------------------------------------------------------------------------- /Plots/Images/3D_Weak_Scaling_256x256x256.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/3D_Weak_Scaling_256x256x256.png -------------------------------------------------------------------------------- /Plots/Images/matrix_speedup_table_8 GPUs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/matrix_speedup_table_8 GPUs.png -------------------------------------------------------------------------------- /CG/batch/Karolina/_load_karolina_modules.sh: -------------------------------------------------------------------------------- 1 | ml NVSHMEM/2.9.0-gompi-2022a-CUDA-11.7.0 2 | ml Python/3.10.4-GCCcore-11.3.0 3 | export NVSHMEM_IB_ENABLE_IBGDA=true -------------------------------------------------------------------------------- /Plots/README.md: -------------------------------------------------------------------------------- 1 | ## Figure 5: 2 | ```bash 3 | ./weak-scaling.py fig5/*.csv 4 | ``` 5 | 6 | ## Figure 6: 7 | ```bash 8 | ./weak-scaling-2.py fig6/*.csv 9 | ``` 10 | 
-------------------------------------------------------------------------------- /Plots/Images/2D_Weak_Scaling_No_Compute_16384x16384__.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/2D_Weak_Scaling_No_Compute_16384x16384__.png -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_1 GPU.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_1 GPU.pdf -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_2 GPUs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_2 GPUs.pdf -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_3 GPUs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_3 GPUs.pdf -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_4 GPUs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_4 GPUs.pdf -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_5 GPUs.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_5 GPUs.pdf -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_6 GPUs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_6 GPUs.pdf -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_7 GPUs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_7 GPUs.pdf -------------------------------------------------------------------------------- /CG/img/Constant Number of GPUs/matrix_speedup_table_8 GPUs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_8 GPUs.pdf -------------------------------------------------------------------------------- /CG/img/Operation Breakdown/Discrete Pipelined Operation Breakdown.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Operation Breakdown/Discrete Pipelined Operation Breakdown.png -------------------------------------------------------------------------------- /Plots/data/comp.csv: -------------------------------------------------------------------------------- 1 | Version,1 GPU (256x256x256),2 GPUs (256x256x512),4 GPUs (256x256x1024),8 GPUs (256x256x2048) 2 | Baseline Overlap,12.4439,12.1929,12.2302,12.3219 3 | Baseline P2P,11.9452,13.3371,13.5145,13.6295 4 | Ours,15.1575,15.2379,15.2893,15.2893 
-------------------------------------------------------------------------------- /Stencil/jacobi2D/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.11.0 2 | fonttools==4.29.1 3 | kiwisolver==1.3.2 4 | numpy==1.22.2 5 | packaging==21.3 6 | pandas==1.4.1 7 | Pillow==9.0.1 8 | pyparsing==3.0.7 9 | python-dateutil==2.8.2 10 | pytz==2021.3 11 | six==1.16.0 12 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | cycler==0.11.0 2 | fonttools==4.29.1 3 | kiwisolver==1.3.2 4 | numpy==1.22.2 5 | packaging==21.3 6 | pandas==1.4.1 7 | Pillow==9.0.1 8 | pyparsing==3.0.7 9 | python-dateutil==2.8.2 10 | pytz==2021.3 11 | six==1.16.0 12 | -------------------------------------------------------------------------------- /Plots/data/no-comp.csv: -------------------------------------------------------------------------------- 1 | Version,1 GPU (256x256x256),2 GPUs (256x256x512),4 GPUs (256x256x1024),8 GPUs (256x256x2048) 2 | Baseline Copy Overlap (No Compute),2.5735,6.952,7.8268,10.0821 3 | Baseline P2P (No Compute),3.9442,5.209,5.3338,5.9174 4 | NVSHMEM Double Stream (No Compute),1.7782,1.9406,1.9862,2.0168 5 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/PERKS/multi-stream-perks.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_STREAM_PERKS_CUH 2 | #define INC_2D_STENCIL_MULTI_STREAM_PERKS_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiStreamPERKS { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_STREAM_PERKS_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/PERKS/multi-stream-perks.cuh: 
-------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_PERKS_H 2 | #define INC_3D_STENCIL_MULTI_THREADED_PERKS_H 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiStreamPERKS { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_THREADED_PERKS_H 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .idea/ 3 | venv/ 4 | __pycache__/ 5 | 6 | CMakeFiles/ 7 | CMakeCache.txt 8 | Makefile 9 | 10 | *.cmake 11 | *.o 12 | *.out 13 | 14 | bin/ 15 | obj/ 16 | build/ 17 | 18 | cg 19 | jacobi 20 | jacobi_nvshmem 21 | */obj/* 22 | !obj/.gitkeep 23 | */obj_nvshmem/ 24 | 25 | # Benchmarking artifacts 26 | *.log 27 | -------------------------------------------------------------------------------- /Makefile2: -------------------------------------------------------------------------------- 1 | include common.mk 2 | 3 | include Stencil/Makefile 4 | include CG/Makefile 5 | 6 | all: stencil cg 7 | 8 | SOURCES := $(shell find . 
-type f -name '*.cu' -or -name '*.c' -or -name '*.cuh' -or -name '*.h' -or -name '*.cpp') 9 | 10 | .PHONY: format 11 | format: $(SOURCES) 12 | clang-format --style=file:.clang-format -i $^ 13 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/multi-stream/multi-gpu-peer-tiling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_PEER_TILING_CUH 2 | #define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTiling { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/PERKS-nvshmem/multi-stream-perks-nvshmem.h: -------------------------------------------------------------------------------- 1 | #ifndef JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H 2 | #define JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiStreamPERKSNvshmem { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/multi-stream/multi-gpu-peer-tiling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_CUH 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTiling { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem.h: -------------------------------------------------------------------------------- 1 | #ifndef 
JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H 2 | #define JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiStreamPERKSNvshmem { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/baseline/multi-threaded-p2p.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedP2P { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/baseline/multi-threaded-p2p.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedP2P { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/baseline/multi-threaded-copy.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopy { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH 11 | -------------------------------------------------------------------------------- 
/Stencil/jacobi2D/include_nvshmem/PERKS/multi-stream-perks.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_STREAM_PERKS_NVSHMEM_CUH 2 | #define INC_2D_STENCIL_MULTI_STREAM_PERKS_NVSHMEM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiStreamPERKSNVSHMEM { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_STREAM_PERKS_NVSHMEM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/baseline/multi-threaded-copy.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopy { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/baseline/single-threaded-copy.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH 2 | #define INC_2D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineSingleThreadedCopy { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/baseline/single-threaded-copy.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH 2 | #define INC_3D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineSingleThreadedCopy { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | 
#endif // INC_3D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem-block.h: -------------------------------------------------------------------------------- 1 | #ifndef JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_BLOCK_H 2 | #define JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_BLOCK_H 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiStreamPERKSNvshmemBlock { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_BLOCK_H 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/single-stream/multi-threaded-one-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockComm { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/single-stream/multi-threaded-two-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 2 | #define INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockComm { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef 
INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH 2 | #define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTilingNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/single-stream/multi-threaded-one-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 2 | #define INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockComm { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/single-stream/multi-threaded-two-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 2 | #define INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockComm { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTilingNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH 11 | 
-------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/baseline/multi-threaded-nvshmem.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmem { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/baseline/multi-threaded-nvshmem.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmem { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH 11 | -------------------------------------------------------------------------------- /Plots/data/fig5/2D_Weak_Scaling_8192x4096.csv: -------------------------------------------------------------------------------- 1 | Version,1 GPU (8192x4096),2 GPUs (8192x8192),4 GPUs (16384x8192),8 GPUs (16384x16384) 2 | Baseline Copy,2.2274,2.3971,2.4163,2.4221 3 | Baseline Overlap,2.1595,2.1635,2.1902,2.2097 4 | Baseline P2P,2.1315,2.2578,2.2941,2.3263 5 | Baseline NVSHMEM,2.2502,2.3054,2.3134,2.3307 6 | Ours,2.6494,2.6536,2.6965,2.6987 7 | PERKS + Ours,1.7114,1.7167,1.7254,1.7278 8 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/no-compute/multi-gpu-peer-tiling-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH 2 | 
#define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTilingNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/no-compute/multi-gpu-peer-tiling-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTilingNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/single-stream/multi-threaded-multi-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 2 | #define INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedMultiBlockComm { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/baseline/multi-threaded-copy-overlap.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopyOverlap { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH 11 | 
-------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/no-compute/design-1-multi-block-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_DESIGN_1_MULTI_BLOCK_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_DESIGN_1_MULTI_BLOCK_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace Design1MultiBlockNoComputation { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_DESIGN_1_MULTI_BLOCK_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/baseline/multi-threaded-copy-overlap.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopyOverlap { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmemOpt { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh: -------------------------------------------------------------------------------- 1 | #ifndef 
INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmemOpt { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/no-compute/multi-threaded-p2p-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedP2PNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/single-stream/multi-threaded-one-block-comm-layer.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_CUH 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommLayer { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/no-compute/multi-threaded-p2p-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedP2PNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 
10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/no-compute/multi-threaded-copy-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopyNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /CG/batch/Simula/_load_simula_modules.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | module load nvidia/nvhpc/22.3 4 | module load gcc/11.2.0 5 | module load python-3.7.4 6 | 7 | COMM_LIBS_PATH="$NVHPC_ROOT"/comm_libs 8 | MATH_LIBS_PATH="$NVHPC_ROOT"/math_libs/lib64 9 | 10 | export NVCC="$NVHPC_ROOT"/cuda/bin/nvcc 11 | export MPI_HOME="$COMM_LIBS_PATH"/mpi 12 | export NVSHMEM_HOME="$COMM_LIBS_PATH"/nvshmem 13 | export MATH_LIBS_PATH="$MATH_LIBS_PATH" -------------------------------------------------------------------------------- /Plots/data/fig5/2D_Weak_Scaling_2048x1024.csv: -------------------------------------------------------------------------------- 1 | Version,1 GPU (2048x1024),2 GPUs (2048x2048),4 GPUs (4096x2048),8 GPUs (4096x4096) 2 | Baseline Copy,18.0521,42.88,67.3238,83.8113 3 | Baseline Overlap,19.9917,73.2787,86.0336,102.6413 4 | Baseline P2P,11.9066,33.1121,48.0276,63.7284 5 | Baseline NVSHMEM,20.2334,24.1339,23.9423,24.2244 6 | CPU-Free (Ours),12.095,12.3177,12.2255,12.3814 7 | PERKS + Ours,14.3985,14.5411,14.5367,14.5505 8 | -------------------------------------------------------------------------------- /Plots/data/fig5/2D_Weak_Scaling_256x256.csv: 
-------------------------------------------------------------------------------- 1 | Version,1 GPU (256x256),2 GPUs (256x512),4 GPUs (512x512),8 GPUs (512x1024) 2 | Baseline Copy,12.7818,42.7438,64.9559,81.4121 3 | Baseline Overlap,18.2912,63.732,75.788,100.9295 4 | Baseline P2P,4.0037,28.105,39.5831,61.4564 5 | Baseline NVSHMEM,13.5165,17.3821,14.4411,14.6838 6 | CPU-Free (Ours),11.0862,11.0697,11.4963,11.4811 7 | CPU-Free (Ours + PERKS),16.1236,15.291,16.6956,16.73 8 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/no-compute/multi-threaded-copy-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopyNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockCommNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 2 | #define 
INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockCommNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/no-compute/multi-threaded-two-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockCommNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // 
INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH 2 | #define INC_2D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUMultiBlockPeerTilingNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTilingNvshmemNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedMultiBlockCommNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- 
/Stencil/jacobi3D/include/no-compute/multi-threaded-two-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockCommNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH 2 | #define INC_3D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUMultiBlockPeerTilingNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace MultiGPUPeerTilingNvshmemNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh: -------------------------------------------------------------------------------- 1 | #ifndef 
INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedMultiBlockCommNvshmem { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/no-compute/multi-threaded-one-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/no-compute/multi-threaded-one-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Plots/data/fig6/3D_Weak_Scaling_256x256x256.csv: -------------------------------------------------------------------------------- 1 | Version,1 GPUs (256x256x256),2 GPUs (256x256x512),4 GPUs (256x512x512),8 GPUs (512x512x512) 2 | Baseline Copy,12.8902,14.339,15.2628,15.5506 3 | Baseline Overlap,12.3302,12.3777,12.5602,12.7043 4 | Baseline 
P2P,12.1258,13.4727,14.257,14.2942 5 | Baseline NVSHMEM,33.7791,36.2173,37.692,26.6532 6 | CPU-Free (Ours),21.6456,21.5007,21.6423,21.3508 7 | CPU-Free (Ours + PERKS),12.3902,12.6408,13.7867,24.335 8 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmemNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/no-compute/multi-threaded-multi-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedMultiBlockCommNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmemNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // 
INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/no-compute/multi-threaded-copy-overlap-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopyOverlapNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include/no-compute/multi-threaded-copy-overlap-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedCopyOverlapNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include/no-compute/multi-threaded-one-block-comm-layer-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommLayerNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_NO_COMPUTE_CUH 11 | 
-------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmemOptNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineMultiThreadedNvshmemOptNoCompute { 7 | int init(int argc, char **argv); 8 | } 9 | 10 | #endif // INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Plots/data/2d-weak-scaling-small.csv: -------------------------------------------------------------------------------- 1 | Small Domain Weak Scaling 2 | Version,1 GPU (256x256),2 GPUs (256x512),4 GPUs (512x512),8 GPUs (512x1024) 3 | Single Stream 2TB,2.4132,3.2832,3.4327,3.8099 4 | Single Stream 1TB,3.0105,4.6844,4.734,5.0006 5 | Baseline Copy Overlap,12.3663,31.1103,69.8461,96.4217 6 | Baseline Copy,7.9615,23.6853,55.2348,74.4993 7 | Baseline P2P,3.224,18.2069,33.2891,58.6196 8 | Design 1,4.4098,5.0639,5.0658,5.9583 9 | PERKS,1.5574,1.5892,1.6003,1.616 -------------------------------------------------------------------------------- 
/Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommNvshmemNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockCommNvshmemNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedOneBlockCommNvshmemNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- 
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedTwoBlockCommNvshmemNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /Plots/data/2d-weak-scaling-medium.csv: -------------------------------------------------------------------------------- 1 | Medium Domain Weak Scaling 2 | Version,1 GPU (1024x1024),2 GPUs (1024x2048),4 GPUs (2048x2048),8 GPUs (2048x4096) 3 | Single Stream 2TB,7.192,8.612,8.8903,8.9654 4 | Single Stream 1TB,7.2055,8.8046,9.127,15.5033 5 | Baseline Copy Overlap,12.2322,31.4607,71.3725,95.137 6 | Baseline Copy,12.7136,25.9365,55.0721,73.8571 7 | Baseline P2P,7.3627,18.3491,44.5818,60.2386 8 | Design 1,9.5494,10.8306,11.1018,10.747 9 | PERKS,1.5574,1.5892,1.6003,1.616 -------------------------------------------------------------------------------- /Plots/data/fig6/3D_Strong_Scaling_No_Compute_512x512x512.csv: -------------------------------------------------------------------------------- 1 | Version,1,2,3,4,5,6,7,8 2 | Baseline Copy,16.4393,11.0126,8.7214,7.1926,7.2223,7.5639,8.391,9.038 3 | Baseline Overlap,16.1371,8.0562,7.4151,7.8871,8.2864,9.4964,9.9443,10.0324 4 | Baseline P2P,29.6104,17.0617,12.298,9.706,8.3281,7.3623,6.6295,6.5249 5 | Baseline NVSHMEM (No Computation),57.0715,32.6568,23.7486,18.723,16.4555,14.1128,13.8677,12.7757 6 | CPU-Free (Ours),3.8826,4.1155,4.119,4.1262,4.1281,4.1229,4.126,4.1301 7 | -------------------------------------------------------------------------------- 
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-multi-block-comm-no-compute.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SSMultiThreadedMultiBlockCommNvshmemNoCompute { 7 | int init(int argc, char** argv); 8 | } 9 | 10 | #endif // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH 11 | -------------------------------------------------------------------------------- /CG/results/cg_runtime_single_gpu-A100.csv: -------------------------------------------------------------------------------- 1 | Matrix,Runtime 2 | tridiagonal,18.6203 3 | ecology2,1.651 4 | hood,3.8152 5 | bmwcra_1,3.7064 6 | consph,1.7955 7 | thermomech_dM,0.9367 8 | tmt_sym,1.5808 9 | crankseg_1,8.6219 10 | crankseg_2,10.0012 11 | Queen_4147,65.3094 12 | Bump_2911,31.9607 13 | G3_circuit,2.2177 14 | StocF-1465,3.8258 15 | Flan_1565,26.1217 16 | audikw_1,25.5873 17 | Serena,16.2381 18 | Geo_1438,16.3255 19 | Hook_1498,16.2435 20 | ldoor,14.3055 21 | -------------------------------------------------------------------------------- /Plots/data/2d-weak-scaling-large.csv: -------------------------------------------------------------------------------- 1 | Large Domain Weak Scaling 2 | Version,1 GPU (8192x4096),2 GPUs (8192x8192),4 GPUs (8192x16384),8 GPUs (16384x16384) 3 | Single Stream 2TB,26.7276,26.6932,26.9133,26.9483 4 | Single Stream 1TB,26.8677,26.8376,27.0564,27.0928 5 | Baseline Copy Overlap,22.0284,21.7194,21.7976,21.8375 6 | Baseline Copy,22.2609,23.5586,23.6619,23.7858 7 | Baseline P2P,21.5407,22.2303,22.8034,22.9545 8 | Design 1,26.8585,26.8136,27.058,27.1067 9 | PERKS,20.9525,21.0912,21.3203,21.8428 -------------------------------------------------------------------------------- 
/Plots/data/fig6/3D_Strong_Scaling_256x256x256.csv: -------------------------------------------------------------------------------- 1 | Version,1,2,3,4,5,6,7,8 2 | Baseline Copy,1.2854,0.8522,0.6712,0.5699,0.6781,0.7493,0.8319,0.8308 3 | Baseline Overlap,1.2599,0.6687,0.671,0.661,0.8533,0.8479,0.9338,0.9649 4 | Baseline P2P,1.2433,0.7529,0.5684,0.4499,0.499,0.5614,0.5165,0.5842 5 | Baseline NVSHMEM,3.3892,1.9805,1.4403,1.2186,1.0555,0.8368,0.7347,0.7 6 | CPU-Free (Ours),2.1729,0.9247,0.6057,0.4807,0.3626,0.3254,0.2766,0.2388 7 | CPU-Free (Ours + PERKS),1.2591,0.6948,0.5312,0.5198,0.4622,0.4687,0.4612,0.4659 8 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/scripts/run-bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=stencil-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition hgx2q 7 | #SBATCH --time=01:30:00 8 | #SBATCH --output=sbatch_output_%j.log 9 | 10 | SCRIPT="./scripts/weak_scale_bench.sh" 11 | 12 | ARGS=( 13 | "256 256 128 10000" 14 | "256 256 256 10000" 15 | "512 512 32 1000" 16 | "512 512 512 100" 17 | ) 18 | 19 | for i in "${ARGS[@]}" 20 | do 21 | "$SCRIPT" $i; 22 | printf '\n\n' 23 | done 24 | -------------------------------------------------------------------------------- /CG/include/single-stream/standard.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_SINGLE_STREAM_STANDARD_CUH 2 | #define INC_CG_SINGLE_STREAM_STANDARD_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SingleStreamStandard { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 
| #endif // INC_CG_SINGLE_STREAM_STANDARD_CUH 14 | -------------------------------------------------------------------------------- /CG/include/single-stream/pipelined.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_SINGLE_STREAM_PIPELINED_CUH 2 | #define INC_CG_SINGLE_STREAM_PIPELINED_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SingleStreamPipelined { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_SINGLE_STREAM_PIPELINED_CUH 14 | -------------------------------------------------------------------------------- /CG/include/baseline/discrete-standard.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_BASELINE_DISCRETE_STANDARD_CUH 2 | #define INC_CG_BASELINE_DISCRETE_STANDARD_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineDiscreteStandard { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_BASELINE_DISCRETE_STANDARD_CUH 14 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/src/PERKS/common/jacobi_cuda.cuh: -------------------------------------------------------------------------------- 1 | #ifndef PERKS_CUDA_HEADER 2 | #define PERKS_CUDA_HEADER 3 | // template 4 | 5 | // this is where the aimed implementation located 6 | template 7 | int j3d_iterative(REAL*, int, 
int, int, REAL*, int, int, int, bool, bool, int, bool, 8 | bool getminHeight = false); 9 | 10 | #define PERKS_DECLARE_INITIONIZATION_ITERATIVE(_type) \ 11 | int j3d_iterative(_type*, int, int, int, _type*, int, int, int, bool, bool, int, bool, bool); 12 | 13 | template 14 | int getMinWidthY(int, int, int, bool isDoubleTile = false); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /CG/include/baseline/discrete-pipelined.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_BASELINE_DISCRETE_PIPELINED_CUH 2 | #define INC_CG_BASELINE_DISCRETE_PIPELINED_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace BaselineDiscretePipelined { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_BASELINE_DISCRETE_PIPELINED_CUH 14 | -------------------------------------------------------------------------------- /CG/include/profiling/discrete-standard.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_PROFILING_DISCRETE_STANDARD_CUH 2 | #define INC_CG_PROFILING_DISCRETE_STANDARD_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace ProfilingDiscreteStandard { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_PROFILING_DISCRETE_STANDARD_CUH 14 | 
-------------------------------------------------------------------------------- /CG/include/profiling/discrete-pipelined.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_PROFILING_DISCRETE_PIPELINED_CUH 2 | #define INC_CG_PROFILING_DISCRETE_PIPELINED_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace ProfilingDiscretePipelined { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_PROFILING_DISCRETE_PIPELINED_CUH 14 | -------------------------------------------------------------------------------- /CG/include/single-stream/pipelined-gather.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_SINGLE_STREAM_PIPELINED_GATHER_CUH 2 | #define INC_CG_SINGLE_STREAM_PIPELINED_GATHER_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SingleStreamPipelinedGather { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_SINGLE_STREAM_PIPELINED_GATHER_CUH 14 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/common/jacobi_cuda.cuh: -------------------------------------------------------------------------------- 1 | #ifndef PERKS_CUDA_HEADER 2 | #define PERKS_CUDA_HEADER 3 | // template 4 | 5 | // this is where the aimed implementation located 6 | template 7 | int 
j3d_iterative(REAL*, int, int, int, REAL*, int, int, int, bool, bool, int, bool, 8 | bool getminHeight = false); 9 | 10 | #define PERKS_DECLARE_INITIONIZATION_ITERATIVE(_type) \ 11 | int j3d_iterative(_type*, int, int, int, _type*, int, int, int, bool, bool, int, bool, bool); 12 | 13 | template 14 | int getMinWidthY(int, int, int, bool isDoubleTile = false); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_1 GPU.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 4.426455 2 | Flan_1565 5.112643 3 | G3_circuit 2.494318 4 | Geo_1438 4.623393 5 | Hook_1498 3.594672 6 | Queen_4147 3.705777 7 | Serena 3.341596 8 | StocF-1465 2.237801 9 | audikw_1 4.304354 10 | bmwcra_1 2.411268 11 | consph 2.783882 12 | crankseg_1 1.820689 13 | crankseg_2 1.742189 14 | ecology2 2.929725 15 | hood 4.932044 16 | ldoor 4.913723 17 | thermomech_dM 5.425409 18 | tmt_sym 3.102883 19 | Persistent vs Discrete Standard CG geo mean speedup on 1 GPU: 3.3515941736836967 -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_2 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 7.994366 2 | Flan_1565 9.729552 3 | G3_circuit 5.990779 4 | Geo_1438 9.328254 5 | Hook_1498 7.015388 6 | Queen_4147 7.065026 7 | Serena 6.570781 8 | StocF-1465 4.973564 9 | audikw_1 6.662346 10 | bmwcra_1 4.332335 11 | consph 5.633529 12 | crankseg_1 3.272440 13 | crankseg_2 3.270840 14 | ecology2 6.744991 15 | hood 5.337812 16 | ldoor 7.827719 17 | thermomech_dM 7.555006 18 | tmt_sym 6.046260 19 | Persistent vs Discrete Standard CG geo mean speedup on 2 GPUs: 6.148600489496772 -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_3 GPUs.txt: 
-------------------------------------------------------------------------------- 1 | Bump_2911 2.596158 2 | Flan_1565 3.203920 3 | G3_circuit 1.603297 4 | Geo_1438 2.490385 5 | Hook_1498 1.937174 6 | Queen_4147 3.445680 7 | Serena 1.830493 8 | StocF-1465 1.493452 9 | audikw_1 1.998953 10 | bmwcra_1 1.196542 11 | consph 1.373582 12 | crankseg_1 1.000050 13 | crankseg_2 1.005493 14 | ecology2 1.731788 15 | hood 1.398411 16 | ldoor 2.288784 17 | thermomech_dM 1.791408 18 | tmt_sym 1.774122 19 | Persistent vs Discrete Standard CG geo mean speedup on 3 GPUs: 1.789093534511638 -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_6 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.334338 2 | Flan_1565 2.720973 3 | G3_circuit 1.660865 4 | Geo_1438 2.323742 5 | Hook_1498 1.862936 6 | Queen_4147 3.157614 7 | Serena 1.773258 8 | StocF-1465 1.477720 9 | audikw_1 1.178450 10 | bmwcra_1 1.194367 11 | consph 1.332771 12 | crankseg_1 0.997814 13 | crankseg_2 1.007675 14 | ecology2 1.801915 15 | hood 1.356637 16 | ldoor 1.420461 17 | thermomech_dM 1.662827 18 | tmt_sym 1.744974 19 | Persistent vs Discrete Standard CG geo mean speedup on 6 GPUs: 1.637689605919504 -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_8 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.078453 2 | Flan_1565 2.489724 3 | G3_circuit 1.736853 4 | Geo_1438 1.885539 5 | Hook_1498 1.607650 6 | Queen_4147 3.275249 7 | Serena 1.400108 8 | StocF-1465 1.399548 9 | audikw_1 0.959414 10 | bmwcra_1 1.181162 11 | consph 1.360530 12 | crankseg_1 0.998168 13 | crankseg_2 0.998363 14 | ecology2 1.831156 15 | hood 1.370214 16 | ldoor 1.152638 17 | thermomech_dM 1.713865 18 | tmt_sym 1.872217 19 | Persistent vs Discrete Standard CG geo mean speedup on 8 
GPUs: 1.544213236325518 -------------------------------------------------------------------------------- /CG/include/single-stream/standard-saxpy-overlap.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_SINGLE_STREAM_STANDARD_SAXPY_OVERLAP_CUH 2 | #define INC_CG_SINGLE_STREAM_STANDARD_SAXPY_OVERLAP_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SingleStreamStandardSaxpyOverlap { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_SINGLE_STREAM_STANDARD_SAXPY_OVERLAP_CUH 14 | -------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_1 GPU.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 4.375819 2 | Flan_1565 5.052174 3 | G3_circuit 2.200405 4 | Geo_1438 4.382149 5 | Hook_1498 3.484535 6 | Queen_4147 3.664555 7 | Serena 3.355091 8 | StocF-1465 2.192997 9 | audikw_1 4.212806 10 | bmwcra_1 2.354507 11 | consph 2.495528 12 | crankseg_1 1.872879 13 | crankseg_2 1.814216 14 | ecology2 2.404500 15 | hood 4.038322 16 | ldoor 4.695067 17 | thermomech_dM 4.532305 18 | tmt_sym 2.508903 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 1 GPU: 3.1385564179041943 -------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_2 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 7.341344 2 | Flan_1565 8.908230 3 | G3_circuit 5.056324 4 | Geo_1438 8.948598 5 | Hook_1498 6.752378 6 | Queen_4147 5.812224 7 | Serena 6.488967 8 | StocF-1465 4.721093 9 | audikw_1 
5.705003 10 | bmwcra_1 4.057007 11 | consph 4.543359 12 | crankseg_1 2.812483 13 | crankseg_2 2.760180 14 | ecology2 5.792145 15 | hood 2.951552 16 | ldoor 7.557363 17 | thermomech_dM 8.461633 18 | tmt_sym 5.995432 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 2 GPUs: 5.467063971899593 -------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_3 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.569157 2 | Flan_1565 3.232169 3 | G3_circuit 1.550415 4 | Geo_1438 2.510828 5 | Hook_1498 1.961816 6 | Queen_4147 3.457249 7 | Serena 1.944080 8 | StocF-1465 1.491928 9 | audikw_1 1.781032 10 | bmwcra_1 1.147545 11 | consph 1.299778 12 | crankseg_1 1.005540 13 | crankseg_2 1.021906 14 | ecology2 1.791548 15 | hood 1.303192 16 | ldoor 2.333952 17 | thermomech_dM 2.034179 18 | tmt_sym 1.777085 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 3 GPUs: 1.7860187368560174 -------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_4 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.439995 2 | Flan_1565 2.875764 3 | G3_circuit 1.676666 4 | Geo_1438 2.302183 5 | Hook_1498 1.723814 6 | Queen_4147 3.396676 7 | Serena 1.658784 8 | StocF-1465 1.383142 9 | audikw_1 1.595776 10 | bmwcra_1 1.125087 11 | consph 1.307853 12 | crankseg_1 1.048909 13 | crankseg_2 1.027968 14 | ecology2 1.925187 15 | hood 1.307692 16 | ldoor 2.154850 17 | thermomech_dM 1.899112 18 | tmt_sym 2.364267 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 4 GPUs: 1.7447072391975165 -------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_5 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.344577 2 
| Flan_1565 3.025135 3 | G3_circuit 1.857594 4 | Geo_1438 2.514926 5 | Hook_1498 1.965359 6 | Queen_4147 3.355499 7 | Serena 1.821757 8 | StocF-1465 1.460045 9 | audikw_1 1.469713 10 | bmwcra_1 1.168740 11 | consph 1.305694 12 | crankseg_1 1.012210 13 | crankseg_2 1.027019 14 | ecology2 2.465786 15 | hood 1.314367 16 | ldoor 1.853238 17 | thermomech_dM 1.907120 18 | tmt_sym 2.337850 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 5 GPUs: 1.7919301746521321 -------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_6 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.376280 2 | Flan_1565 2.821397 3 | G3_circuit 1.867283 4 | Geo_1438 2.417959 5 | Hook_1498 1.851615 6 | Queen_4147 3.216714 7 | Serena 1.864264 8 | StocF-1465 1.486899 9 | audikw_1 1.182397 10 | bmwcra_1 1.183182 11 | consph 1.306239 12 | crankseg_1 1.004176 13 | crankseg_2 1.030190 14 | ecology2 2.443718 15 | hood 1.343569 16 | ldoor 1.401290 17 | thermomech_dM 1.918803 18 | tmt_sym 2.366743 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 6 GPUs: 1.7323316068961419 -------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_7 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.330349 2 | Flan_1565 2.519700 3 | G3_circuit 1.996439 4 | Geo_1438 2.481387 5 | Hook_1498 2.051455 6 | Queen_4147 3.143436 7 | Serena 1.801486 8 | StocF-1465 1.571447 9 | audikw_1 1.016005 10 | bmwcra_1 1.181831 11 | consph 1.303826 12 | crankseg_1 0.996751 13 | crankseg_2 1.039673 14 | ecology2 2.513164 15 | hood 1.330792 16 | ldoor 1.144547 17 | thermomech_dM 1.918175 18 | tmt_sym 2.742812 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 7 GPUs: 1.7199580346245422 
-------------------------------------------------------------------------------- /CG/results/pipelined_speedup/pipelined_cg_speedup_8 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.153395 2 | Flan_1565 2.587598 3 | G3_circuit 2.235581 4 | Geo_1438 1.912613 5 | Hook_1498 1.624379 6 | Queen_4147 3.338698 7 | Serena 1.403080 8 | StocF-1465 1.298828 9 | audikw_1 0.943741 10 | bmwcra_1 1.149462 11 | consph 1.306150 12 | crankseg_1 1.003387 13 | crankseg_2 1.009850 14 | ecology2 2.563682 15 | hood 1.337760 16 | ldoor 1.043089 17 | thermomech_dM 1.941090 18 | tmt_sym 2.803437 19 | Persistent vs Discrete Pipelined CG geo mean speedup on 8 GPUs: 1.62977591970162 -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_4 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.438446 2 | Flan_1565 2.840313 3 | G3_circuit 1.733877 4 | Geo_1438 2.287746 5 | Hook_1498 1.738808 6 | Queen_4147 3.387336 7 | Serena 1.655311 8 | StocF-1465 1.412219 9 | audikw_1 1.622042 10 | bmwcra_1 1.217728 11 | consph 1.379057 12 | crankseg_1 1.020158 13 | crankseg_2 1.002734 14 | ecology2 1.993398 15 | hood 1.404928 16 | ldoor 2.103503 17 | thermomech_dM 1.790069 18 | tmt_sym 1.996205 19 | Persistent vs Discrete Standard CG geo mean speedup on 4 GPUs: 1.7436150261665724 -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_5 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.351825 2 | Flan_1565 2.944306 3 | G3_circuit 1.718156 4 | Geo_1438 2.421919 5 | Hook_1498 1.923354 6 | Queen_4147 3.291462 7 | Serena 1.788636 8 | StocF-1465 1.535666 9 | audikw_1 1.435445 10 | bmwcra_1 1.189787 11 | consph 1.348844 12 | crankseg_1 1.002568 13 | crankseg_2 1.007855 14 | ecology2 1.822290 15 | hood 
1.358805 16 | ldoor 1.813006 17 | thermomech_dM 1.698135 18 | tmt_sym 1.735853 19 | Persistent vs Discrete Standard CG geo mean speedup on 5 GPUs: 1.7088051164079698 -------------------------------------------------------------------------------- /CG/results/standard_speedup/pipelined_cg_speedup_7 GPUs.txt: -------------------------------------------------------------------------------- 1 | Bump_2911 2.283016 2 | Flan_1565 2.522607 3 | G3_circuit 1.700237 4 | Geo_1438 2.366187 5 | Hook_1498 1.966283 6 | Queen_4147 3.089906 7 | Serena 1.794262 8 | StocF-1465 1.503945 9 | audikw_1 1.022986 10 | bmwcra_1 1.199006 11 | consph 1.340948 12 | crankseg_1 0.995924 13 | crankseg_2 1.008379 14 | ecology2 1.810112 15 | hood 1.349423 16 | ldoor 1.201471 17 | thermomech_dM 1.663449 18 | tmt_sym 1.835639 19 | Persistent vs Discrete Standard CG geo mean speedup on 7 GPUs: 1.6155862324379113 -------------------------------------------------------------------------------- /CG/include/single-stream/pipelined-multi-overlap.cuh: -------------------------------------------------------------------------------- 1 | #ifndef INC_CG_SINGLE_STREAM_PIPELINED_MULTI_OVERLAP_CUH 2 | #define INC_CG_SINGLE_STREAM_PIPELINED_MULTI_OVERLAP_CUH 3 | 4 | #include "../common.h" 5 | 6 | namespace SingleStreamPipelinedMultiOverlap { 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal, 8 | const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max, 9 | real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu, 10 | bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu); 11 | } 12 | 13 | #endif // INC_CG_SINGLE_STREAM_PIPELINED_MULTI_OVERLAP_CUH 14 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/PERKS/common/jacobi_reference.hpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #ifndef PERKS_REFERENCE_HEADER 4 | #define 
PERKS_REFERENCE_HEADER 5 | //template 6 | //void jacobi(REAL*, int, int, REAL*); 7 | 8 | // single step reference 9 | template 10 | void jacobi_gold(REAL*, int, int, REAL*); 11 | // iterative reference 12 | template 13 | void jacobi_gold_iterative(REAL*, int, int, REAL*, int ); 14 | 15 | 16 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE(_type) \ 17 | void jacobi_gold(_type*,int,int,_type*); 18 | 19 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE_ITERATIVE(_type) \ 20 | void jacobi_gold_iterative(_type*,int,int,_type*, int); 21 | 22 | #endif -------------------------------------------------------------------------------- /Stencil/jacobi3D/src/PERKS/common/jacobi_reference.hpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #ifndef PERKS_REFERENCE_HEADER 4 | #define PERKS_REFERENCE_HEADER 5 | // template 6 | // void jacobi(REAL*, int, int, REAL*); 7 | 8 | // single step reference 9 | template 10 | void j3d_gold(REAL*, int, int, int, REAL*); 11 | // iterative reference 12 | template 13 | void j3d_gold_iterative(REAL*, int, int, int, REAL*, int); 14 | 15 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE(_type) void j3d_gold(_type*, int, int, int, _type*); 16 | 17 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE_ITERATIVE(_type) \ 18 | void j3d_gold_iterative(_type*, int, int, int, _type*, int); 19 | 20 | #endif -------------------------------------------------------------------------------- /Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/common/jacobi_reference.hpp: -------------------------------------------------------------------------------- 1 | 2 | 3 | #ifndef PERKS_REFERENCE_HEADER 4 | #define PERKS_REFERENCE_HEADER 5 | //template 6 | //void jacobi(REAL*, int, int, REAL*); 7 | 8 | // single step reference 9 | template 10 | void j3d_gold(REAL*, int, int, int, REAL*); 11 | // iterative reference 12 | template 13 | void j3d_gold_iterative(REAL*, int, int, int, REAL*, int ); 14 | 15 | 16 | #define 
PERKS_DECLARE_INITIONIZATION_REFERENCE(_type) \ 17 | void j3d_gold(_type*,int,int,int,_type*); 18 | 19 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE_ITERATIVE(_type) \ 20 | void j3d_gold_iterative(_type*,int,int,int,_type*, int); 21 | 22 | #endif -------------------------------------------------------------------------------- /CG/batch/A100-machine/_load_A100-machine_modules.sh: -------------------------------------------------------------------------------- 1 | source /home/iismayilov21/spack/share/spack/setup-env.sh 2 | 3 | spack load nvshmem@2.7.0-6 4 | 5 | export UCX_WARN_UNUSED_ENV_VARS=n 6 | export UCX_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/ucx-1.13.1-cv37hs5p3lpknxhmuhucbsjotdn653vn/ 7 | export NVSHMEM_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/nvshmem-2.7.0-6-svccom42hd6t6fmfru3txongtfpvuynm/ 8 | export MPI_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/openmpi-4.1.4-cgf2kyjuumewmbove7jagikdbpo42s6q/ 9 | export CUDA_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/cuda-11.8.0-vb4kpzvmja7a3pinvxpbschaqo4jkalp/ 10 | export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$UCX_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH 11 | 12 | export CXX=/usr/bin/g++-11 13 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.23) 2 | 3 | set(CMAKE_VERBOSE_MAKEFILE ON) 4 | 5 | project(cpu-free-model VERSION 1.0 6 | DESCRIPTION "TBD" 7 | LANGUAGES C CXX) 8 | 9 | set(CMAKE_CXX_STANDARD 17) 10 | set(NVCC_VER_MIN 11.6) 11 | 12 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 13 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 14 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 15 | 16 | # CUDA 17 | set(CMAKE_CUDA_ARCHITECTURES "80") 18 | find_package(CUDAToolkit ${NVCC_VER_MIN} REQUIRED) 19 | set(CMAKE_CUDA_STANDARD 17) 20 | 
set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) 21 | 22 | enable_language(CUDA) 23 | 24 | # Why is this necessary 25 | find_package(OpenMP REQUIRED) 26 | list(APPEND CMAKE_CUDA_FLAGS "-Xcompiler -fopenmp") 27 | 28 | add_subdirectory(CG) 29 | add_subdirectory(Stencil) 30 | -------------------------------------------------------------------------------- /Plots/data/2d-comp.csv: -------------------------------------------------------------------------------- 1 | Weak Scaling Start => 1024x1024 End => 2048x4096 1 Million iterations 2 | 3 | 1 GPU (1024x1024) 2 GPUs (1024x2048) 4 GPUs (2048x2048) 8 GPUs (2048x4096) 4 | Single Stream 2TB (No Compute) 2.4382 4.7672 4.9184 8.1736 5 | Baseline Copy (No compute) 7.9792 23.7511 55.5864 78.1668 6 | Single Stream 1TB (No Compute) 3.1543 7.5508 7.6506 13.9486 7 | Single Stream 2TB 7.192 8.612 8.8903 8.9654 8 | Single Stream 1TB 7.2055 8.8046 9.127 15.5033 9 | Baseline Copy Overlap 12.2322 31.4607 71.3725 95.137 10 | Baseline Copy Overlap (No Compute) 12.2878 30.9587 70.8682 96.8606 11 | Baseline Copy 12.7136 25.9365 55.0721 73.8571 12 | Baseline P2P 7.3627 18.3491 44.5818 60.2386 13 | Baseline P2P (No Compute) 4.4936 18.1663 39.4434 56.9321 14 | Design 1 9.5494 10.8306 11.1018 10.747 15 | Design 1 (no compute) 3.4976 4.1581 4.2056 5.4061 16 | PERKS 1.5574 1.5892 1.6003 1.616 -------------------------------------------------------------------------------- /Stencil/jacobi2D/PERKS/common/jacobi_cuda.cuh: -------------------------------------------------------------------------------- 1 | #ifndef PERKS_CUDA_HEADER 2 | #define PERKS_CUDA_HEADER 3 | // template 4 | 5 | // this is where the aimed implementation located 6 | template 7 | int jacobi_iterative(REAL*, int, int, REAL*, int, int, int, bool, bool, bool, int, bool); 8 | 9 | #define PERKS_DECLARE_INITIONIZATION_ITERATIVE(_type) \ 10 | int jacobi_iterative<_type>(_type*, int, int, _type*, int, int, int, bool, bool, bool, int, \ 11 | bool); 12 | 13 | template 14 | int getMinWidthY(int, 
int, int); 15 | 16 | template 17 | int getMinWidthY(int, int, int, bool, int, bool); 18 | // templateint getMinWidthY(int , int , int ); 19 | template 20 | int getMinWidthY(int, int, bool); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /CG/batch/Truba/_load_truba_modules.sh: -------------------------------------------------------------------------------- 1 | . /truba/home/dsagbili/spack/share/spack/setup-env.sh 2 | 3 | # spack load nvshmem@2.7.0-6 4 | spack load nvshmem@2.7.0-6/pdl77w7 5 | 6 | export UCX_WARN_UNUSED_ENV_VARS=n 7 | #export NVSHMEM_IB_ENABLE_GPUINITIATED=1 8 | export UCX_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/ucx-1.13.1-tc7ltbeqjfzr4sdwbv5jgppl4p62q5mu 9 | export NVSHMEM_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/nvshmem-2.7.0-6-pdl77w7adu5dm334pezvemvt5tjxsowg 10 | export MPI_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/openmpi-4.1.4-ycvxffyzzonogvqycd4gpp7aholtkss5 11 | export CUDA_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/cuda-11.8.0-37xn6z7age2zvgrmug5jad7l34sizzkp 12 | export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$UCX_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH 13 | 14 | module load centos7.3/comp/python/3.7.7-openmpi-1.8.8-gcc-4.8.5-GOLD -------------------------------------------------------------------------------- /Stencil/jacobi2D/PERKS/common/types.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PERKS_TYPES 3 | #define PERKS_TYPES 4 | 5 | #define PERKS_INITIALIZE_ALL_TYPE(_macro) \ 6 | template _macro(float);\ 7 | template _macro(double) 8 | 9 | #define PERKS_INITIALIZE_ALL_TYPE_1ARG(_macro,halo) \ 10 | template _macro(float,halo);\ 11 | template _macro(double,halo) 12 | 13 | #define PERKS_INITIALIZE_ALL_TYPE_2ARG(_macro,a,b) \ 14 | template _macro(float,a,b);\ 15 | template _macro(double,a,b) 16 | 17 | #define 
PERKS_INITIALIZE_ALL_TYPE_3ARG(_macro,a,b,c) \ 18 | template _macro(float,a,b,c);\ 19 | template _macro(double,a,b,c) 20 | 21 | 22 | #define PERKS_INITIALIZE_ALL_TYPE_4ARG(_macro,a,b,c,d) \ 23 | template _macro(float,a,b,c,d);\ 24 | template _macro(double,a,b,c,d) 25 | 26 | 27 | #define PERKS_INITIALIZE_ALL_TYPE_5ARG(_macro,a,b,c,d,e) \ 28 | template _macro(float,a,b,c,d,e);\ 29 | template _macro(double,a,b,c,d,e) 30 | 31 | 32 | 33 | 34 | #endif 35 | -------------------------------------------------------------------------------- /CG/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(cg 2 | src/main.cu 3 | src/common.cu 4 | src/mmio.c 5 | src/mmio_wrapper.cpp 6 | src/baseline/discrete-pipelined.cu 7 | src/baseline/discrete-standard.cu 8 | src/profiling/discrete-pipelined.cu 9 | src/profiling/discrete-standard.cu 10 | src/single-gpu/discrete-standard.cu 11 | src/single-stream/pipelined.cu 12 | src/single-stream/pipelined-gather.cu 13 | src/single-stream/pipelined-multi-overlap.cu 14 | src/single-stream/standard.cu 15 | src/single-stream/standard-saxpy-overlap.cu) 16 | 17 | find_package(OpenMP REQUIRED) 18 | find_package(NVSHMEM REQUIRED) 19 | find_package(MPI REQUIRED) 20 | 21 | target_link_libraries(cg 22 | CUDA::cudart 23 | OpenMP::OpenMP_CXX 24 | nvshmem::nvshmem 25 | MPI::MPI_CXX) 26 | 27 | target_include_directories(cg SYSTEM PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include") 28 | 29 | target_compile_options(cg PRIVATE $<$: 30 | -D_FORCE_INLINES 31 | >) 32 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/PERKS/config.cuh: -------------------------------------------------------------------------------- 1 | #ifndef TILE_X 2 | #define TILE_X (256) 3 | #endif 4 | #ifndef RTILE_Y 5 | // #define RTILE_Y (8) 6 | #define RTILE_Y (8) 7 | #endif 8 | 9 | // minimal architecture is 600 10 | 11 | // #if defined(js2d5pt) 12 | #define HALO (1) 13 | // #define 
REG_FOLDER_Y (5) 14 | 15 | // #elif defined(js2d9pt) 16 | // #define HALO (2) 17 | // #define REG_FOLDER_Y (10) 18 | // #elif defined(js2d13pt) 19 | // #define HALO (3) 20 | // #define REG_FOLDER_Y (10) 21 | // #elif defined(js2d17pt) 22 | // #define HALO (4) 23 | // #define REG_FOLDER_Y (10) 24 | // #elif defined(js2d21pt) 25 | // #define HALO (5) 26 | // #define REG_FOLDER_Y (10) 27 | // #elif defined(js2d25pt) 28 | // #define HALO (6) 29 | // #define REG_FOLDER_Y (10) 30 | // #elif defined(jb2d9pt) 31 | // #define HALO (1) 32 | // #define BOX 33 | // #define REG_FOLDER_Y (0) 34 | // #elif defined(jb2d25pt) 35 | // #define HALO (2) 36 | // #define BOX 37 | // #define REG_FOLDER_Y (0) 38 | 39 | // #endif 40 | 41 | #ifndef Halo 42 | #define Halo HALO 43 | #endif -------------------------------------------------------------------------------- /CG/scripts/calculate_nnz_num_rows_ratio.py: -------------------------------------------------------------------------------- 1 | # Maps matrix name to tuple (num_rows, num_nnz) 2 | MATRIX_NAMES_TO_INFO = { 3 | 'ecology2': (999999, 4995991), 4 | 'hood': (220542, 10768436), 5 | 'bmwcra_1': (148770, 10644002), 6 | 'consph': (83334, 6010480), 7 | 'thermomech_dM': (204316, 1423116), 8 | 'tmt_sym': (726713, 5080961), 9 | 'crankseg_1': (52804, 10614210), 10 | 'crankseg_2': (63838, 14148858), 11 | 'Queen_4147': (4147110, 329499284), 12 | 'Bump_2911': (2911419, 127729899), 13 | 'G3_circuit': (1585478, 7660826), 14 | 'StocF-1465': (1465137, 21005389), 15 | 'Flan_1565': (1564794, 117406044), 16 | 'audikw_1': (943695, 77651847), 17 | 'Serena': (1391349, 64531701), 18 | 'Geo_1438': (1437960, 63156690), 19 | 'Hook_1498': (1498023, 60917445), 20 | 'ldoor': (952203, 46522475) 21 | } 22 | 23 | if __name__ == "__main__": 24 | for matrix_name, (num_rows, num_nnz) in MATRIX_NAMES_TO_INFO.items(): 25 | nnz_to_num_rows_ratio = num_nnz / num_rows 26 | print( 27 | f'Sparsity for matrix {matrix_name} is {nnz_to_num_rows_ratio:.2f}') 28 | 
-------------------------------------------------------------------------------- /Stencil/jacobi3D/src/PERKS/config.cuh: -------------------------------------------------------------------------------- 1 | // #ifndef TILE_X 2 | // #define TILE_X (256) 3 | // #endif 4 | // #ifndef RTILE_Y 5 | // // #define RTILE_Y (8) 6 | // #define RTILE_Y (8) 7 | // #endif 8 | 9 | // minimal architecture is 600 10 | 11 | // #if defined(js2d5pt) 12 | #define HALO (1) 13 | // #define REG_FOLDER_Y (5) 14 | 15 | // #elif defined(js2d9pt) 16 | // #define HALO (2) 17 | // #define REG_FOLDER_Y (10) 18 | // #elif defined(js2d13pt) 19 | // #define HALO (3) 20 | // #define REG_FOLDER_Y (10) 21 | // #elif defined(js2d17pt) 22 | // #define HALO (4) 23 | // #define REG_FOLDER_Y (10) 24 | // #elif defined(js2d21pt) 25 | // #define HALO (5) 26 | // #define REG_FOLDER_Y (10) 27 | // #elif defined(js2d25pt) 28 | // #define HALO (6) 29 | // #define REG_FOLDER_Y (10) 30 | // #elif defined(jb2d9pt) 31 | // #define HALO (1) 32 | // #define BOX 33 | // #define REG_FOLDER_Y (0) 34 | // #elif defined(jb2d25pt) 35 | // #define HALO (2) 36 | // #define BOX 37 | // #define REG_FOLDER_Y (0) 38 | 39 | // #endif 40 | 41 | // #ifndef Halo 42 | // #define Halo HALO 43 | // #endif -------------------------------------------------------------------------------- /Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/config.cuh: -------------------------------------------------------------------------------- 1 | // #ifndef TILE_X 2 | // #define TILE_X (256) 3 | // #endif 4 | // #ifndef RTILE_Y 5 | // // #define RTILE_Y (8) 6 | // #define RTILE_Y (8) 7 | // #endif 8 | 9 | // minimal architecture is 600 10 | 11 | // #if defined(js2d5pt) 12 | #define HALO (1) 13 | // #define REG_FOLDER_Y (5) 14 | 15 | // #elif defined(js2d9pt) 16 | // #define HALO (2) 17 | // #define REG_FOLDER_Y (10) 18 | // #elif defined(js2d13pt) 19 | // #define HALO (3) 20 | // #define REG_FOLDER_Y (10) 21 | // #elif defined(js2d17pt) 22 | // #define HALO (4) 
23 | // #define REG_FOLDER_Y (10) 24 | // #elif defined(js2d21pt) 25 | // #define HALO (5) 26 | // #define REG_FOLDER_Y (10) 27 | // #elif defined(js2d25pt) 28 | // #define HALO (6) 29 | // #define REG_FOLDER_Y (10) 30 | // #elif defined(jb2d9pt) 31 | // #define HALO (1) 32 | // #define BOX 33 | // #define REG_FOLDER_Y (0) 34 | // #elif defined(jb2d25pt) 35 | // #define HALO (2) 36 | // #define BOX 37 | // #define REG_FOLDER_Y (0) 38 | 39 | // #endif 40 | 41 | // #ifndef Halo 42 | // #define Halo HALO 43 | // #endif -------------------------------------------------------------------------------- /CG/batch/Karolina/_measure_single_gpu_runtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_ITER=${NUM_ITER:-1000} 4 | NUM_RUNS=${NUM_RUNS:-5} 5 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME} 6 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER} 7 | GPU_MODEL=${GPU_MODEL:-A100} 8 | 9 | while [ $# -gt 0 ]; do 10 | 11 | if [[ $1 == *"--"* ]]; then 12 | param="${1/--/}" 13 | declare $param="$2" 14 | fi 15 | 16 | shift 17 | done 18 | 19 | WORK_DIR=~/multi-perks/CG 20 | SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/multi-perks-runs 21 | # SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/${USER}/multi-perks-runs 22 | 23 | cd $WORK_DIR 24 | 25 | . 
./batch/Karolina/_load_karolina_modules.sh > /dev/null 26 | 27 | cd $SCRATCH_DIR 28 | 29 | cp $WORK_DIR/bin/cg ./bin/cg 30 | cp $WORK_DIR/scripts/measure_single_gpu_runtime.py ./scripts/measure_single_gpu_runtime.py 31 | 32 | echo "--- RUNNING ---" 33 | date 34 | 35 | python3 ./scripts/measure_single_gpu_runtime.py --num_iter $NUM_ITER --num_runs $NUM_RUNS --filename $FILENAME --matrices_folder $MATRICES_FOLDER --gpu_model $GPU_MODEL 36 | 37 | echo "" 38 | 39 | echo "--- DONE ---" 40 | date 41 | 42 | cd $WORK_DIR -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 ParCore Lab, Koç University (Istanbul), and all contributors listed in AUTHORS All rights reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /CG/Makefile2: -------------------------------------------------------------------------------- 1 | SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST))) 2 | include $(SELF_DIR)/../common.mk 3 | 4 | BUILD_ROOT ?= bin 5 | OBJ_ROOT ?= $(BUILD_ROOT)/obj 6 | 7 | NV_SRCDIR := $(SELF_DIR)/src 8 | NV_OBJDIR := $(OBJ_ROOT) 9 | NV_DEPDIR := $(NV_OBJDIR)/.deps 10 | 11 | .DEFAULT_GOAL := cg 12 | 13 | ifdef PROFILE 14 | NVCC_NV_FLAGS += -lineinfo --generate-line-info 15 | endif 16 | 17 | ifdef USE_NVTX 18 | NVCC_NV_FLAGS += -DUSE_NVTX 19 | NVCC_NV_LDFLAGS += -lnvToolsExt 20 | endif 21 | 22 | NV_SRCS = $(call rwildcard,$(NV_SRCDIR),*.cu) 23 | NV_OBJS := $(patsubst $(NV_SRCDIR)/%.cu, $(NV_OBJDIR)/%.o, $(NV_SRCS)) 24 | NV_DEPS := $(patsubst $(NV_SRCDIR)/%.cu, $(NV_DEPDIR)/%.d, $(NV_SRCS)) 25 | 26 | cg: $(NV_OBJS) $(NV_OBJDIR)/mmio.c.o $(NV_OBJDIR)/mmio_wrapper.o 27 | $(LINK_NVSHMEM) 28 | 29 | $(NV_OBJS) : $(NV_OBJDIR)/%.o : $(NV_SRCDIR)/%.cu $(NV_DEPDIR)/%.d | $(NV_DEPDIR) 30 | $(call COMPILE_NVSHMEM, $(NV_DEPDIR)) 31 | 32 | # Order-only dependency on $(NV_DEPDIR): its mkdir -p also creates $(NV_OBJDIR), so the output directory exists even when these rules run first under -j 33 | $(NV_OBJDIR)/mmio.c.o: $(NV_SRCDIR)/mmio.c | $(NV_DEPDIR) 34 | $(NVCC) -o $@ -c $< 35 | 36 | $(NV_OBJDIR)/mmio_wrapper.o: $(NV_SRCDIR)/mmio_wrapper.cpp | $(NV_DEPDIR) 37 | $(NVCC) -o $@ -c $< 38 | 39 | # LINK_NVSHMEM writes the binary to $(BUILD_ROOT)/cg, so run it from there rather than from ./cg 40 | run: cg 41 | ./$(BUILD_ROOT)/cg 42 | 43 | $(NV_DEPDIR): 44 | @mkdir -p $(NV_DEPDIR) 45 | 46 | $(NV_DEPS): 47 | 48 | include $(wildcard $(NV_DEPS)) 49 | -------------------------------------------------------------------------------- /CG/batch/Karolina/_measure_total_runtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NUM_ITER=${NUM_ITER:-1000} 4 | NUM_RUNS=${NUM_RUNS:-5} 5 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME} 6 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER} 7 | GPU_MODEL=${GPU_MODEL:-A100} 8 | NUM_NODES=${NUM_NODES:-1} 9 | 10 | # Runs all versions by default 11 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS} 12 | 13 | while [ $# 
-gt 0 ]; do 14 | 15 | if [[ $1 == *"--"* ]]; then 16 | param="${1/--/}" 17 | declare $param="$2" 18 | fi 19 | 20 | shift 21 | done 22 | 23 | NUM_GPUS=$((NUM_NODES * 8)) 24 | 25 | WORK_DIR=~/multi-perks/CG 26 | SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/multi-perks-runs 27 | # SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/${USER}/multi-perks-runs 28 | 29 | cd $WORK_DIR 30 | 31 | . ./batch/Karolina/_load_karolina_modules.sh > /dev/null 32 | 33 | cd $SCRATCH_DIR 34 | 35 | cp $WORK_DIR/bin/cg ./bin/cg 36 | cp $WORK_DIR/scripts/measure_runtime.py ./scripts/measure_runtime.py 37 | 38 | echo "--- RUNNING ---" 39 | date 40 | 41 | python3 ./scripts/measure_runtime.py --num_iter $NUM_ITER --num_runs $NUM_RUNS --filename $FILENAME --matrices_folder $MATRICES_FOLDER --num_gpus $NUM_GPUS --versions_to_run $VERSIONS_TO_RUN --gpu_model $GPU_MODEL 42 | 43 | echo "" 44 | 45 | echo "--- DONE ---" 46 | date 47 | 48 | cd $WORK_DIR -------------------------------------------------------------------------------- /Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/common/types.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PERKS_TYPES 3 | #define PERKS_TYPES 4 | 5 | #define PERKS_INITIALIZE_ALL_TYPE(_macro) \ 6 | template _macro(float);\ 7 | template _macro(double) 8 | 9 | #define PERKS_INITIALIZE_ALL_TYPE_1ARG(_macro,halo) \ 10 | template _macro(float,halo);\ 11 | template _macro(double,halo) 12 | 13 | #define PERKS_INITIALIZE_ALL_TYPE_2ARG(_macro,a,b) \ 14 | template _macro(float,a,b);\ 15 | template _macro(double,a,b) 16 | 17 | #define PERKS_INITIALIZE_ALL_TYPE_3ARG(_macro,a,b,c) \ 18 | template _macro(float,a,b,c);\ 19 | template _macro(double,a,b,c) 20 | 21 | 22 | #define PERKS_INITIALIZE_ALL_TYPE_4ARG(_macro,a,b,c,d) \ 23 | template _macro(float,a,b,c,d);\ 24 | template _macro(double,a,b,c,d) 25 | 26 | 27 | #define PERKS_INITIALIZE_ALL_TYPE_5ARG(_macro,a,b,c,d,e) \ 28 | template _macro(float,a,b,c,d,e);\ 29 | template 
_macro(double,a,b,c,d,e) 30 | 31 | #define PERKS_INITIALIZE_ALL_TYPE_6ARG(_macro,a,b,c,d,e,f) \ 32 | template _macro(float,a,b,c,d,e,f);\ 33 | template _macro(double,a,b,c,d,e,f) 34 | 35 | #define PERKS_INITIALIZE_ALL_TYPE_7ARG(_macro,a,b,c,d,e,f,g) \ 36 | template _macro(float,a,b,c,d,e,f,g);\ 37 | template _macro(double,a,b,c,d,e,f,g) 38 | 39 | #define PERKS_INITIALIZE_ALL_TYPE_8ARG(_macro,a,b,c,d,e,f,g,h) \ 40 | template _macro(float,a,b,c,d,e,f,g,h);\ 41 | template _macro(double,a,b,c,d,e,f,g,h) 42 | #endif 43 | -------------------------------------------------------------------------------- /CG/batch/Simula/_measure_operation_breakdown.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=cg-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition=dgx2q 7 | #SBATCH --time=06:00:00 8 | #SBATCH --output=sbatch_output_%j.log 9 | 10 | NUM_ITER=${NUM_ITER:-1000} 11 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME} 12 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER} 13 | GPU_MODEL=${GPU_MODEL:-V100} 14 | 15 | # This will be a comma delimited list of number of GPUs to run on 16 | # No spaces between numbers 17 | # Single numbers also work 18 | # (Example => 2,3,4,8) 19 | NUM_GPUS=${NUM_GPUS:-8} 20 | 21 | # This will be a comma delimited list of version indices 22 | # No spaces between numbers 23 | # Single numbers also work 24 | # (Example => 0,1,2,4) 25 | # Runs all versions by default 26 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS} 27 | 28 | while [ $# -gt 0 ]; do 29 | 30 | if [[ $1 == *"--"* ]]; then 31 | param="${1/--/}" 32 | declare $param="$2" 33 | fi 34 | 35 | shift 36 | done 37 | 38 | cd ~/multi-perks/CG 39 | 40 | . 
./batch/Simula/_load_simula_modules.sh > /dev/null 41 | 42 | echo "--- RUNNING ---" 43 | date 44 | 45 | python3 ./scripts/measure_operation_breakdown.py --num_iter $NUM_ITER --filename $FILENAME --matrices_folder $MATRICES_FOLDER --num_gpus $NUM_GPUS --versions_to_run $VERSIONS_TO_RUN --gpu_model $GPU_MODEL 46 | rm ./nsys_reports/* 47 | 48 | echo "" 49 | 50 | echo "--- DONE ---" 51 | date -------------------------------------------------------------------------------- /CG/batch/Simula/_measure_total_runtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=cg-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition=dgx2q 7 | #SBATCH --time=06:00:00 8 | #SBATCH --output=sbatch_output_%j.log 9 | 10 | NUM_ITER=${NUM_ITER:-1000} 11 | NUM_RUNS=${NUM_RUNS:-5} 12 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME} 13 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER} 14 | GPU_MODEL=${GPU_MODEL:-V100} 15 | 16 | # This will be a comma delimited list of number of GPUs to run on 17 | # No spaces between numbers 18 | # Single numbers also work 19 | # (Example => 2,3,4,8) 20 | NUM_GPUS=${NUM_GPUS:-8} 21 | 22 | # This will be a comma delimited list of version indices 23 | # No spaces between numbers 24 | # Single numbers also work 25 | # (Example => 0,1,2,4) 26 | # Runs all versions by default 27 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS} 28 | 29 | while [ $# -gt 0 ]; do 30 | 31 | if [[ $1 == *"--"* ]]; then 32 | param="${1/--/}" 33 | declare $param="$2" 34 | fi 35 | 36 | shift 37 | done 38 | 39 | cd ~/multi-perks/CG 40 | 41 | . 
./batch/Simula/_load_simula_modules.sh > /dev/null 42 | 43 | echo "--- RUNNING ---" 44 | date 45 | 46 | python3 ./scripts/measure_runtime.py --num_iter $NUM_ITER --num_runs $NUM_RUNS --filename $FILENAME --matrices_folder $MATRICES_FOLDER --num_gpus $NUM_GPUS --versions_to_run $VERSIONS_TO_RUN --gpu_model $GPU_MODEL 47 | 48 | echo "" 49 | 50 | echo "--- DONE ---" 51 | date -------------------------------------------------------------------------------- /Plots/common.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import textwrap 3 | from pathlib import Path 4 | 5 | BASE_DIR = Path('Images') 6 | BASE_DIR.mkdir(exist_ok=True) 7 | 8 | 9 | def rotate(l, n): 10 | return l[-n:] + l[:-n] 11 | 12 | 13 | def get_files(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument('files', type=argparse.FileType('r'), nargs='+') 16 | return parser.parse_args().files 17 | 18 | 19 | def get_module_dir(dir_name): 20 | module_dir = BASE_DIR / dir_name 21 | module_dir.mkdir(exist_ok=True) 22 | return module_dir 23 | 24 | 25 | def wrap_labels(ax, width, break_long_words=False): 26 | labels = [] 27 | for label in ax.get_xticklabels(): 28 | text = label.get_text() 29 | labels.append(textwrap.fill(text, width=width, 30 | break_long_words=break_long_words)) 31 | ax.set_xticklabels(labels, rotation=0) 32 | 33 | 34 | markers = [ 35 | '.', # point 36 | ',', # pixel 37 | 'o', # circle 38 | 'v', # triangle down 39 | '^', # triangle up 40 | '<', # triangle_left 41 | '>', # triangle_right 42 | '1', # tri_down 43 | '2', # tri_up 44 | '3', # tri_left 45 | '4', # tri_right 46 | '8', # octagon 47 | 's', # square 48 | 'p', # pentagon 49 | '*', # star 50 | 'h', # hexagon1 51 | 'H', # hexagon2 52 | '+', # plus 53 | 'x', # x 54 | 'D', # diamond 55 | 'd', # thin_diamond 56 | '|', # vline 57 | ] 58 | -------------------------------------------------------------------------------- /CG/batch/Truba/_measure_operation_breakdown.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J cg-operation-breakdown 4 | #SBATCH -N 1 5 | #SBATCH -n 8 6 | #SBATCH -c 16 7 | #SBATCH -A proj16 8 | #SBATCH -p palamut-cuda 9 | #SBATCH --gres=gpu:8 10 | #SBATCH --time=24:00:00 11 | #SBATCH -o cg-operation-breakdown_output_%j.log 12 | 13 | NUM_ITER=${NUM_ITER:-1000} 14 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME} 15 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER} 16 | GPU_MODEL=${GPU_MODEL:-A100} 17 | 18 | # This will be a comma delimited list of number of GPUs to run on 19 | # No spaces between numbers 20 | # Single numbers also work 21 | # (Example => 2,3,4,8) 22 | NUM_GPUS=${NUM_GPUS:-8} 23 | 24 | # This will be a comma delimited list of version indices 25 | # No spaces between numbers 26 | # Single numbers also work 27 | # (Example => 0,1,2,4) 28 | # Runs all versions by default 29 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS} 30 | 31 | while [ $# -gt 0 ]; do 32 | 33 | if [[ $1 == *"--"* ]]; then 34 | param="${1/--/}" 35 | declare $param="$2" 36 | fi 37 | 38 | shift 39 | done 40 | 41 | cd ~/ismayil/multi-perks/CG 42 | 43 | . 
./batch/Truba/_load_truba_modules.sh > /dev/null 44 | 45 | echo "--- RUNNING ---" 46 | date 47 | 48 | # FILENAME is defaulted and parsed above, so forward it to the driver instead of silently dropping it 49 | python3 ./scripts/measure_operation_breakdown.py --num_iter $NUM_ITER --filename $FILENAME --matrices_folder $MATRICES_FOLDER --num_gpus $NUM_GPUS --versions_to_run $VERSIONS_TO_RUN --gpu_model $GPU_MODEL 50 | rm ./nsys_reports/* 51 | 52 | echo "" 53 | 54 | echo "--- DONE ---" 55 | date -------------------------------------------------------------------------------- /CG/batch/Truba/_measure_total_runtime.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J cg-runtime_benchmark 4 | #SBATCH -N 1 5 | #SBATCH -n 8 6 | #SBATCH -c 16 7 | #SBATCH -A proj16 8 | #SBATCH -p palamut-cuda 9 | #SBATCH --gres=gpu:8 10 | #SBATCH --time=24:00:00 11 | #SBATCH -o cg-runtime_benchmark_output_%j.log 12 | 13 | NUM_ITER=${NUM_ITER:-1000} 14 | NUM_RUNS=${NUM_RUNS:-5} 15 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME} 16 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER} 17 | GPU_MODEL=${GPU_MODEL:-A100} 18 | 19 | # This will be a comma delimited list of number of GPUs to run on 20 | # No spaces between numbers 21 | # Single numbers also work 22 | # (Example => 2,3,4,8) 23 | NUM_GPUS=${NUM_GPUS:-8} 24 | 25 | # This will be a comma delimited list of version indices 26 | # No spaces between numbers 27 | # Single numbers also work 28 | # (Example => 0,1,2,4) 29 | # Runs all versions by default 30 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS} 31 | 32 | while [ $# -gt 0 ]; do 33 | 34 | if [[ $1 == *"--"* ]]; then 35 | param="${1/--/}" 36 | declare $param="$2" 37 | fi 38 | 39 | shift 40 | done 41 | 42 | cd ~/ismayil/multi-perks/CG 43 | 44 | . 
./batch/Truba/_load_truba_modules.sh > /dev/null 45 | 46 | echo "--- RUNNING ---" 47 | date 48 | 49 | python3 ./scripts/measure_runtime.py --num_iter $NUM_ITER --num_runs $NUM_RUNS --filename $FILENAME --matrices_folder $MATRICES_FOLDER --num_gpus $NUM_GPUS --versions_to_run $VERSIONS_TO_RUN --gpu_model $GPU_MODEL 50 | 51 | echo "" 52 | 53 | echo "--- DONE ---" 54 | date -------------------------------------------------------------------------------- /Stencil/jacobi2D/scripts/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=stencil-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition hgx2q 7 | #SBATCH --time=01:00:00 8 | #SBATCH --output=sbatch_output_%j.log 9 | 10 | . ./scripts/modules.sh > /dev/null 11 | 12 | BIN="./jacobi -s 1" 13 | NUM_RUNS=5 14 | V_OURS=1 15 | V_BASELINE=6 16 | 17 | #OUT_CSV="./results.csv" 18 | OUT_CSV="/dev/stdout" 19 | echo "version,nx,ny,niter,num_gpus,execution_time" >> "$OUT_CSV" 20 | 21 | #MAX_NUM_GPUS=$(nvidia-smi --list-gpus | wc -l) 22 | MAX_NUM_GPUS=4 23 | 24 | # First element reserved for pretty output in the loop 25 | CUDA_VISIBLE_DEVICES_SETTING=("x" "0" "0,1" "0,1,2,3" "0,1,2,3,4,5,6,7") 26 | DOMAIN_SIZES=( # the domain doubles at every step (16384 = 2 * 8192) 27 | "x" 28 | "8192 4096" 29 | "8192 8192" 30 | "8192 16384" 31 | "16384 16384" 32 | ) 33 | NUM_ITERS=100000 34 | 35 | function runp() { 36 | cmd="$BIN -v $1 -nx $2 -ny $3 -niter $4" 37 | 38 | min_execution_time=9223372036854775807 39 | 40 | for ((i = 0; i < NUM_RUNS; i += 1)); do 41 | execution_time=$($cmd | grep -o -E "[0-9]+\.?[0-9]+") # dot escaped so it matches a decimal point only 42 | min_execution_time=$(python -c "print(min($execution_time, $min_execution_time))") 43 | done 44 | 45 | echo "$1,$2,$3,$4,$5,$min_execution_time" >> "$OUT_CSV" 46 | } 47 | 48 | for ((NUM_GPUS = 1; NUM_GPUS <= MAX_NUM_GPUS; NUM_GPUS += 1)); do 49 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 50 | 51 | read -r nx ny <<< "${DOMAIN_SIZES[$NUM_GPUS]}" 52 | 
53 | # Our version 54 | runp "$V_OURS" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS" 55 | 56 | # Baseline 57 | runp "$V_BASELINE" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS" 58 | done 59 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/scripts/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=stencil-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition hgx2q 7 | #SBATCH --time=01:00:00 8 | #SBATCH --output=sbatch_output_%j.log 9 | 10 | . ./scripts/modules.sh > /dev/null 11 | 12 | BIN="./jacobi -s 1" 13 | NUM_RUNS=5 14 | V_OURS=1 15 | V_BASELINE=6 16 | 17 | #OUT_CSV="./results.csv" 18 | OUT_CSV="/dev/stdout" 19 | echo "version,nx,ny,niter,num_gpus,execution_time" >> "$OUT_CSV" 20 | 21 | #MAX_NUM_GPUS=$(nvidia-smi --list-gpus | wc -l) 22 | MAX_NUM_GPUS=4 23 | 24 | # First element reserved for pretty output in the loop 25 | CUDA_VISIBLE_DEVICES_SETTING=("x" "0" "0,1" "0,1,2,3" "0,1,2,3,4,5,6,7") 26 | DOMAIN_SIZES=( # the domain doubles at every step (16384 = 2 * 8192) 27 | "x" 28 | "8192 4096" 29 | "8192 8192" 30 | "8192 16384" 31 | "16384 16384" 32 | ) 33 | NUM_ITERS=100000 34 | 35 | function runp() { 36 | cmd="$BIN -v $1 -nx $2 -ny $3 -niter $4" 37 | 38 | min_execution_time=9223372036854775807 39 | 40 | for ((i = 0; i < NUM_RUNS; i += 1)); do 41 | execution_time=$($cmd | grep -o -E "[0-9]+\.?[0-9]+") # dot escaped so it matches a decimal point only 42 | min_execution_time=$(python -c "print(min($execution_time, $min_execution_time))") 43 | done 44 | 45 | echo "$1,$2,$3,$4,$5,$min_execution_time" >> "$OUT_CSV" 46 | } 47 | 48 | for ((NUM_GPUS = 1; NUM_GPUS <= MAX_NUM_GPUS; NUM_GPUS += 1)); do 49 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 50 | 51 | read -r nx ny <<< "${DOMAIN_SIZES[$NUM_GPUS]}" 52 | 53 | # Our version 54 | runp "$V_OURS" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS" 55 | 56 | # Baseline 57 | runp "$V_BASELINE" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS" 58 | done 59 | 
-------------------------------------------------------------------------------- /Plots/scaling-bar.py: -------------------------------------------------------------------------------- 1 | from itertools import cycle 2 | 3 | import matplotlib.pyplot as plt 4 | import pandas as pd 5 | 6 | import matplotlib as mpl 7 | mpl.rcParams['hatch.linewidth'] = 0.3 8 | 9 | import common 10 | from common import get_files, markers, get_module_dir, wrap_labels 11 | 12 | plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle') 13 | 14 | MODULE_DIR = get_module_dir('Bar Scaling') 15 | 16 | files = get_files() 17 | 18 | for file in files: 19 | data = pd.read_csv(file, index_col='Version') 20 | data = data.sort_index() 21 | 22 | data = data.T 23 | 24 | colors = [ 25 | '#c6c9cb', '#64b8e5', '#ee7fb2' 26 | ] 27 | 28 | axes = data.plot.bar(colormap='Paired', color=colors, edgecolor='black') 29 | 30 | bars = axes.patches 31 | patterns = ('///', '\\\\\\', 'xxx') 32 | hatches = [p for p in patterns for i in range(len(data))] 33 | for bar, hatch in zip(bars, hatches): 34 | bar.set_hatch(hatch) 35 | 36 | # for line in axes.get_lines(): 37 | # line.set_hatch(next(markers_cycle)) 38 | # line.set_linewidth(2) 39 | # line.set(alpha=0.5) 40 | # line.set(color=next(colors)) 41 | # 42 | # # If our versions 43 | # if line.get_label().lower().startswith('baseline'): 44 | # # line.set(alpha=0.5) 45 | # line.set_linestyle('dashed') 46 | # 47 | # # axes.legend(axes.get_lines(), data.columns, loc='best') 48 | # wrap_labels(axes, 10) 49 | # 50 | # # plt.xticks(fontsize=15) 51 | # plt.title(title) 52 | # plt.savefig(MODULE_DIR / title) 53 | 54 | # plt.grid(axis='x') 55 | 56 | plt.show() 57 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/src/PERKS/common/types.hpp: -------------------------------------------------------------------------------- 1 | 2 | #ifndef PERKS_TYPES 3 | #define PERKS_TYPES 4 | 5 | 
#define PERKS_INITIALIZE_ALL_TYPE(_macro) \ 6 | template _macro(float); \ 7 | template _macro(double) 8 | 9 | #define PERKS_INITIALIZE_ALL_TYPE_1ARG(_macro, halo) \ 10 | template _macro(float, halo); \ 11 | template _macro(double, halo) 12 | 13 | #define PERKS_INITIALIZE_ALL_TYPE_2ARG(_macro, a, b) \ 14 | template _macro(float, a, b); \ 15 | template _macro(double, a, b) 16 | 17 | #define PERKS_INITIALIZE_ALL_TYPE_3ARG(_macro, a, b, c) \ 18 | template _macro(float, a, b, c); \ 19 | template _macro(double, a, b, c) 20 | 21 | #define PERKS_INITIALIZE_ALL_TYPE_4ARG(_macro, a, b, c, d) \ 22 | template _macro(float, a, b, c, d); \ 23 | template _macro(double, a, b, c, d) 24 | 25 | #define PERKS_INITIALIZE_ALL_TYPE_5ARG(_macro, a, b, c, d, e) \ 26 | template _macro(float, a, b, c, d, e); \ 27 | template _macro(double, a, b, c, d, e) 28 | 29 | #define PERKS_INITIALIZE_ALL_TYPE_6ARG(_macro, a, b, c, d, e, f) \ 30 | template _macro(float, a, b, c, d, e, f); \ 31 | template _macro(double, a, b, c, d, e, f) 32 | 33 | #define PERKS_INITIALIZE_ALL_TYPE_7ARG(_macro, a, b, c, d, e, f, g) \ 34 | template _macro(float, a, b, c, d, e, f, g); \ 35 | template _macro(double, a, b, c, d, e, f, g) 36 | 37 | #define PERKS_INITIALIZE_ALL_TYPE_8ARG(_macro, a, b, c, d, e, f, g, h) \ 38 | template _macro(float, a, b, c, d, e, f, g, h); \ 39 | template _macro(double, a, b, c, d, e, f, g, h) 40 | #endif 41 | -------------------------------------------------------------------------------- /CG/results/cg_operation_breakdown_8A100_discrete_pipelined.csv: -------------------------------------------------------------------------------- 1 | Discrete Pipelined Operation Breakdown 2 | Matrix,Global Reductions (+Barrier),Memcpy Dots To Host,Merged Dots (+Reset),NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,Saxpy 4,Saxpy 5,Saxpy 6,SpMV 3 | tridiagonal,0.1818,0.0681,0.2480,0.1142,0.1210,0.1980,0.2435,0.2461,0.2417,0.2439,0.2443,0.5201 4 | 
ecology2,0.1751,0.0658,0.0788,0.1014,0.1104,0.0590,0.0588,0.0589,0.0586,0.0584,0.0584,0.0987 5 | hood,0.1740,0.0659,0.0724,0.1124,0.1051,0.0558,0.0558,0.0558,0.0553,0.0550,0.0552,0.4796 6 | bmwcra_1,0.1703,0.0669,0.0718,0.2503,0.1036,0.0552,0.0550,0.0549,0.0547,0.0544,0.0544,0.9414 7 | consph,0.1735,0.0667,0.0722,0.1561,0.1060,0.0556,0.0553,0.0554,0.0551,0.0549,0.0548,0.4441 8 | thermomech_dM,0.1741,0.0665,0.0726,0.0938,0.1028,0.0562,0.0561,0.0560,0.0557,0.0555,0.0556,0.2408 9 | tmt_sym,0.1746,0.0667,0.0764,0.0988,0.1040,0.0584,0.0582,0.0583,0.0577,0.0576,0.0578,0.1144 10 | crankseg_1,0.1733,0.0690,0.0765,7.5716,0.1040,0.0555,0.0550,0.0552,0.0548,0.0544,0.0546,1.7455 11 | crankseg_2,0.1758,0.0682,0.0777,10.8521,0.1061,0.0553,0.0550,0.0549,0.0546,0.0544,0.0545,1.8073 12 | Queen_4147,0.1791,0.0697,0.1071,0.2459,0.1102,0.0801,0.0792,0.0817,0.0730,0.0685,0.0717,15.1174 13 | Bump_2911,0.1799,0.0692,0.0949,0.1052,0.1058,0.0735,0.0728,0.0742,0.0676,0.0646,0.0646,4.9992 14 | G3_circuit,0.1746,0.0663,0.0823,0.0891,0.1030,0.0607,0.0605,0.0610,0.0604,0.0601,0.0601,0.2353 15 | StocF-1465,0.1827,0.0681,0.0828,0.0944,0.1040,0.0626,0.0625,0.0665,0.0621,0.0600,0.0596,0.6804 16 | Flan_1565,0.1787,0.0685,0.0846,0.3012,0.1051,0.0644,0.0653,0.0662,0.0623,0.0597,0.0598,4.4129 17 | audikw_1,0.1788,0.0699,0.0811,2.9359,0.1025,0.0606,0.0601,0.0620,0.0598,0.0578,0.0579,1.9612 18 | Serena,0.1805,0.0688,0.0828,0.3760,0.1068,0.0633,0.0625,0.0649,0.0609,0.0586,0.0588,2.5207 19 | Geo_1438,0.1797,0.0693,0.0825,0.1363,0.1035,0.0641,0.0633,0.0657,0.0613,0.0592,0.0590,2.3228 20 | Hook_1498,0.1791,0.0675,0.0827,0.1411,0.1048,0.0639,0.0634,0.0663,0.0615,0.0592,0.0593,1.8353 21 | ldoor,0.1811,0.0679,0.0798,0.1083,0.1043,0.0602,0.0608,0.0621,0.0597,0.0578,0.0578,0.9961 -------------------------------------------------------------------------------- /CG/scripts/plots/plot_operation_breakdown.py: -------------------------------------------------------------------------------- 1 | 2 | from itertools 
import cycle 3 | from os.path import dirname, realpath 4 | 5 | from common import get_files, markers, get_module_dir, wrap_labels 6 | 7 | import matplotlib.pyplot as plt 8 | import pandas as pd 9 | import numpy as np 10 | 11 | MATRIX_NAMES = [ 12 | 'tridiagonal', 13 | 'ecology2', 14 | # 'shallow_water2', Too little non-zeros 15 | # 'Trefethen_2000', Too little non-zeros 16 | 'hood', 17 | 'bmwcra_1', 18 | 'consph', 19 | 'thermomech_dM', 20 | 'tmt_sym', 21 | 'crankseg_1', 22 | 'crankseg_2', 23 | 'Queen_4147', 24 | 'Bump_2911', 25 | 'G3_circuit', 26 | 'StocF-1465', 27 | 'Flan_1565', 28 | 'audikw_1', 29 | 'Serena', 30 | 'Geo_1438', 31 | 'Hook_1498', 32 | # 'bone010', Multi-part matrix, don't handle those for now 33 | 'ldoor' 34 | ] 35 | 36 | MODULE_DIR = get_module_dir('Operation Breakdown') 37 | 38 | dir_path = dirname(realpath(__file__)) 39 | 40 | # plt.style.use(dir_path + '/default.mplstyle') 41 | plt.style.use('fivethirtyeight') 42 | 43 | files = get_files() 44 | 45 | for file in files: 46 | title = file.readline().strip() 47 | 48 | operation_breakdowns = pd.read_csv(file, index_col='Matrix') 49 | operation_breakdowns = operation_breakdowns.T 50 | operation_breakdowns = operation_breakdowns[MATRIX_NAMES] 51 | operation_breakdowns = operation_breakdowns.T 52 | 53 | # Is this necessary? 
54 | # operation_breakdowns = operation_breakdowns.sort_index() 55 | 56 | # Get percentages of operations 57 | per_operation_percentages = operation_breakdowns.div( 58 | operation_breakdowns.sum(axis=1), axis=0) * 100 59 | 60 | ax = per_operation_percentages.plot.barh(stacked=True) 61 | ax.invert_yaxis() 62 | ax.xaxis.set_visible(False) 63 | ax.set_xlim(0, np.sum(per_operation_percentages, axis=1).max()) 64 | 65 | for container in ax.containers: 66 | ax.bar_label(container) 67 | 68 | wrap_labels(ax, 10, break_long_words=True) 69 | 70 | plt.title(title) 71 | plt.savefig(MODULE_DIR / title) 72 | 73 | plt.show() 74 | -------------------------------------------------------------------------------- /CG/results/cg_operation_breakdown_8A100_discrete_standard.csv: -------------------------------------------------------------------------------- 1 | Discrete Standard Operation Breakdown 2 | Matrix,Dot 1 (+Reset),Dot 2 (+Reset),Global Reduction 1 (+Barrier),Global Reduction 2 (+Barrier),Memcpy Dot to Host 1,Memcpy Dot to Host 2,NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,SpMV 3 | tridiagonal,0.2094,0.1695,0.2585,0.2599,0.0680,0.0673,0.1108,0.1102,0.1827,0.2416,0.1814,0.5409 4 | ecology2,0.0746,0.0736,0.2211,0.2304,0.0657,0.0653,0.0952,0.0971,0.0593,0.0588,0.0605,0.1004 5 | hood,0.0705,0.0700,0.2160,0.2223,0.0652,0.0642,0.1086,0.0973,0.0553,0.0549,0.0559,0.4870 6 | bmwcra_1,0.0700,0.0696,0.2146,0.2280,0.0650,0.0640,0.2559,0.0973,0.0549,0.0546,0.0557,0.9622 7 | consph,0.0698,0.0694,0.2164,0.2261,0.0653,0.0648,0.2090,0.0971,0.0545,0.0544,0.0554,0.3745 8 | thermomech_dM,0.0701,0.0699,0.2182,0.2286,0.0649,0.0645,0.1132,0.0987,0.0553,0.0550,0.0561,0.2324 9 | tmt_sym,0.0735,0.0731,0.2180,0.2266,0.0662,0.0657,0.0945,0.0967,0.0580,0.0577,0.0590,0.1137 10 | crankseg_1,0.0702,0.0687,0.2249,0.2288,0.0669,0.0650,7.6064,0.0960,0.0543,0.0540,0.0554,1.6466 11 | 
crankseg_2,0.0717,0.0699,0.2342,0.2338,0.0684,0.0658,10.8982,0.0978,0.0547,0.0544,0.0556,1.8452 12 | Queen_4147,0.1025,0.0889,0.2284,0.2247,0.0685,0.0650,0.2232,0.0959,0.0722,0.0720,0.0681,15.1167 13 | Bump_2911,0.0957,0.0850,0.2273,0.2234,0.0686,0.0654,0.3731,0.0966,0.0672,0.0676,0.0653,4.7050 14 | G3_circuit,0.0763,0.0751,0.2168,0.2256,0.0654,0.0648,0.1236,0.0975,0.0600,0.0596,0.0608,0.2145 15 | StocF-1465,0.0835,0.0780,0.2241,0.2272,0.0659,0.0638,0.1718,0.0955,0.0616,0.0609,0.0605,0.5684 16 | Flan_1565,0.0857,0.0781,0.2274,0.2240,0.0676,0.0644,0.1596,0.0962,0.0621,0.0622,0.0606,4.5419 17 | audikw_1,0.0834,0.0769,0.2250,0.2208,0.0675,0.0643,0.0936,0.0959,0.0599,0.0595,0.0586,4.8515 18 | Serena,0.0836,0.0779,0.2228,0.2217,0.0669,0.0640,0.2073,0.0955,0.0613,0.0615,0.0600,2.6818 19 | Geo_1438,0.0842,0.0781,0.2225,0.2195,0.0674,0.0648,0.1707,0.0960,0.0620,0.0623,0.0605,2.2589 20 | Hook_1498,0.0831,0.0775,0.2219,0.2201,0.0665,0.0640,0.1343,0.0964,0.0617,0.0623,0.0606,1.8148 21 | ldoor,0.0808,0.0755,0.2255,0.2252,0.0667,0.0644,0.2791,0.0954,0.0593,0.0595,0.0585,0.8216 -------------------------------------------------------------------------------- /common.mk: -------------------------------------------------------------------------------- 1 | ifeq ($(_COMMON_),) 2 | _COMMON_ := defined 3 | 4 | NVCC ?= nvcc 5 | MPIRUN ?= mpirun 6 | MPICCX ?= mpic++ 7 | CXX ?= g++ 8 | 9 | BUILD_ROOT := bin 10 | OBJ_ROOT := $(BUILD_ROOT)/obj 11 | 12 | ifndef NVSHMEM_HOME 13 | $(warning NVSHMEM_HOME is not set) 14 | endif 15 | ifndef MPI_HOME 16 | $(warning MPI_HOME is not set) 17 | endif 18 | ifndef UCX_HOME 19 | $(warning UCX_HOME is not set) 20 | endif 21 | 22 | MAKEFLAGS += -j 23 | 24 | # Can't compile CUDA with -Wpedantic 25 | WARN_FLAGS = "-Wall -Wno-comment -Werror -Wextra" 26 | 27 | rwildcard=$(foreach d,$(wildcard $(1:=/*)),$(call rwildcard,$d,$2) $(filter $(subst *,%,$2),$d)) 28 | 29 | GENCODE_SM70 := -gencode 'arch=compute_70,code=sm_70' 30 | GENCODE_SM80 := -gencode 
'arch=compute_80,code=sm_80' -gencode 'arch=compute_80,code=compute_80' 31 | GENCODE_FLAGS := $(GENCODE_SM70) $(GENCODE_SM80) 32 | 33 | DEP_FLAGS = -MT $@ -MMD -MP -MF 34 | 35 | NVCC_FLAGS_GENERIC = -O2 -dc -Xcompiler $(WARN_FLAGS) -Xcompiler -fopenmp $(GENCODE_FLAGS) -std=c++17 36 | 37 | # Regular 38 | NVCC_FLAGS = $(NVCC_FLAGS_GENERIC) -ccbin=$(CXX) -I./include 39 | NVCC_LDFLAGS = -ccbin=$(CXX) -lgomp -L$(CUDA_HOME)/lib64 -lcuda -lcudart 40 | 41 | # NVSHMEM 42 | NVCC_NV_FLAGS = $(NVCC_FLAGS_GENERIC) -ccbin=$(MPICCX) -isystem $(NVSHMEM_HOME)/include -isystem $(MPI_HOME)/include -I./include_nvshmem 43 | NVCC_NV_LDFLAGS = -ccbin=$(MPICCX) -lgomp -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvidia-ml -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(UCX_HOME)/lib -lucp -lucs -luct -lucm -lmlx5 44 | 45 | # Example 46 | #$(OBJS_2D) : $(OBJ_DIR_2D)/%.o : $(SRC_DIR_2D)/%.cu $(DEP_DIR_2D)/%.d | $(DEP_DIR_2D) 47 | # $(call COMPILE, $(DEP_DIR_2D)) 48 | 49 | define LINK = 50 | @mkdir -p $(BUILD_ROOT) 51 | $(NVCC) $(GENCODE_FLAGS) -o $(BUILD_ROOT)/$@ $^ $(NVCC_LDFLAGS) 52 | endef 53 | 54 | define COMPILE = 55 | @mkdir -p "$(dir $(1)/$*)" 56 | @mkdir -p $(@D) 57 | $(NVCC) $(NVCC_FLAGS) $(DEP_FLAGS) $(1)/$*.d -o $@ $< 58 | endef 59 | 60 | define LINK_NVSHMEM = 61 | @mkdir -p $(BUILD_ROOT) 62 | $(NVCC) $(GENCODE_FLAGS) -o $(BUILD_ROOT)/$@ $^ $(NVCC_NV_LDFLAGS) 63 | endef 64 | 65 | define COMPILE_NVSHMEM = 66 | @mkdir -p "$(dir $(1)/$*)" 67 | @mkdir -p $(@D) 68 | $(NVCC) $(NVCC_NV_FLAGS) $(DEP_FLAGS) $(1)/$*.d -o $@ $< 69 | endef 70 | 71 | clean: 72 | $(RM) -rd $(BUILD_ROOT) 73 | 74 | endif 75 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/scripts/constant_num_gpus_bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=stencil-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --time=03:00:00 7 | 8 | . 
./scripts/modules.sh > /dev/null 9 | 10 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 11 | 12 | declare -A version_name_to_idx_map 13 | 14 | version_name_to_idx_map["Single Stream 1TB"]=0 15 | version_name_to_idx_map["Single Stream 2TB"]=1 16 | 17 | version_name_to_idx_map["Baseline Copy"]=3 18 | version_name_to_idx_map["Baseline Copy Overlap"]=4 19 | version_name_to_idx_map["Baseline P2P"]=5 20 | 21 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=9 22 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=10 23 | version_name_to_idx_map["Baseline Copy (No compute)"]=12 24 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=13 25 | version_name_to_idx_map["Baseline P2P (No Compute)"]=14 26 | 27 | BIN="./jacobi -s 1" 28 | 29 | 30 | STARTING_NX=${STARTING_NX:-1024} 31 | STARTING_NY=${STARTING_NY:-1024} 32 | NUM_ITER=${NUM_ITER:-1000000} 33 | NUM_RUNS=${NUM_RUNS:-5} 34 | 35 | NUM_GPUS=${NUM_GPUS:-4} 36 | MAX_DOMAIN_SIZE=${MAX_DOMAIN_SIZE:-16384} 37 | 38 | while [ $# -gt 0 ]; do 39 | 40 | if [[ $1 == *"--"* ]]; then 41 | param="${1/--/}" 42 | declare $param="$2" 43 | fi 44 | 45 | shift 46 | done 47 | 48 | for version_name in "${!version_name_to_idx_map[@]}"; do 49 | echo "Running ${version_name}"; echo "" 50 | 51 | version_idx=${version_name_to_idx_map[$version_name]} 52 | 53 | NX=${STARTING_NX} 54 | NY=${STARTING_NY} 55 | 56 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 57 | 58 | while : ; do 59 | 60 | echo "Num GPUS: ${NUM_GPUS}" 61 | echo "${NUM_ITER} iterations on grid ${NY}x${NX}" 62 | 63 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 64 | execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 65 | echo "${execution_time} on run ${i}" 66 | done 67 | 68 | printf "\n" 69 | 70 | if [[ $((2*NX)) -le ${MAX_DOMAIN_SIZE} ]]; then # double only while the next size still fits; the old -ne test never ended once NX stepped past the maximum 71 | NX=$((2*NX)) 72 | NY=$((2*NY)) 73 | else 74 | break 75 | fi 76 | done 77 | 78 | echo 
"-------------------------------------" 79 | done 80 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/scripts/constant_num_gpus_bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=stencil-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --time=03:00:00 7 | 8 | . ./scripts/modules.sh > /dev/null 9 | 10 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 11 | 12 | declare -A version_name_to_idx_map 13 | 14 | version_name_to_idx_map["Single Stream 1TB"]=0 15 | version_name_to_idx_map["Single Stream 2TB"]=1 16 | 17 | version_name_to_idx_map["Baseline Copy"]=3 18 | version_name_to_idx_map["Baseline Copy Overlap"]=4 19 | version_name_to_idx_map["Baseline P2P"]=5 20 | 21 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=9 22 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=10 23 | version_name_to_idx_map["Baseline Copy (No compute)"]=12 24 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=13 25 | version_name_to_idx_map["Baseline P2P (No Compute)"]=14 26 | 27 | BIN="./jacobi -s 1" 28 | 29 | 30 | STARTING_NX=${STARTING_NX:-1024} 31 | STARTING_NY=${STARTING_NY:-1024} 32 | NUM_ITER=${NUM_ITER:-1000000} 33 | NUM_RUNS=${NUM_RUNS:-5} 34 | 35 | NUM_GPUS=${NUM_GPUS:-4} 36 | MAX_DOMAIN_SIZE=${MAX_DOMAIN_SIZE:-16384} 37 | 38 | while [ $# -gt 0 ]; do 39 | 40 | if [[ $1 == *"--"* ]]; then 41 | param="${1/--/}" 42 | declare $param="$2" 43 | fi 44 | 45 | shift 46 | done 47 | 48 | for version_name in "${!version_name_to_idx_map[@]}"; do 49 | echo "Running ${version_name}"; echo "" 50 | 51 | version_idx=${version_name_to_idx_map[$version_name]} 52 | 53 | NX=${STARTING_NX} 54 | NY=${STARTING_NY} 55 | 56 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 57 | 58 | while : ; do 59 | 60 | echo "Num GPUS: 
${NUM_GPUS}" 61 | echo "${NUM_ITER} iterations on grid ${NY}x${NX}" 62 | 63 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 64 | execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 65 | echo "${execution_time} on run ${i}" 66 | done 67 | 68 | printf "\n" 69 | 70 | if [[ $NX -ne ${MAX_DOMAIN_SIZE} ]]; then 71 | NX=$((2*NX)) 72 | NY=$((2*NY)) 73 | else 74 | break 75 | fi 76 | done 77 | 78 | echo "-------------------------------------" 79 | done 80 | -------------------------------------------------------------------------------- /CG/results/cg_operation_breakdown_4A100.txt: -------------------------------------------------------------------------------- 1 | Results per version; rows are matrices 2 | 3 | Results for version Baseline Discrete Standard NVSHMEM => 4 | ,Dot 1 (+Reset),Dot 2 (+Reset),Global Reduction 1 (+Barrier),Global Reduction 2 (+Barrier),Memcpy Dot to Host 1,Memcpy Dot to Host 2,NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,SpMV 5 | (generated)_tridiagonal,0.3252,0.2800,0.2109,0.2248,0.0662,0.0657,0.1073,0.0979,0.3663,0.4253,0.3655,1.0031 6 | ecology2,0.0785,0.0810,0.1890,0.1936,0.0651,0.0646,0.0922,0.0920,0.0611,0.0615,0.0626,0.1194 7 | hood,0.0721,0.0713,0.1947,0.1960,0.0658,0.0638,0.2090,0.0907,0.0576,0.0576,0.0571,0.4670 8 | bmwcra_1,0.0721,0.0706,0.1842,0.1850,0.0664,0.0647,0.1759,0.0876,0.0571,0.0572,0.0561,1.2385 9 | consph,0.0702,0.0698,0.1836,0.1916,0.0653,0.0644,0.1791,0.0909,0.0552,0.0550,0.0557,0.3645 10 | thermomech_dM,0.0722,0.0715,0.1842,0.1895,0.0652,0.0643,0.1041,0.0902,0.0563,0.0561,0.0572,0.2593 11 | tmt_sym,0.0765,0.0755,0.1849,0.1934,0.0653,0.0649,0.0957,0.0907,0.0600,0.0598,0.0609,0.1585 12 | crankseg_1,0.0717,0.0699,0.1904,0.1883,0.0675,0.0642,8.8024,0.0881,0.0564,0.0563,0.0555,2.2143 13 | crankseg_2,0.0795,0.0736,0.1930,0.1872,0.0688,0.0650,0.0941,0.0890,0.0573,0.0572,0.0558,14.0743 14 | 15 | 16 | Results for version Baseline Discrete Pipelined NVSHMEM (No 
Overlap) => 17 | ,Global Reductions (+Barrier),Memcpy Dots To Host,Merged Dots (+Reset),NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,Saxpy 4,Saxpy 5,Saxpy 6,SpMV 18 | (generated)_tridiagonal,0.1507,0.0681,0.3863,0.1048,0.1055,0.3840,0.4209,0.4236,0.4229,0.4240,0.4234,0.9681 19 | ecology2,0.1508,0.0659,0.0866,0.0975,0.1008,0.0649,0.0637,0.0656,0.0627,0.0613,0.0614,0.1272 20 | hood,0.1506,0.0671,0.0759,0.2009,0.0954,0.0628,0.0579,0.0585,0.0581,0.0563,0.0564,0.4757 21 | bmwcra_1,0.1445,0.0665,0.0736,0.4255,0.0932,0.0576,0.0575,0.0575,0.0572,0.0557,0.0556,1.0926 22 | consph,0.1470,0.0656,0.0721,0.1186,0.0936,0.0554,0.0551,0.0550,0.0549,0.0546,0.0547,0.4386 23 | thermomech_dM,0.1458,0.0667,0.0742,0.1101,0.0933,0.0567,0.0567,0.0566,0.0563,0.0561,0.0561,0.2419 24 | tmt_sym,0.1468,0.0663,0.0817,0.0929,0.0925,0.0606,0.0605,0.0610,0.0603,0.0597,0.0602,0.1614 25 | crankseg_1,0.1479,0.0671,0.0768,9.5558,0.0994,0.0556,0.0555,0.0565,0.0560,0.0548,0.0548,1.5266 26 | crankseg_2,0.1510,0.0693,0.0794,11.0112,0.0934,0.0570,0.0565,0.0573,0.0568,0.0548,0.0548,2.9162 27 | 28 | 29 | -------------------------------------------------------------------------------- /CG/scripts/download_matrices.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from urllib.request import urlopen 3 | import tarfile 4 | 5 | from os.path import dirname, realpath, basename 6 | import os 7 | import sys 8 | 9 | SAVE_MATRICES_TO_FOLDER = None 10 | 11 | SUITE_SPARSE_BASE_URL = 'https://suitesparse-collection-website.herokuapp.com/MM' 12 | 13 | MATRIX_INDICES = [ 14 | 'McRae/ecology2', 15 | 'GHS_psdef/hood', 16 | 'GHS_psdef/bmwcra_1', 17 | 'Williams/consph', 18 | 'Botonakis/thermomech_dM', 19 | 'CEMW/tmt_sym', 20 | 'GHS_psdef/crankseg_1', 21 | 'GHS_psdef/crankseg_2', 22 | 'TKK/cbuckle', 23 | 'BenElechi/BenElechi1', 24 | 'MaxPlanck/shallow_water2', 25 | 'JGD_Trefethen/Trefethen_2000', 26 | 
def extract_matrix(archive_stream, matrix_name, destination_path):
    """Copy ``<matrix_name>/<matrix_name>.mtx`` out of a gzipped tar stream.

    Only the wanted member is written to disk, so other archive members (and
    any unexpected member paths) never touch the file system.

    Parameters
    ----------
    archive_stream: binary file-like object
        Readable stream positioned at the start of a ``.tar.gz`` archive.
    matrix_name: str
        Name of the matrix; also the name of the archive's top-level folder.
    destination_path: str
        Where the ``.mtx`` file is stored.

    Raises
    ------
    FileNotFoundError
        If the archive holds no ``<matrix_name>/<matrix_name>.mtx`` member.
    """
    # Local import: keeps this block independent of the file's import header.
    import shutil

    wanted_member = os.path.normpath(f'{matrix_name}/{matrix_name}.mtx')
    # Write next to the target first so an interrupted download is never
    # mistaken for a finished matrix on the next run.
    partial_path = f'{destination_path}.part'

    with tarfile.open(fileobj=archive_stream, mode='r|gz') as archive:
        for member in archive:
            if not member.isfile():
                continue
            if os.path.normpath(member.name) != wanted_member:
                continue

            with archive.extractfile(member) as source, \
                    open(partial_path, 'wb') as target:
                shutil.copyfileobj(source, target)

            os.replace(partial_path, destination_path)
            return

    raise FileNotFoundError(
        f'{matrix_name}/{matrix_name}.mtx not found in downloaded archive')


def download_matrices():
    """Download every matrix in ``MATRIX_INDICES`` that is not present yet.

    Each matrix is fetched as ``<SUITE_SPARSE_BASE_URL>/<index>.tar.gz`` and
    stored as ``<SAVE_MATRICES_TO_FOLDER>/<name>.mtx``. Matrices whose target
    file already exists are skipped.

    Raises
    ------
    ValueError
        If ``SAVE_MATRICES_TO_FOLDER`` has not been set.
    """
    if SAVE_MATRICES_TO_FOLDER is None:
        raise ValueError(
            'SAVE_MATRICES_TO_FOLDER is not set; '
            'pass --save_matrices_to_folder <directory>')

    os.makedirs(SAVE_MATRICES_TO_FOLDER, exist_ok=True)

    for matrix_index in MATRIX_INDICES:
        matrix_name = matrix_index.split('/')[-1]

        mtx_filename = f'{matrix_name}.mtx'
        mtx_filepath = f'{SAVE_MATRICES_TO_FOLDER}/{mtx_filename}'

        if os.path.exists(mtx_filepath):
            print(f'Matrix {matrix_name} is already downloaded')
            continue

        matrix_url = f'{SUITE_SPARSE_BASE_URL}/{matrix_index}.tar.gz'

        with urlopen(matrix_url) as zip_response:
            extract_matrix(zip_response, matrix_name, mtx_filepath)

        print(f'Downloaded matrix {matrix_name}')


if __name__ == "__main__":
    arg_idx = 1

    # Minimal hand-rolled parser: unknown arguments are ignored.
    while arg_idx < len(sys.argv):
        if sys.argv[arg_idx] == '--save_matrices_to_folder':
            arg_idx += 1

            if arg_idx >= len(sys.argv):
                sys.exit('--save_matrices_to_folder requires a directory')

            SAVE_MATRICES_TO_FOLDER = sys.argv[arg_idx]

        arg_idx += 1

    download_matrices()
"./common/cuda_common.cuh" 11 | 12 | #define MAXTHREAD (256) 13 | // #define MINBLOCK (1) 14 | template 16 | __launch_bounds__(MAXTHREAD, minblocks) __global__ 17 | void kernel_general_wrapper(REAL *input, int width_y, int width_x, int iy_start, int iy_end, 18 | REAL *__var_4__, REAL *l2_cache_o, REAL *l2_cache_i, int iteration, 19 | int max_sm_flder, volatile int *iteration_done) { 20 | inner_general< 21 | REAL, LOCAL_TILE_Y, halo, 22 | regfolder::val, 23 | // 1, 24 | UseSMCache>(input, width_y, width_x, iy_start, iy_end, __var_4__, l2_cache_o, l2_cache_i, 25 | iteration, max_sm_flder, iteration_done); 26 | } 27 | 28 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 128, true); 29 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 128, false); 30 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 256, true); 31 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 256, false); 32 | 33 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 128, true); 34 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 128, false); 35 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 256, true); 36 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 256, false); 37 | // #if PERKS_ARCH==800 38 | // #elif PERKS_ARCH==700 39 | // #elif PERKS_ARCH==600 40 | // #error "should not be 600" 41 | // #elif PERKS_ARCH==000 42 | // #error "undefined" 43 | // #else 44 | // #error "wrong architecture" 45 | // #endif 46 | // template<> 47 | // __global__ void kernel_general_wrapper 48 | // ( float * __restrict__ input, int width_y, int width_x, 49 | // float * __restrict__ __var_4__, 50 | // float * __restrict__ l2_cache_o,float * __restrict__ l2_cache_i, 51 | // int iteration, 52 | // int max_sm_flder); 
def rotate(l, n):
    """Return a copy of sequence *l* rotated right by *n* positions.

    Negative *n* rotates left. *n* is taken modulo ``len(l)``, so shifts larger
    than the sequence wrap around instead of returning it unrotated. An empty
    sequence is returned as an (empty) copy.
    """
    if not l:
        return l[:]

    shift = n % len(l)
    if shift == 0:
        return l[:]

    return l[-shift:] + l[:-shift]


def get_files():
    """Parse the command line and return the opened input files.

    One or more positional paths are required; each is opened for reading by
    ``argparse.FileType('r')``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('files', type=argparse.FileType('r'), nargs='+')
    return parser.parse_args().files


def get_module_dir(dir_name):
    """Return ``BASE_DIR / dir_name``, creating it when it does not exist.

    Missing parent directories are created as well, so the call also works
    when the base image directory itself is absent.
    """
    module_dir = BASE_DIR / dir_name
    module_dir.mkdir(parents=True, exist_ok=True)
    return module_dir


def wrap_labels(ax, width, break_long_words=False):
    """Re-wrap the x tick labels of *ax* to at most *width* characters a line.

    Parameters
    ----------
    ax:
        Axes-like object offering ``get_xticklabels`` / ``set_xticklabels``.
    width: int
        Maximum line width handed to ``textwrap.fill``.
    break_long_words: bool, optional
        Forwarded to ``textwrap.fill``.
    """
    wrapped = [
        textwrap.fill(tick.get_text(), width=width,
                      break_long_words=break_long_words)
        for tick in ax.get_xticklabels()
    ]
    ax.set_xticklabels(wrapped, rotation=0)


def set_size(width, fraction=1):
    """Set figure dimensions to avoid scaling in LaTeX.

    Parameters
    ----------
    width: float
            Document textwidth or columnwidth in pts
    fraction: float, optional
            Fraction of the width which you wish the figure to occupy

    Returns
    -------
    fig_dim: tuple
            Dimensions of figure in inches
    """
    # Conversion factor used by the original code (TeX points per inch).
    inches_per_pt = 1 / 72.27

    # Golden ratio to set aesthetic figure height
    # https://disq.us/p/2940ij3
    golden_ratio = (5**.5 - 1) / 2

    fig_width_in = width * fraction * inches_per_pt
    fig_height_in = fig_width_in * golden_ratio

    return (fig_width_in, fig_height_in)


# Marker styles cycled through by the plotting scripts.
markers = [
    '.',  # point
    ',',  # pixel
    # 'o',  # circle (disabled)
    # 'v',  # triangle down (disabled)
    '^',  # triangle up
    '<',  # triangle_left
    '>',  # triangle_right
    '1',  # tri_down
    '2',  # tri_up
    '3',  # tri_left
    '4',  # tri_right
    '8',  # octagon
    's',  # square
    'p',  # pentagon
    '*',  # star
    'h',  # hexagon1
    'H',  # hexagon2
    '+',  # plus
    'x',  # x
    'D',  # diamond
    'd',  # thin_diamond
    '|',  # vline
]
DELIMITER = '-------------------------------------'

# Patterns for the text written by the benchmark shell scripts.
# NOTE(review): group names are taken from the .group(...) lookups used when
# parsing; the name of the trailing run-number group is not referenced
# anywhere, so 'run' is a placeholder. Confirm the nx/ny order against the
# script that prints the grid line.
NUM_GPUS_PATTERN = re.compile(r"Num GPUS: (?P<num_gpus>\d+)")
RUN_PARAMETERS_PATTERN = re.compile(
    r"(?P<num_iter>\d+) iterations on grid (?P<nx>\d+)x(?P<ny>\d+)")
PERF_DATA_PATTERN = re.compile(
    r"Execution time:\s+(?P<exec_time>[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?) "
    r"s on run (?P<run>\d+)")


def print_data_tabular(version_to_result_map, column_labels):
    """Write the results as CSV to stdout.

    Rows are versions (the mapping's keys), columns are *column_labels*, and
    each cell is the value stored for that version at that position.
    """
    row_labels = list(version_to_result_map.keys())
    full_perf_data = list(version_to_result_map.values())

    df = pd.DataFrame(full_perf_data, columns=column_labels,
                      index=row_labels)

    df.to_csv(sys.stdout)


def parse_results(results):
    """Parse benchmark output into per-version minimum execution times.

    Parameters
    ----------
    results: str
        Full text of a benchmark log. Versions are separated by ``DELIMITER``;
        inside a version, blank lines separate the "Running <name>" header and
        one chunk per configuration.

    Returns
    -------
    (version_to_result_map, labels): tuple
        ``version_to_result_map`` maps a version name to the list of minimum
        execution times, one per configuration. ``labels`` holds one
        "<n> GPU(s) (<nx>x<ny>)" string per configuration, taken from the
        first version in the log.

    Raises
    ------
    ValueError
        If a chunk does not follow the expected layout or has no timings.
    """
    version_to_result_map = defaultdict(list)
    num_gpus_grid_size_label = []

    # Text after the last delimiter is not a complete version block.
    for version_result in results.split(DELIMITER)[:-1]:
        chunks = version_result.strip().split('\n\n')
        # Header line is "Running <version name>"; drop the first word.
        version_name = ' '.join(chunks[0].split()[1:])
        x_axis_label = []

        for data_chunk in chunks[1:]:
            chunk_lines = data_chunk.splitlines()
            if len(chunk_lines) < 2:
                raise ValueError(f'Malformed result chunk: {data_chunk!r}')

            gpus_match = NUM_GPUS_PATTERN.match(chunk_lines[0])
            run_parameters_match = RUN_PARAMETERS_PATTERN.match(chunk_lines[1])
            if gpus_match is None or run_parameters_match is None:
                raise ValueError(f'Malformed result chunk: {data_chunk!r}')

            num_gpus = int(gpus_match.group('num_gpus'))
            grid_nx = int(run_parameters_match.group('nx'))
            grid_ny = int(run_parameters_match.group('ny'))

            # Column labels are collected once, from the first version only.
            if not num_gpus_grid_size_label:
                label = f"{num_gpus} GPU" + \
                    ("s" if num_gpus > 1 else "") + f" ({grid_nx}x{grid_ny})"
                x_axis_label.append(label)

            # Timing lines follow the two header lines; stop at the first
            # line that is not a timing line.
            execution_times = []
            for line in chunk_lines[2:]:
                perf_data_match = PERF_DATA_PATTERN.match(line)
                if perf_data_match is None:
                    break
                execution_times.append(
                    float(perf_data_match.group('exec_time')))

            if not execution_times:
                raise ValueError(
                    f'No execution times found for {version_name!r} '
                    f'in chunk: {data_chunk!r}')

            version_to_result_map[version_name].append(min(execution_times))

        if not num_gpus_grid_size_label:
            num_gpus_grid_size_label[:] = x_axis_label

    return version_to_result_map, num_gpus_grid_size_label


def main(argv=None):
    """Read the log named by the first argument and print it as CSV."""
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('usage: plot.py <results file>')

    with open(args[0]) as file:
        results = file.read()

    version_to_result_map, column_labels = parse_results(results)
    print_data_tabular(version_to_result_map, column_labels)


if __name__ == '__main__':
    main()
def comm_share_percent(comp, no_comp):
    """Split a run into computation and communication percentages.

    *no_comp* (time of the no-compute run) is expressed as a percentage of
    *comp* (time of the full run); the remainder up to 100 is attributed to
    computation.

    Returns
    -------
    (comp_percent, comm_percent): tuple
    """
    comm_percent = (no_comp / comp) * 100
    comp_percent = 100 - comm_percent
    return comp_percent, comm_percent


def plot_one_gpu(comp, no_comp, title):
    """Draw and save the two-panel comparison figure for one configuration.

    The left panel shows the percentage split, the right panel the measured
    times. The figure is saved under the 'Comp vs Comm' module directory
    using *title* as file name, then shown.

    Parameters
    ----------
    comp, no_comp: pandas.Series
        Times per version for the full and the no-compute runs.
    title: str
        Figure title and output file name.
    """
    # NOTE(review): statement nesting was reconstructed from a flattened
    # listing; confirm that the hatch/legend steps belong inside the per-axes
    # loop.
    comp_left, no_comp_left = comm_share_percent(comp, no_comp)

    # Actual execution time
    comp_right = comp

    fig, axes = plt.subplots(1, 2)
    fig.set_size_inches(10, 6)

    data_left = pd.DataFrame(
        {'Comp %': comp_left, 'Comm %': no_comp_left,
         'idx_col': comp_left.index})
    data_right = pd.DataFrame(
        {'Comp sec.': comp_right, 'Comm sec.': no_comp,
         'idx_col': comp_right.index}
    )

    colors = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'][:2])
    widths = cycle([0.8, 0.6])

    for ax, data in zip(axes, [data_left, data_right]):

        # First two columns only; 'idx_col' is not plotted.
        for i in range(2):
            data.iloc[:, i].plot.bar(
                ax=ax, color=next(colors), width=next(widths))

        bars = ax.patches

        bars[0].set_width(0.8)
        bars[1].set_width(0.8)

        # One hatch character per bar: blanks for the first series, slashes
        # for the second.
        hatches = ''.join(h * len(data_left) for h in [' ', '/'])

        for bar, hatch in zip(bars, hatches):
            bar.set_hatch(hatch)

        ax.set(xlabel=None)
        wrap_labels(ax, 10)

        ax.legend(loc='upper center',
                  bbox_to_anchor=(0.5,  # horizontal
                                  1.09),  # vertical
                  ncol=3, fancybox=True)

    plt.xticks(rotation=0, ha='center')
    fig.suptitle(title)
    plt.savefig(get_module_dir('Comp vs Comm') / title)
    plt.show()


def main():
    """Plot one figure per column shared by the two input CSV files."""
    plt.style.use('./paper.mplstyle')

    data_comp = pd.read_csv('data/comp.csv', index_col='Version')
    data_no_comp = pd.read_csv('data/no-comp.csv', index_col='Version')

    # Make sure both have the same version names
    data_no_comp.index = data_comp.index.copy()

    # DataFrame.items() replaces iteritems(), which pandas 2.0 removed.
    for (title, comp), (_, no_comp) in zip(data_comp.items(),
                                           data_no_comp.items()):
        plot_one_gpu(comp=comp, no_comp=no_comp, title=title)


if __name__ == '__main__':
    main()
DELIMITER = '-------------------------------------'

# Patterns for the text written by the benchmark shell scripts.
# NOTE(review): group names are taken from the .group(...) lookups used when
# parsing; the name of the trailing run-number group is not referenced
# anywhere, so 'run' is a placeholder. Confirm the nx/ny/nz order against the
# script that prints the grid line.
NUM_GPUS_PATTERN = re.compile(r"Num GPUS: (?P<num_gpus>\d+)")
RUN_PARAMETERS_PATTERN = re.compile(
    r"(?P<num_iter>\d+) iterations on grid "
    r"(?P<nx>\d+)x(?P<ny>\d+)x(?P<nz>\d+)")
PERF_DATA_PATTERN = re.compile(
    r"Execution time:\s+(?P<exec_time>[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?) "
    r"s on run (?P<run>\d+)")


def print_data_tabular(version_to_result_map, column_labels):
    """Write the results as CSV to stdout.

    Rows are versions (the mapping's keys), columns are *column_labels*, and
    each cell is the value stored for that version at that position.
    """
    row_labels = list(version_to_result_map.keys())
    full_perf_data = list(version_to_result_map.values())

    df = pd.DataFrame(full_perf_data, columns=column_labels,
                      index=row_labels)

    df.to_csv(sys.stdout)


def parse_results(results):
    """Parse 3D benchmark output into per-version minimum execution times.

    Parameters
    ----------
    results: str
        Full text of a benchmark log. Versions are separated by ``DELIMITER``;
        inside a version, blank lines separate the "Running <name>" header and
        one chunk per configuration.

    Returns
    -------
    (version_to_result_map, labels): tuple
        ``version_to_result_map`` maps a version name to the list of minimum
        execution times, one per configuration. ``labels`` holds one
        "<n> GPU(s) (<nx>x<ny>x<nz>)" string per configuration, taken from the
        first version in the log.

    Raises
    ------
    ValueError
        If a chunk does not follow the expected layout or has no timings.
    """
    version_to_result_map = defaultdict(list)
    num_gpus_grid_size_label = []

    # Text after the last delimiter is not a complete version block.
    for version_result in results.split(DELIMITER)[:-1]:
        chunks = version_result.strip().split('\n\n')
        # Header line is "Running <version name>"; drop the first word.
        version_name = ' '.join(chunks[0].split()[1:])
        x_axis_label = []

        for data_chunk in chunks[1:]:
            chunk_lines = data_chunk.splitlines()
            if len(chunk_lines) < 2:
                raise ValueError(f'Malformed result chunk: {data_chunk!r}')

            gpus_match = NUM_GPUS_PATTERN.match(chunk_lines[0])
            run_parameters_match = RUN_PARAMETERS_PATTERN.match(chunk_lines[1])
            if gpus_match is None or run_parameters_match is None:
                raise ValueError(f'Malformed result chunk: {data_chunk!r}')

            num_gpus = int(gpus_match.group('num_gpus'))
            grid_nx = int(run_parameters_match.group('nx'))
            grid_ny = int(run_parameters_match.group('ny'))
            grid_nz = int(run_parameters_match.group('nz'))

            # Column labels are collected once, from the first version only.
            if not num_gpus_grid_size_label:
                label = f"{num_gpus} GPU" + \
                    ("s" if num_gpus > 1 else "") + \
                    f" ({grid_nx}x{grid_ny}x{grid_nz})"
                x_axis_label.append(label)

            # Timing lines follow the two header lines; stop at the first
            # line that is not a timing line.
            execution_times = []
            for line in chunk_lines[2:]:
                perf_data_match = PERF_DATA_PATTERN.match(line)
                if perf_data_match is None:
                    break
                execution_times.append(
                    float(perf_data_match.group('exec_time')))

            if not execution_times:
                raise ValueError(
                    f'No execution times found for {version_name!r} '
                    f'in chunk: {data_chunk!r}')

            version_to_result_map[version_name].append(min(execution_times))

        if not num_gpus_grid_size_label:
            num_gpus_grid_size_label[:] = x_axis_label

    return version_to_result_map, num_gpus_grid_size_label


def main(argv=None):
    """Read the log named by the first argument and print it as CSV."""
    args = sys.argv[1:] if argv is None else argv
    if not args:
        sys.exit('usage: plot.py <results file>')

    with open(args[0]) as file:
        results = file.read()

    version_to_result_map, column_labels = parse_results(results)
    print_data_tabular(version_to_result_map, column_labels)


if __name__ == '__main__':
    main()
MICROSECOND = 1000000

# Iteration count of the run behind each input file, in command-line order.
NUM_ITERS = [1_000_000, 1_000_000, 1_000_000, 10_000]

STYLE_SHEET = 'https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle'


def subplot_grid(num_plots, max_columns=3):
    """Return ``(rows, columns)`` of the subplot grid for *num_plots* plots.

    Plots are laid out in rows of at most *max_columns*; with fewer plots than
    that, the single row is exactly as wide as the number of plots.
    """
    rows = math.ceil(num_plots / max_columns)
    columns = num_plots if num_plots < max_columns else max_columns
    return rows, columns


def main():
    """Plot time per iteration for every CSV file given on the command line.

    Each file becomes one subplot (columns of the CSV on the x axis, one line
    per version). The figure is saved as a PDF named after the first input
    file inside the 'Weak Scaling' module directory, then shown.
    """
    # NOTE(review): statement nesting was reconstructed from a flattened
    # listing; confirm the per-axes steps below match the intended loop body.
    module_dir = get_module_dir('Weak Scaling')

    plt.style.use(STYLE_SHEET)
    plt.rcParams.update({"axes.facecolor": (0.5, 0.5, 0.5, 0.1)})
    plt.rcParams['text.usetex'] = True

    files = get_files()

    rows, columns = subplot_grid(len(files))
    # squeeze=False always yields a 2D array, so flatten() also works when a
    # single file is given (plt.subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(rows, columns, layout='constrained',
                             squeeze=False)
    fig.set_size_inches(13, 3 * rows)

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    colors = rotate(list(reversed(colors)), 1)
    colors[1] = colors[-1]

    all_axes = axes.flatten()

    # zip stops at the shortest input: files beyond len(NUM_ITERS) are not
    # plotted.
    for ax, file, num_iter in zip(all_axes, files, NUM_ITERS):
        data = pd.read_csv(file, index_col='Version')
        # Total run time -> microseconds per iteration.
        data = data.T / num_iter * MICROSECOND

        ax = data.plot(ax=ax, color=colors)

        markers_cycle = cycle(markers)

        for line in ax.get_lines():
            line.set_marker(next(markers_cycle))
            line.set_linewidth(1.5)
            # Baseline versions are drawn dashed.
            if line.get_label().lower().startswith('baseline'):
                line.set_linestyle('dashed')

        ax.get_legend().remove()
        wrap_labels(ax, 10)

    # Only the first subplot carries the legend.
    all_axes[0].legend(loc='best', fancybox=True)

    fig.supylabel(r'$\mu$ seconds per iteration', weight='normal')

    title = Path(files[0].name).stem

    output_format = 'pdf'
    plt.savefig(module_dir / f'{title}.{output_format}', bbox_inches='tight',
                format=output_format, transparent=False)

    plt.show()


if __name__ == '__main__':
    main()
src/single-stream/multi-threaded-one-block-comm.cu 12 | src/single-stream/multi-threaded-two-block-comm.cu 13 | src/no-compute/multi-gpu-peer-tiling-no-compute.cu 14 | src/no-compute/multi-threaded-copy-no-compute.cu 15 | src/no-compute/multi-threaded-copy-overlap-no-compute.cu 16 | src/no-compute/multi-threaded-one-block-comm-no-compute.cu 17 | src/no-compute/multi-threaded-p2p-no-compute.cu 18 | src/no-compute/multi-threaded-two-block-comm-no-compute.cu) 19 | 20 | add_executable(jacobi2D_nvshmem 21 | src_nvshmem/main.cu 22 | src_nvshmem/common.cu 23 | PERKS/jacobi-general-wrapper.cu 24 | src_nvshmem/baseline/multi-threaded-nvshmem.cu 25 | src_nvshmem/baseline/multi-threaded-nvshmem-opt.cu 26 | src_nvshmem/PERKS/multi-stream-perks.cu 27 | src_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cu 28 | src_nvshmem/multi-stream/multi-gpu-peer-tiling.cu 29 | src_nvshmem/single-stream/multi-threaded-multi-block-comm.cu 30 | src_nvshmem/single-stream/multi-threaded-one-block-comm.cu 31 | src_nvshmem/single-stream/multi-threaded-two-block-comm.cu 32 | src_nvshmem/no-compute/design-1-multi-block-no-compute.cu 33 | src_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cu 34 | src_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cu 35 | src_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cu 36 | src_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cu 37 | src_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cu) 38 | 39 | target_include_directories(jacobi2D PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include") 40 | target_include_directories(jacobi2D PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/PERKS") 41 | 42 | target_include_directories(jacobi2D_nvshmem PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include_nvshmem") 43 | target_include_directories(jacobi2D_nvshmem PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/PERKS") 44 | 45 | find_package(OpenMP REQUIRED) 46 | find_package(NVSHMEM REQUIRED) 47 | find_package(MPI REQUIRED) 48 | 49 | target_link_libraries(jacobi2D 50 | 
CUDA::cudart 51 | OpenMP::OpenMP_CXX) 52 | 53 | target_link_libraries(jacobi2D_nvshmem 54 | CUDA::cudart 55 | OpenMP::OpenMP_CXX 56 | nvshmem::nvshmem 57 | MPI::MPI_CXX) 58 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/src/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include "../include/baseline/multi-threaded-copy-overlap.cuh" 4 | #include "../include/baseline/multi-threaded-copy.cuh" 5 | #include "../include/baseline/multi-threaded-p2p.cuh" 6 | #include "../include/baseline/single-threaded-copy.cuh" 7 | 8 | #include "../include/single-stream/multi-threaded-one-block-comm.cuh" 9 | #include "../include/single-stream/multi-threaded-two-block-comm.cuh" 10 | 11 | #include "../include/PERKS/multi-stream-perks.cuh" 12 | #include "../include/multi-stream/multi-gpu-peer-tiling.cuh" 13 | 14 | #include "../include/no-compute/multi-gpu-peer-tiling-no-compute.cuh" 15 | #include "../include/no-compute/multi-threaded-copy-no-compute.cuh" 16 | #include "../include/no-compute/multi-threaded-copy-overlap-no-compute.cuh" 17 | 18 | #include "../include/no-compute/multi-threaded-one-block-comm-no-compute.cuh" 19 | #include "../include/no-compute/multi-threaded-p2p-no-compute.cuh" 20 | #include "../include/no-compute/multi-threaded-two-block-comm-no-compute.cuh" 21 | 22 | using std::make_pair; 23 | 24 | int main(int argc, char *argv[]) { 25 | const std::array versions{ 26 | make_pair("Baseline Copy", BaselineMultiThreadedCopy::init), 27 | make_pair("Baseline Overlap", BaselineMultiThreadedCopyOverlap::init), 28 | make_pair("Baseline P2P", BaselineMultiThreadedP2P::init), 29 | 30 | make_pair("Design 1", MultiGPUPeerTiling::init), 31 | make_pair("Design 2", SSMultiThreadedTwoBlockComm::init), 32 | make_pair("PERKS", MultiStreamPERKS::init), 33 | 34 | make_pair("Baseline Copy (No computation)", BaselineMultiThreadedCopyNoCompute::init), 35 | make_pair("Baseline 
Overlap (No Computation)", 36 | BaselineMultiThreadedCopyOverlapNoCompute::init), 37 | make_pair("Baseline P2P (No Computation)", BaselineMultiThreadedP2PNoCompute::init), 38 | 39 | make_pair("Design 1 (No Computation)", MultiGPUPeerTilingNoCompute::init), 40 | make_pair("Design 2 (No Computation)", SSMultiThreadedTwoBlockCommNoCompute::init), 41 | 42 | // make_pair("Baseline Single Threaded Copy", BaselineSingleThreadedCopy::init), 43 | // make_pair("Single stream multi threaded (one thread block communicates)", 44 | // SSMultiThreadedOneBlockComm::init), 45 | 46 | // make_pair("Single stream multi threaded (one thread block communicates; no 47 | // computation)", 48 | // SSMultiThreadedOneBlockCommNoCompute::init), 49 | }; 50 | 51 | const int selection = get_argval(argv, argv + argc, "-v", 0); 52 | const bool silent = get_arg(argv, argv + argc, "-s"); 53 | 54 | auto &selected = versions[selection]; 55 | 56 | if (!silent) { 57 | std::cout << "Versions (select with -v):" 58 | << "\n"; 59 | for (size_t i = 0; i < versions.size(); ++i) { 60 | auto &v = versions[i]; 61 | std::cout << i << ":\t" << v.first << "\n"; 62 | } 63 | std::cout << std::endl; 64 | 65 | std::cout << "Running " << selected.first << "\n" << std::endl; 66 | } 67 | 68 | return selected.second(argc, argv); 69 | } 70 | -------------------------------------------------------------------------------- /Plots/weak-scaling-2.py: -------------------------------------------------------------------------------- 1 | import math 2 | from itertools import cycle 3 | from pathlib import Path 4 | 5 | import matplotlib.pyplot as plt 6 | import pandas as pd 7 | 8 | from common import get_files, markers, get_module_dir, wrap_labels, rotate 9 | 10 | from matplotlib.ticker import FormatStrFormatter 11 | 12 | MODULE_DIR = get_module_dir('Weak Scaling') 13 | 14 | plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle') 15 | 16 | plt.rcParams.update({"axes.facecolor": (0.5, 
MICROSECOND = 1000000

# Per input file, in command-line order: iteration count of the run, subplot
# title, and whether the y axis is logarithmic.
NUM_ITERS = [100_000, 100_000, 10_000]
TITLES = ['Weak Scaling', 'Strong Scaling (No Compute) ($512^3$)', 'Strong Scaling ($256^3$)',]
LOG_Y = [False, True, True]

STYLE_SHEET = 'https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle'


def subplot_grid(num_plots, max_columns=3):
    """Return ``(rows, columns)`` of the subplot grid for *num_plots* plots.

    Plots are laid out in rows of at most *max_columns*; with fewer plots than
    that, the single row is exactly as wide as the number of plots.
    """
    rows = math.ceil(num_plots / max_columns)
    columns = num_plots if num_plots < max_columns else max_columns
    return rows, columns


def main():
    """Plot time per iteration for every CSV file given on the command line.

    Each file becomes one titled subplot (columns of the CSV on the x axis,
    one line per version). The figure is saved as a PDF named after the first
    input file inside the 'Weak Scaling' module directory, then shown.
    """
    # NOTE(review): statement nesting was reconstructed from a flattened
    # listing; confirm the per-axes steps below match the intended loop body.
    module_dir = get_module_dir('Weak Scaling')

    plt.style.use(STYLE_SHEET)
    plt.rcParams.update({"axes.facecolor": (0.5, 0.5, 0.5, 0.1)})
    plt.rcParams['text.usetex'] = True

    files = get_files()

    rows, columns = subplot_grid(len(files))
    # squeeze=False always yields a 2D array, so flatten() also works when a
    # single file is given (plt.subplots would otherwise return a bare Axes).
    fig, axes = plt.subplots(rows, columns, layout='constrained',
                             squeeze=False)
    fig.set_size_inches(15, 3 * rows)

    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    colors = rotate(list(reversed(colors)), 1)
    colors[1] = colors[-1]

    all_axes = axes.flatten()

    # zip stops at the shortest input: files beyond the three configured
    # entries are not plotted.
    per_plot = zip(all_axes, files, NUM_ITERS, TITLES, LOG_Y)

    for position, (ax, file, num_iter, plot_title, use_log_y) in enumerate(per_plot):
        data = pd.read_csv(file, index_col='Version')
        # Total run time -> microseconds per iteration.
        data = data.T / num_iter * MICROSECOND

        ax = data.plot(ax=ax, color=colors, title=plot_title, logy=use_log_y)

        if use_log_y:
            ax.set_yscale('log', base=2)

        markers_cycle = cycle(markers)

        for line in ax.get_lines():
            line.set_marker(next(markers_cycle))
            line.set_linewidth(1.5)
            # Baseline versions are drawn dashed.
            if line.get_label().lower().startswith('baseline'):
                line.set_linestyle('dashed')

        # Every subplot except the first gets the x axis label.
        if position > 0:
            ax.set_xlabel('Number of GPUs', weight='bold', fontdict={'fontsize': 11.0})

        ax.get_legend().remove()
        wrap_labels(ax, 10)

    # Only the first subplot carries the legend.
    all_axes[0].legend(loc='best', fancybox=True)

    fig.supylabel(r'$\mu$ seconds per iteration', weight='normal')

    title = Path(files[0].name).stem

    output_format = 'pdf'
    plt.savefig(module_dir / f'{title}.{output_format}', bbox_inches='tight',
                format=output_format, transparent=False)

    plt.show()


if __name__ == '__main__':
    main()
Copy", BaselineMultiThreadedCopy::init), 33 | make_pair("Baseline Overlap", BaselineMultiThreadedCopyOverlap::init), 34 | make_pair("Baseline P2P", BaselineMultiThreadedP2P::init), 35 | 36 | make_pair("Design 1", MultiGPUPeerTiling::init), 37 | make_pair("Design 2", SSMultiThreadedTwoBlockComm::init), 38 | make_pair("PERKS", MultiStreamPERKS::init), 39 | 40 | make_pair("Baseline Copy (No computation)", BaselineMultiThreadedCopyNoCompute::init), 41 | make_pair("Baseline Overlap (No Computation)", 42 | BaselineMultiThreadedCopyOverlapNoCompute::init), 43 | make_pair("Baseline P2P (No Computation)", BaselineMultiThreadedP2PNoCompute::init), 44 | 45 | make_pair("Design 1 (No Computation)", MultiGPUPeerTilingNoCompute::init), 46 | make_pair("Design 2 (No Computation)", SSMultiThreadedTwoBlockCommNoCompute::init), 47 | 48 | // make_pair("Baseline Single Threaded Copy", BaselineSingleThreadedCopy::init), 49 | // make_pair("Naive Single stream multi threaded (one thread block 50 | // communicates)",SSMultiThreadedOneBlockComm::init), make_pair("Single stream multi 51 | // threaded (one thread block communicates; no computation)", 52 | // SSMultiThreadedOneBlockCommNoCompute::init), 53 | }; 54 | 55 | const int selection = get_argval(argv, argv + argc, "-v", 0); 56 | const bool silent = get_arg(argv, argv + argc, "-s"); 57 | 58 | auto &selected = versions[selection]; 59 | 60 | if (!silent) { 61 | std::cout << "Versions (select with -v):" 62 | << "\n"; 63 | for (size_t i = 0; i < versions.size(); ++i) { 64 | auto &v = versions[i]; 65 | std::cout << i << ":\t" << v.first << "\n"; 66 | } 67 | std::cout << std::endl; 68 | 69 | std::cout << "Running " << selected.first << "\n" << std::endl; 70 | } 71 | 72 | return selected.second(argc, argv); 73 | } 74 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/src_nvshmem/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 
| 4 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh" 5 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem.cuh" 6 | 7 | #include "../include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh" 8 | #include "../include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh" 9 | 10 | #include "../include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh" 11 | #include "../include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh" 12 | #include "../include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh" 13 | 14 | #include "../include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh" 15 | #include "../include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh" 16 | 17 | #include "../include_nvshmem/no-compute/design-1-multi-block-no-compute.cuh" 18 | #include "../include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh" 19 | #include "../include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh" 20 | 21 | #include "../include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh" 22 | 23 | #include "../include_nvshmem/PERKS/multi-stream-perks.cuh" 24 | 25 | using std::make_pair; 26 | 27 | int main(int argc, char *argv[]) { 28 | const std::array versions{ 29 | make_pair("Baseline NVSHMEM", BaselineMultiThreadedNvshmemOpt::init), 30 | 31 | make_pair("Design 1 (NVSHMEM)", MultiGPUPeerTilingNvshmem::init), 32 | make_pair("Design 2 (NVSHMEM)", SSMultiThreadedTwoBlockCommNvshmem::init), 33 | make_pair("Design 1 Partitioned (NVSHMEM)", MultiGPUMultiBlockPeerTilingNvshmem::init), 34 | make_pair("PERKS NVSHMEM", MultiStreamPERKSNVSHMEM::init), 35 | 36 | make_pair("Baseline NVSHMEM (No Computation)", 37 | BaselineMultiThreadedNvshmemOptNoCompute::init), 38 | 39 | make_pair("Design 1 NVSHMEM (No Computation)", MultiGPUPeerTilingNvshmemNoCompute::init), 40 | make_pair("Design 2 NVSHMEM (No Computation", 41 | SSMultiThreadedTwoBlockCommNvshmemNoCompute::init), 42 | make_pair("Design 
1 Partitioned (No Computation)", Design1MultiBlockNoComputation::init), 43 | 44 | // make_pair("Design 2 Partitioned (NVSHMEM)", 45 | // SSMultiThreadedMultiBlockCommNvshmem::init), make_pair("NVSHMEM Baseline Multi 46 | // Threaded", BaselineMultiThreadedNvshmem::init), make_pair("NVSHMEM Single stream 47 | // multi threaded (one thread block communicates)", 48 | // SSMultiThreadedOneBlockCommNvshmem::init), 49 | // make_pair("NVSHMEM Baseline Multi Threaded (No Computation)", 50 | // BaselineMultiThreadedNvshmemNoCompute::init), make_pair( 51 | // "NVSHMEM Single stream multi threaded (one thread block communicates; no 52 | // computation)", SSMultiThreadedOneBlockCommNvshmemNoCompute::init), 53 | 54 | }; 55 | 56 | const int selection = get_argval(argv, argv + argc, "-v", 0); 57 | const bool silent = get_arg(argv, argv + argc, "-s"); 58 | 59 | auto &selected = versions[selection]; 60 | 61 | if (!silent) { 62 | std::cout << "Versions (select with -v):" << std::endl; 63 | for (size_t i = 0; i < versions.size(); ++i) { 64 | auto &v = versions[i]; 65 | std::cout << i << ":\t" << v.first << "\n"; 66 | } 67 | std::cout << std::endl; 68 | 69 | std::cout << "Running " << selected.first << "\n" << std::endl; 70 | } 71 | 72 | return selected.second(argc, argv); 73 | } 74 | -------------------------------------------------------------------------------- /CG/src/single-gpu/discrete-standard.cu: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | /* 29 | * This sample implements a conjugate gradient solver on multiple GPU using 30 | * Unified Memory optimized prefetching and usage hints. 
31 | * 32 | */ 33 | 34 | // includes, system 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #include 45 | 46 | #include 47 | #include 48 | #include 49 | 50 | #include "../../include/common.h" 51 | 52 | #include 53 | #include 54 | 55 | namespace cg = cooperative_groups; 56 | 57 | // This should only be run with a single GPU 58 | int SingleGPUDiscreteStandard::init(int *device_csrRowIndices, int *device_csrColIndices, 59 | real *device_csrVal, const int num_rows, const int nnz, 60 | bool matrix_is_zero_indexed, const int iter_max, 61 | [[maybe_unused]] real *x_final_result, 62 | [[maybe_unused]] const double single_gpu_runtime, 63 | [[maybe_unused]] bool compare_to_single_gpu, 64 | [[maybe_unused]] bool compare_to_cpu, real *x_ref_single_gpu, 65 | [[maybe_unused]] real *x_ref_cpu) { 66 | // This version should be run with 1 GPU only but adding this check here just in case. 67 | int mype = nvshmem_my_pe(); 68 | 69 | if (mype == 0) { 70 | bool run_as_separate_version = true; 71 | 72 | double single_gpu_runtime = SingleGPUDiscreteStandard::run_single_gpu( 73 | iter_max, device_csrRowIndices, device_csrColIndices, device_csrVal, x_ref_single_gpu, 74 | num_rows, nnz, matrix_is_zero_indexed, run_as_separate_version); 75 | 76 | printf("Execution time: %8.4f s\n", single_gpu_runtime); 77 | } 78 | 79 | return 0; 80 | } -------------------------------------------------------------------------------- /Stencil/jacobi3D/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(jacobi3D 2 | src/main.cu 3 | src/common.cu 4 | src/baseline/multi-threaded-copy.cu 5 | src/baseline/multi-threaded-copy-overlap.cu 6 | src/baseline/multi-threaded-p2p.cu 7 | src/baseline/single-threaded-copy.cu 8 | src/multi-stream/multi-gpu-peer-tiling.cu 9 | src/single-stream/multi-threaded-one-block-comm.cu 10 | src/single-stream/multi-threaded-two-block-comm.cu 11 | 
src/PERKS/common/common.hpp 12 | src/PERKS/common/cub_utils.cuh 13 | src/PERKS/common/cuda_common.cuh 14 | src/PERKS/common/cuda_computation.cuh 15 | src/PERKS/common/jacobi_cuda.cuh 16 | src/PERKS/common/jacobi_reference.hpp 17 | src/PERKS/common/types.hpp 18 | src/PERKS/config.cuh 19 | src/PERKS/genconfig.cuh 20 | src/PERKS/j3d-general-kernels.cuh 21 | src/PERKS/j3d-general-wrapper.cu 22 | src/PERKS/multi-stream-perks.cu 23 | src/PERKS/perksconfig.cuh 24 | src/no-compute/multi-gpu-peer-tiling-no-compute.cu 25 | src/no-compute/multi-threaded-copy-no-compute.cu 26 | src/no-compute/multi-threaded-copy-overlap-no-compute.cu 27 | src/no-compute/multi-threaded-one-block-comm-no-compute.cu 28 | src/no-compute/multi-threaded-p2p-no-compute.cu 29 | src/no-compute/multi-threaded-two-block-comm-no-compute.cu) 30 | 31 | add_executable(jacobi3D_nvshmem 32 | src_nvshmem/baseline/multi-threaded-nvshmem.cu 33 | src_nvshmem/baseline/multi-threaded-nvshmem-opt.cu 34 | src_nvshmem/common.cu 35 | src_nvshmem/main.cu 36 | src_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cu 37 | src_nvshmem/multi-stream/multi-gpu-peer-tiling.cu 38 | src_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cu 39 | src_nvshmem/no-compute/multi-threaded-multi-block-comm-no-compute.cu 40 | src_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cu 41 | src_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cu 42 | src_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cu 43 | src_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cu 44 | src_nvshmem/PERKS-nvshmem/common/common.hpp 45 | src_nvshmem/PERKS-nvshmem/common/cub_utils.cuh 46 | src_nvshmem/PERKS-nvshmem/common/cuda_common.cuh 47 | src_nvshmem/PERKS-nvshmem/common/cuda_computation.cuh 48 | src_nvshmem/PERKS-nvshmem/common/jacobi_cuda.cuh 49 | src_nvshmem/PERKS-nvshmem/common/jacobi_reference.hpp 50 | src_nvshmem/PERKS-nvshmem/common/types.hpp 51 | src_nvshmem/PERKS-nvshmem/config.cuh 52 | 
src_nvshmem/PERKS-nvshmem/genconfig.cuh 53 | src_nvshmem/PERKS-nvshmem/j3d-general-kernels.cuh 54 | src_nvshmem/PERKS-nvshmem/j3d-general-wrapper.cu 55 | src_nvshmem/PERKS-nvshmem/multi-stream-perks-block.cu 56 | src_nvshmem/PERKS-nvshmem/multi-stream-perks.cu 57 | src_nvshmem/PERKS-nvshmem/perksconfig.cuh 58 | src_nvshmem/single-stream/multi-threaded-multi-block-comm.cu 59 | src_nvshmem/single-stream/multi-threaded-one-block-comm.cu 60 | src_nvshmem/single-stream/multi-threaded-two-block-comm.cu) 61 | 62 | target_include_directories(jacobi3D PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include") 63 | 64 | target_include_directories(jacobi3D_nvshmem PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include_nvshmem") 65 | 66 | find_package(OpenMP REQUIRED) 67 | find_package(NVSHMEM REQUIRED) 68 | find_package(MPI REQUIRED) 69 | 70 | target_link_libraries(jacobi3D 71 | CUDA::cudart 72 | OpenMP::OpenMP_CXX) 73 | 74 | target_link_libraries(jacobi3D_nvshmem 75 | CUDA::cudart 76 | OpenMP::OpenMP_CXX 77 | nvshmem::nvshmem 78 | MPI::MPI_CXX) 79 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/src_nvshmem/main.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "../include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem-block.h" 5 | #include "../include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem.h" 6 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh" 7 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem.cuh" 8 | #include "../include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh" 9 | #include "../include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh" 10 | #include "../include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh" 11 | #include "../include_nvshmem/no-compute/multi-threaded-multi-block-comm-no-compute.cuh" 12 | #include "../include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh" 13 | #include 
"../include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh" 14 | #include "../include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh" 15 | #include "../include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh" 16 | #include "../include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh" 17 | #include "../include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh" 18 | #include "../include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh" 19 | 20 | using std::make_pair; 21 | 22 | int main(int argc, char *argv[]) { 23 | const std::array versions{ 24 | make_pair("Baseline NVSHMEM", BaselineMultiThreadedNvshmemOpt::init), 25 | 26 | make_pair("Design 1 (NVSHMEM)", MultiGPUPeerTilingNvshmem::init), 27 | make_pair("Design 2 (NVSHMEM)", SSMultiThreadedTwoBlockCommNvshmem::init), 28 | 29 | make_pair("Design 1 Partitioned (NVSHMEM)", MultiGPUMultiBlockPeerTilingNvshmem::init), 30 | make_pair("Design 2 Partitioned (NVSHMEM)", SSMultiThreadedMultiBlockCommNvshmem::init), 31 | 32 | make_pair("Baseline NVSHMEM (No Computation)", 33 | BaselineMultiThreadedNvshmemOptNoCompute::init), 34 | make_pair("Design 1 NVSHMEM (No Compute)", MultiGPUPeerTilingNvshmemNoCompute::init), 35 | 36 | make_pair("PERKS NVSHMEM", MultiStreamPERKSNvshmem::init), 37 | make_pair("PERKS NVSHMEM Partitioned", MultiStreamPERKSNvshmemBlock::init), 38 | 39 | // make_pair("NVSHMEM Baseline Multi Threaded", BaselineMultiThreadedNvshmem::init), 40 | // make_pair("NVSHMEM Single stream multi threaded (one thread block communicates)", 41 | // SSMultiThreadedOneBlockCommNvshmem::init), 42 | // make_pair("NVSHMEM Baseline Multi Threaded (No Computation)", 43 | // BaselineMultiThreadedNvshmemNoCompute::init), make_pair( 44 | // "NVSHMEM Single stream multi threaded (one thread block communicates; no 45 | // computation)", SSMultiThreadedOneBlockCommNvshmemNoCompute::init), 46 | // make_pair( 47 | // "NVSHMEM Single stream multi threaded (two thread 
blocks communicate; 48 | // no computation)", SSMultiThreadedTwoBlockCommNvshmemNoCompute::init), 49 | // make_pair("Design 2 NVSHMEM (No Compute)", 50 | // SSMultiThreadedMultiBlockCommNvshmemNoCompute::init), 51 | }; 52 | 53 | const int selection = get_argval(argv, argv + argc, "-v", 0); 54 | const bool silent = get_arg(argv, argv + argc, "-s"); 55 | 56 | auto &selected = versions[selection]; 57 | 58 | if (!silent) { 59 | std::cout << "Versions (select with -v):" 60 | << "\n"; 61 | for (size_t i = 0; i < versions.size(); ++i) { 62 | auto &v = versions[i]; 63 | std::cout << i << ":\t" << v.first << "\n"; 64 | } 65 | std::cout << std::endl; 66 | 67 | std::cout << "Running " << selected.first << "\n" << std::endl; 68 | } 69 | 70 | return selected.second(argc, argv); 71 | } 72 | -------------------------------------------------------------------------------- /Scripts/full_bench.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | #SBATCH -J stencil-bench-weak 4 | #SBATCH -N 1 5 | #SBATCH -n 8 6 | #SBATCH -c 16 7 | #SBATCH -A proj16 8 | #SBATCH -p palamut-cuda 9 | #SBATCH --gres=gpu:8 10 | #SBATCH --time=24:00:00 11 | #SBATCH -o stencil_bench_%j.log 12 | 13 | import os 14 | import sys 15 | 16 | sys.path.append(os.getcwd()) 17 | 18 | from pathlib import Path 19 | from datetime import datetime 20 | from itertools import cycle 21 | 22 | import bench 23 | 24 | BIN = './jacobi' 25 | BIN_3D = './jacobi3d' 26 | 27 | BIN_NVSHMEM = './jacobi_nvshmem' 28 | BIN_3D_NVSHMEM = './jacobi3d_nvshmem' 29 | 30 | VERSIONS = [ 31 | 0, # Baseline Copy 32 | 1, # Baseline Overlap 33 | 2, # Baseline P2P 34 | 3, # Design 1 35 | # 4, # Design 2 36 | 5 # PERKS 37 | ] 38 | VERSIONS_NO_COMPUTE = [ 39 | 6, # Baseline Copy 40 | 7, # Baseline Overlap 41 | 8, # Baseline P2P 42 | 9, # Design 1 43 | 10 # Design 2 44 | ] 45 | 46 | VERSIONS_NVSHMEM = [ 47 | 0, # Baseline 48 | 1, # Design 1 49 | # 2, # Design 2 50 | 3, # Design 1 
def get_dim_str(dim):
    """Return the dimensions joined by ``'x'``, e.g. ``(256, 256)`` -> ``'256x256'``."""
    return 'x'.join(str(x) for x in dim)


def dim_func_last(dims):
    """Yield an endless series of sizes, doubling the last dimension each step.

    The first value yielded is ``dims`` itself (as a new list); every following
    value has its last entry multiplied by 2 relative to the previous one.

    ``dims`` may be any sequence. It is copied up front, so tuples such as the
    ``starting_dim`` entries in this file are accepted and the caller's object
    is never modified. Each yielded list is an independent copy.
    """
    # Work on a private list: tuples have neither .copy() nor item assignment.
    current = list(dims)

    while True:
        yield current.copy()
        current[-1] *= 2
#!/bin/bash

#SBATCH -J stencil-bench-strong
#SBATCH -N 1
#SBATCH -n 8
#SBATCH -c 16
#SBATCH -A proj16
#SBATCH -p palamut-cuda
#SBATCH --gres=gpu:8
#SBATCH --time=24:00:00
#SBATCH -o stencil_bench_strong_output_%j.log

# Strong-scaling benchmark driver for the 2D Jacobi binaries.
#
# For a fixed grid size, every version listed in the two maps below is run on
# 1..MAX_NUM_GPUS GPUs, NUM_RUNS times each, and whatever the binary prints is
# echoed with a "on run <i>" suffix.
#
# Variables can be overridden on the command line as `--NAME value`
# (for example `--NUM_ITER 500`); see the argument loop further down.

# Load the environment modules; their output is discarded.
. ./scripts/modules_truba.sh > /dev/null

MAX_NUM_GPUS=8
# Indexed by GPU count: entry N exposes the first N devices.
# Entry 0 is only a placeholder so that the index equals the GPU count.
CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )

# Display name -> value passed as `-v` to ./jacobi.
# NOTE(review): these indices must match the versions table compiled into the
# binary - confirm against src/main.cu before trusting the labels.
declare -A version_name_to_idx_map

version_name_to_idx_map["Baseline Copy"]=0
version_name_to_idx_map["Baseline Copy Overlap"]=1
version_name_to_idx_map["Baseline P2P"]=2
#version_name_to_idx_map["Baseline Single Copy"]=3

version_name_to_idx_map["Single Stream 1TB"]=4
version_name_to_idx_map["Single Stream 2TB"]=5
version_name_to_idx_map["Double Stream"]=6

version_name_to_idx_map["Baseline Copy (No compute)"]=7
version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
version_name_to_idx_map["Baseline P2P (No Compute)"]=9

version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
version_name_to_idx_map["Double Stream (No Compute)"]=12

# Display name -> value passed as `-v` to ./jacobi_nvshmem.
# NOTE(review): same caveat as above - verify against src_nvshmem/main.cu;
# the names and the highest index here may be out of date.
declare -A version_name_to_idx_map_nvshmem

version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1

version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4

version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5
version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6

version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7
version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8
version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9


# Command prefixes. They are expanded unquoted below, so word splitting turns
# each into the program name plus its `-s 1` arguments.
BIN="./jacobi -s 1"
NV_BIN="./jacobi_nvshmem -s 1"


# Defaults; an already-exported environment value takes precedence.
NUM_ITER=${NUM_ITER:-10000}
NUM_RUNS=${NUM_RUNS:-5}

# Minimal `--NAME value` parser: any argument containing "--" has its first
# "--" removed and the remainder is declared as a variable holding the
# following argument. Arguments are consumed one at a time.
while [ $# -gt 0 ]; do

    if [[ $1 == *"--"* ]]; then
        param="${1/--/}"
        declare $param="$2"
    fi

    shift
done


# With start == bound == 16384 this loop body runs exactly once (square
# 16384x16384 grid); the loop form only makes it easy to widen the sweep.
for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do

    NX=${STARTING_NX}
    NY=${NX}

    # Non-NVSHMEM versions: the GPU count is controlled via CUDA_VISIBLE_DEVICES.
    # Bash does not guarantee any particular order for associative-array keys.
    for version_name in "${!version_name_to_idx_map[@]}"; do
        echo "Running ${version_name}"; echo ""

        version_idx=${version_name_to_idx_map[$version_name]}

        for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do
            export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}

            echo "Num GPUS: ${NUM_GPUS}"
            echo "${NUM_ITER} iterations on grid ${NX}x${NY}"

            for (( i=1; i <= ${NUM_RUNS}; i++ )); do
                # Capture the binary's stdout and tag it with the run number.
                execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
                echo "${execution_time} on run ${i}"
            done

            printf "\n"

        done

        echo "-------------------------------------"
    done


    # NVSHMEM versions: the GPU count is the number of MPI ranks (-np).
    # CUDA_VISIBLE_DEVICES keeps the last value exported by the loop above.
    for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
        echo "Running ${version_name}"; echo ""

        version_idx=${version_name_to_idx_map_nvshmem[$version_name]}

        for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP+=1 )); do

            echo "Num GPUS: ${NP}"
            echo "${NUM_ITER} iterations on grid ${NX}x${NY}"

            for (( i=1; i <= ${NUM_RUNS}; i++ )); do
                execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
                echo "${execution_time} on run ${i}"
            done

            printf "\n"

        done

        echo "-------------------------------------"
    done

    echo "#####################################"
done
,Dot 1 (+Reset),Dot 2 (+Reset),Global Reduction 1 (+Barrier),Global Reduction 2 (+Barrier),Memcpy Dot to Host 1,Memcpy Dot to Host 2,NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,SpMV 5 | (generated)_tridiagonal,0.2094,0.1695,0.2585,0.2599,0.0680,0.0673,0.1108,0.1102,0.1827,0.2416,0.1814,0.5409 6 | ecology2,0.0746,0.0736,0.2211,0.2304,0.0657,0.0653,0.0952,0.0971,0.0593,0.0588,0.0605,0.1004 7 | hood,0.0705,0.0700,0.2160,0.2223,0.0652,0.0642,0.1086,0.0973,0.0553,0.0549,0.0559,0.4870 8 | bmwcra_1,0.0700,0.0696,0.2146,0.2280,0.0650,0.0640,0.2559,0.0973,0.0549,0.0546,0.0557,0.9622 9 | consph,0.0698,0.0694,0.2164,0.2261,0.0653,0.0648,0.2090,0.0971,0.0545,0.0544,0.0554,0.3745 10 | thermomech_dM,0.0701,0.0699,0.2182,0.2286,0.0649,0.0645,0.1132,0.0987,0.0553,0.0550,0.0561,0.2324 11 | tmt_sym,0.0735,0.0731,0.2180,0.2266,0.0662,0.0657,0.0945,0.0967,0.0580,0.0577,0.0590,0.1137 12 | crankseg_1,0.0702,0.0687,0.2249,0.2288,0.0669,0.0650,7.6064,0.0960,0.0543,0.0540,0.0554,1.6466 13 | crankseg_2,0.0717,0.0699,0.2342,0.2338,0.0684,0.0658,10.8982,0.0978,0.0547,0.0544,0.0556,1.8452 14 | Queen_4147,0.1025,0.0889,0.2284,0.2247,0.0685,0.0650,0.2232,0.0959,0.0722,0.0720,0.0681,15.1167 15 | Bump_2911,0.0957,0.0850,0.2273,0.2234,0.0686,0.0654,0.3731,0.0966,0.0672,0.0676,0.0653,4.7050 16 | G3_circuit,0.0763,0.0751,0.2168,0.2256,0.0654,0.0648,0.1236,0.0975,0.0600,0.0596,0.0608,0.2145 17 | StocF-1465,0.0835,0.0780,0.2241,0.2272,0.0659,0.0638,0.1718,0.0955,0.0616,0.0609,0.0605,0.5684 18 | Flan_1565,0.0857,0.0781,0.2274,0.2240,0.0676,0.0644,0.1596,0.0962,0.0621,0.0622,0.0606,4.5419 19 | audikw_1,0.0834,0.0769,0.2250,0.2208,0.0675,0.0643,0.0936,0.0959,0.0599,0.0595,0.0586,4.8515 20 | Serena,0.0836,0.0779,0.2228,0.2217,0.0669,0.0640,0.2073,0.0955,0.0613,0.0615,0.0600,2.6818 21 | Geo_1438,0.0842,0.0781,0.2225,0.2195,0.0674,0.0648,0.1707,0.0960,0.0620,0.0623,0.0605,2.2589 22 | 
Hook_1498,0.0831,0.0775,0.2219,0.2201,0.0665,0.0640,0.1343,0.0964,0.0617,0.0623,0.0606,1.8148 23 | ldoor,0.0808,0.0755,0.2255,0.2252,0.0667,0.0644,0.2791,0.0954,0.0593,0.0595,0.0585,0.8216 24 | 25 | 26 | Results for version Profiling Discrete Pipelined NVSHMEM (No Overlap) => 27 | ,Global Reductions (+Barrier),Memcpy Dots To Host,Merged Dots (+Reset),NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,Saxpy 4,Saxpy 5,Saxpy 6,SpMV 28 | (generated)_tridiagonal,0.1818,0.0681,0.2480,0.1142,0.1210,0.1980,0.2435,0.2461,0.2417,0.2439,0.2443,0.5201 29 | ecology2,0.1751,0.0658,0.0788,0.1014,0.1104,0.0590,0.0588,0.0589,0.0586,0.0584,0.0584,0.0987 30 | hood,0.1740,0.0659,0.0724,0.1124,0.1051,0.0558,0.0558,0.0558,0.0553,0.0550,0.0552,0.4796 31 | bmwcra_1,0.1703,0.0669,0.0718,0.2503,0.1036,0.0552,0.0550,0.0549,0.0547,0.0544,0.0544,0.9414 32 | consph,0.1735,0.0667,0.0722,0.1561,0.1060,0.0556,0.0553,0.0554,0.0551,0.0549,0.0548,0.4441 33 | thermomech_dM,0.1741,0.0665,0.0726,0.0938,0.1028,0.0562,0.0561,0.0560,0.0557,0.0555,0.0556,0.2408 34 | tmt_sym,0.1746,0.0667,0.0764,0.0988,0.1040,0.0584,0.0582,0.0583,0.0577,0.0576,0.0578,0.1144 35 | crankseg_1,0.1733,0.0690,0.0765,7.5716,0.1040,0.0555,0.0550,0.0552,0.0548,0.0544,0.0546,1.7455 36 | crankseg_2,0.1758,0.0682,0.0777,10.8521,0.1061,0.0553,0.0550,0.0549,0.0546,0.0544,0.0545,1.8073 37 | Queen_4147,0.1791,0.0697,0.1071,0.2459,0.1102,0.0801,0.0792,0.0817,0.0730,0.0685,0.0717,15.1174 38 | Bump_2911,0.1799,0.0692,0.0949,0.1052,0.1058,0.0735,0.0728,0.0742,0.0676,0.0646,0.0646,4.9992 39 | G3_circuit,0.1746,0.0663,0.0823,0.0891,0.1030,0.0607,0.0605,0.0610,0.0604,0.0601,0.0601,0.2353 40 | StocF-1465,0.1827,0.0681,0.0828,0.0944,0.1040,0.0626,0.0625,0.0665,0.0621,0.0600,0.0596,0.6804 41 | Flan_1565,0.1787,0.0685,0.0846,0.3012,0.1051,0.0644,0.0653,0.0662,0.0623,0.0597,0.0598,4.4129 42 | audikw_1,0.1788,0.0699,0.0811,2.9359,0.1025,0.0606,0.0601,0.0620,0.0598,0.0578,0.0579,1.9612 43 | 
#!/bin/bash

#SBATCH --job-name=stencil-bench
#SBATCH --ntasks=8
#SBATCH --gres=gpu:8
#SBATCH --partition hgx2q
#SBATCH --time=03:00:00
#SBATCH --output=sbatch_output_%j.log

# Strong-scaling benchmark driver for the 2D Jacobi binaries.
#
# For a fixed grid size, every version listed in the two maps below is run on
# 1..MAX_NUM_GPUS GPUs, NUM_RUNS times each, and whatever the binary prints is
# echoed with a "on run <i>" suffix.
#
# Variables can be overridden on the command line as `--NAME value`
# (for example `--NUM_ITER 500`); see the argument loop further down.

# Load the environment modules; their output is discarded.
. ./scripts/modules.sh > /dev/null

MAX_NUM_GPUS=8
# Indexed by GPU count: entry N exposes the first N devices.
# Entry 0 is only a placeholder so that the index equals the GPU count.
CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )

# Display name -> value passed as `-v` to ./jacobi.
# NOTE(review): these indices must match the versions table compiled into the
# binary - confirm against src/main.cu before trusting the labels.
declare -A version_name_to_idx_map

version_name_to_idx_map["Baseline Copy"]=0
version_name_to_idx_map["Baseline Copy Overlap"]=1
version_name_to_idx_map["Baseline P2P"]=2
#version_name_to_idx_map["Baseline Single Copy"]=3

version_name_to_idx_map["Single Stream 1TB"]=4
version_name_to_idx_map["Single Stream 2TB"]=5
version_name_to_idx_map["Double Stream"]=6

version_name_to_idx_map["Baseline Copy (No compute)"]=7
version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
version_name_to_idx_map["Baseline P2P (No Compute)"]=9

version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
version_name_to_idx_map["Double Stream (No Compute)"]=12

# Display name -> value passed as `-v` to ./jacobi_nvshmem.
# NOTE(review): same caveat as above - verify against src_nvshmem/main.cu;
# the names and the highest index here may be out of date.
declare -A version_name_to_idx_map_nvshmem

version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1

version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4

version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5
version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6

version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7
version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8
version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9

# Command prefixes. They are expanded unquoted below, so word splitting turns
# each into the program name plus its `-s 1` arguments.
BIN="./jacobi -s 1"
NV_BIN="./jacobi_nvshmem -s 1"

# Defaults; an already-exported environment value takes precedence.
# NOTE(review): MAX_NX, MAX_NY and STARTING_NY are not read anywhere below,
# and STARTING_NX is overwritten by the outer for-loop, so in this script the
# grid size is effectively hard-coded to 16384x16384.
MAX_NX=${MAX_NX:-16384}
MAX_NY=${MAX_NY:-16384}

STARTING_NX=${STARTING_NX:-4096}
STARTING_NY=${STARTING_NY:-4096}

NUM_ITER=${NUM_ITER:-100000}
NUM_RUNS=${NUM_RUNS:-5}

# Minimal `--NAME value` parser: any argument containing "--" has its first
# "--" removed and the remainder is declared as a variable holding the
# following argument. Arguments are consumed one at a time.
while [ $# -gt 0 ]; do

    if [[ $1 == *"--"* ]]; then
        param="${1/--/}"
        declare $param="$2"
    fi

    shift
done

# With start == bound == 16384 this loop body runs exactly once (square
# 16384x16384 grid); the loop form only makes it easy to widen the sweep.
for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do

    NX=${STARTING_NX}
    NY=${NX}

    # Non-NVSHMEM versions: the GPU count is controlled via CUDA_VISIBLE_DEVICES.
    # Bash does not guarantee any particular order for associative-array keys.
    for version_name in "${!version_name_to_idx_map[@]}"; do
        echo "Running ${version_name}"; echo ""

        version_idx=${version_name_to_idx_map[$version_name]}

        for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do
            export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}

            echo "Num GPUS: ${NUM_GPUS}"
            echo "${NUM_ITER} iterations on grid ${NX}x${NY}"

            for (( i=1; i <= ${NUM_RUNS}; i++ )); do
                # Capture the binary's stdout and tag it with the run number.
                execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
                echo "${execution_time} on run ${i}"
            done

            printf "\n"

        done

        echo "-------------------------------------"
    done

    # NVSHMEM versions: the GPU count is the number of MPI ranks (-np).
    # CUDA_VISIBLE_DEVICES keeps the last value exported by the loop above.
    for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
        echo "Running ${version_name}"; echo ""

        version_idx=${version_name_to_idx_map_nvshmem[$version_name]}

        for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP+=1 )); do

            echo "Num GPUS: ${NP}"
            echo "${NUM_ITER} iterations on grid ${NX}x${NY}"

            for (( i=1; i <= ${NUM_RUNS}; i++ )); do
                execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
                echo "${execution_time} on run ${i}"
            done

            printf "\n"

        done

        echo "-------------------------------------"
    done

    echo "#####################################"
done
$(patsubst $(SRC_DIR_3D)/%.cu, $(DEP_DIR_3D)/%.d, $(SRCS_3D)) 32 | # ========================================================================= 33 | 34 | # ========================================================================= 35 | # 2D NVSHMEM 36 | SRC_DIR_2D_NVSHMEM = $(VERSION_2D)/src_nvshmem/ 37 | OBJ_DIR_2D_NVSHMEM := $(OBJ_ROOT)/$(VERSION_2D)_nvshmem/ 38 | DEP_DIR_2D_NVSHMEM := $(OBJ_DIR_2D_NVSHMEM)/.deps/ 39 | 40 | SRCS_2D_NVSHMEM = $(call rwildcard,$(SRC_DIR_2D_NVSHMEM),*.cu) 41 | OBJS_2D_NVSHMEM := $(patsubst $(SRC_DIR_2D_NVSHMEM)/%.cu, $(OBJ_DIR_2D_NVSHMEM)/%.o, $(SRCS_2D_NVSHMEM)) 42 | DEPS_2D_NVSHMEM := $(patsubst $(SRC_DIR_2D_NVSHMEM)/%.cu, $(DEP_DIR_2D_NVSHMEM)/%.d, $(SRCS_2D_NVSHMEM)) 43 | # ========================================================================= 44 | 45 | # ========================================================================= 46 | # 3D NVSHMEM 47 | SRC_DIR_3D_NVSHMEM = $(VERSION_3D)/src_nvshmem/ 48 | OBJ_DIR_3D_NVSHMEM := $(OBJ_ROOT)/$(VERSION_3D)_nvshmem/ 49 | DEP_DIR_3D_NVSHMEM := $(OBJ_DIR_3D_NVSHMEM)/.deps/ 50 | 51 | SRCS_3D_NVSHMEM = $(call rwildcard,$(SRC_DIR_3D_NVSHMEM),*.cu) 52 | OBJS_3D_NVSHMEM := $(patsubst $(SRC_DIR_3D_NVSHMEM)/%.cu, $(OBJ_DIR_3D_NVSHMEM)/%.o, $(SRCS_3D_NVSHMEM)) 53 | DEPS_3D_NVSHMEM := $(patsubst $(SRC_DIR_3D_NVSHMEM)/%.cu, $(DEP_DIR_3D_NVSHMEM)/%.d, $(SRCS_3D_NVSHMEM)) 54 | # ========================================================================= 55 | stencil: jacobi jacobi_nvshmem 56 | jacobi: jacobi2D jacobi3D 57 | jacobi_nvshmem: jacobi2D_nvshmem jacobi3D_nvshmem 58 | 59 | # ========================================================================= 60 | jacobi2D: $(OBJS_2D) $(2D_PERKS_WRAPPER) 61 | $(LINK) 62 | 63 | jacobi3D: $(OBJS_3D) 64 | $(LINK) 65 | 66 | $(OBJS_2D) : $(OBJ_DIR_2D)/%.o : $(SRC_DIR_2D)/%.cu $(DEP_DIR_2D)/%.d | $(DEP_DIR_2D) 67 | $(call COMPILE, $(DEP_DIR_2D)) 68 | 69 | $(OBJS_3D) : $(OBJ_DIR_3D)/%.o : $(SRC_DIR_3D)/%.cu $(DEP_DIR_3D)/%.d | $(DEP_DIR_3D) 70 | $(call 
COMPILE, $(DEP_DIR_3D)) 71 | # ========================================================================= 72 | 73 | # ========================================================================= 74 | jacobi2D_nvshmem: $(OBJS_2D_NVSHMEM) $(2D_PERKS_WRAPPER) 75 | $(LINK_NVSHMEM) 76 | 77 | jacobi3D_nvshmem: $(OBJS_3D_NVSHMEM) 78 | $(LINK_NVSHMEM) 79 | 80 | $(OBJS_2D_NVSHMEM) : $(OBJ_DIR_2D_NVSHMEM)/%.o : $(SRC_DIR_2D_NVSHMEM)/%.cu $(DEP_DIR_2D_NVSHMEM)/%.d | $(DEP_DIR_2D_NVSHMEM) 81 | $(call COMPILE_NVSHMEM, $(DEP_DIR_2D_NVSHMEM)) 82 | 83 | $(OBJS_3D_NVSHMEM) : $(OBJ_DIR_3D_NVSHMEM)/%.o : $(SRC_DIR_3D_NVSHMEM)/%.cu $(DEP_DIR_3D_NVSHMEM)/%.d | $(DEP_DIR_3D_NVSHMEM) 84 | $(call COMPILE_NVSHMEM, $(DEP_DIR_3D_NVSHMEM)) 85 | # ========================================================================= 86 | 87 | $(DEP_DIR_2D): 88 | @mkdir -p $(DEP_DIR_2D) 89 | 90 | $(DEP_DIR_3D): 91 | @mkdir -p $(DEP_DIR_3D) 92 | 93 | $(DEP_DIR_2D_NVSHMEM): 94 | @mkdir -p $(DEP_DIR_2D_NVSHMEM) 95 | 96 | $(DEP_DIR_3D_NVSHMEM): 97 | @mkdir -p $(DEP_DIR_3D_NVSHMEM) 98 | 99 | $(DEPS_2D): 100 | 101 | $(DEPS_3D): 102 | 103 | $(DEPS_2D_NVSHMEM): 104 | 105 | $(DEPS_3D_NVSHMEM): 106 | 107 | include $(wildcard $(DEPS_2D)) 108 | include $(wildcard $(DEPS_3D)) 109 | include $(wildcard $(DEPS_2D_NVSHMEM)) 110 | include $(wildcard $(DEPS_3D_NVSHMEM)) 111 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/scripts/weak_scale_comm_bench_truba.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J stencil-bench-weak 4 | #SBATCH -N 1 5 | #SBATCH -n 8 6 | #SBATCH -c 16 7 | #SBATCH -A proj16 8 | #SBATCH -p palamut-cuda 9 | #SBATCH --gres=gpu:8 10 | #SBATCH --time=24:00:00 11 | #SBATCH -o stencil_bench_weak_output_%j.log 12 | 13 | . 
./scripts/modules_truba.sh > /dev/null 14 | 15 | MAX_NUM_GPUS=8 16 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 17 | 18 | declare -A version_name_to_idx_map 19 | 20 | declare -A version_name_to_idx_map 21 | 22 | version_name_to_idx_map["Baseline Copy"]=0 23 | version_name_to_idx_map["Baseline Copy Overlap"]=1 24 | version_name_to_idx_map["Baseline P2P"]=2 25 | #version_name_to_idx_map["Baseline Single Copy"]=3 26 | 27 | version_name_to_idx_map["Single Stream 1TB"]=4 28 | version_name_to_idx_map["Single Stream 2TB"]=5 29 | version_name_to_idx_map["Double Stream"]=6 30 | 31 | version_name_to_idx_map["Baseline Copy (No compute)"]=7 32 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8 33 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9 34 | 35 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10 36 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11 37 | version_name_to_idx_map["Double Stream (No Compute)"]=12 38 | 39 | declare -A version_name_to_idx_map_nvshmem 40 | 41 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0 42 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1 43 | 44 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2 45 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3 46 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4 47 | 48 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5 49 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6 50 | 51 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7 52 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8 53 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9 54 | 55 | BIN="./jacobi -s 1" 56 | NV_BIN="./jacobi_nvshmem -s 1" 57 | 58 | NUM_ITER=${NUM_ITER:-10000} 59 | NUM_RUNS=${NUM_RUNS:-5} 60 | 61 | while [ $# -gt 0 
]; do 62 | 63 | if [[ $1 == *"--"* ]]; then 64 | param="${1/--/}" 65 | declare $param="$2" 66 | fi 67 | 68 | shift 69 | done 70 | 71 | 72 | for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do 73 | 74 | for version_name in "${!version_name_to_idx_map[@]}"; do 75 | echo "Running ${version_name}"; echo "" 76 | NX=${STARTING_NX} 77 | NY=${NX} 78 | 79 | version_idx=${version_name_to_idx_map[$version_name]} 80 | 81 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS*=2 )); do 82 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 83 | 84 | echo "Num GPUS: ${NUM_GPUS}" 85 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}" 86 | 87 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 88 | execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 89 | echo "${execution_time} on run ${i}" 90 | done 91 | 92 | printf "\n" 93 | 94 | NX=$((2*NX)) 95 | 96 | done 97 | 98 | echo "-------------------------------------" 99 | done 100 | 101 | for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do 102 | echo "Running ${version_name}"; echo "" 103 | NX=${STARTING_NX} 104 | NY=${NX} 105 | 106 | version_idx=${version_name_to_idx_map_nvshmem[$version_name]} 107 | 108 | for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP*=2 )); do 109 | 110 | echo "Num GPUS: ${NP}" 111 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}" 112 | 113 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 114 | execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 115 | echo "${execution_time} on run ${i}" 116 | done 117 | 118 | printf "\n" 119 | 120 | NX=$((2*NX)) 121 | 122 | done 123 | 124 | echo "-------------------------------------" 125 | done 126 | 127 | echo "#####################################" 128 | done 129 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/scripts/weak_scale_comp_bench_truba.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J stencil-bench-weak 4 | #SBATCH -N 1 5 | #SBATCH -n 8 6 | #SBATCH -c 16 7 | #SBATCH -A proj16 8 | #SBATCH -p palamut-cuda 9 | #SBATCH --gres=gpu:8 10 | #SBATCH --time=24:00:00 11 | #SBATCH -o stencil_bench_weak_output_%j.log 12 | 13 | . ./scripts/modules_truba.sh > /dev/null 14 | 15 | MAX_NUM_GPUS=8 16 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 17 | 18 | declare -A version_name_to_idx_map 19 | 20 | declare -A version_name_to_idx_map 21 | 22 | version_name_to_idx_map["Baseline Copy"]=0 23 | version_name_to_idx_map["Baseline Copy Overlap"]=1 24 | version_name_to_idx_map["Baseline P2P"]=2 25 | #version_name_to_idx_map["Baseline Single Copy"]=3 26 | 27 | version_name_to_idx_map["Single Stream 1TB"]=4 28 | version_name_to_idx_map["Single Stream 2TB"]=5 29 | version_name_to_idx_map["Double Stream"]=6 30 | 31 | version_name_to_idx_map["Baseline Copy (No compute)"]=7 32 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8 33 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9 34 | 35 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10 36 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11 37 | version_name_to_idx_map["Double Stream (No Compute)"]=12 38 | 39 | declare -A version_name_to_idx_map_nvshmem 40 | 41 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0 42 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1 43 | 44 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2 45 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3 46 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4 47 | 48 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5 49 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6 50 | 51 | version_name_to_idx_map_nvshmem["NVSHMEM 
Single Stream 1TB (No Compute)"]=7 52 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8 53 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9 54 | 55 | BIN="./jacobi -s 1" 56 | NV_BIN="./jacobi_nvshmem -s 1" 57 | 58 | NUM_ITER=${NUM_ITER:-10000} 59 | NUM_RUNS=${NUM_RUNS:-5} 60 | 61 | while [ $# -gt 0 ]; do 62 | 63 | if [[ $1 == *"--"* ]]; then 64 | param="${1/--/}" 65 | declare $param="$2" 66 | fi 67 | 68 | shift 69 | done 70 | 71 | 72 | for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do 73 | 74 | for version_name in "${!version_name_to_idx_map[@]}"; do 75 | echo "Running ${version_name}"; echo "" 76 | NX=${STARTING_NX} 77 | NY=${NX} 78 | 79 | version_idx=${version_name_to_idx_map[$version_name]} 80 | 81 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS*=2 )); do 82 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 83 | 84 | echo "Num GPUS: ${NUM_GPUS}" 85 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}" 86 | 87 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 88 | execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 89 | echo "${execution_time} on run ${i}" 90 | done 91 | 92 | printf "\n" 93 | 94 | NY=$((2*NY)) 95 | 96 | done 97 | 98 | echo "-------------------------------------" 99 | done 100 | 101 | for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do 102 | echo "Running ${version_name}"; echo "" 103 | NX=${STARTING_NX} 104 | NY=${NX} 105 | 106 | version_idx=${version_name_to_idx_map_nvshmem[$version_name]} 107 | 108 | for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP*=2 )); do 109 | 110 | echo "Num GPUS: ${NP}" 111 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}" 112 | 113 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 114 | execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 115 | echo "${execution_time} on run ${i}" 116 | done 117 | 118 | printf "\n" 119 | 120 | NY=$((2*NY)) 121 | 122 
| done 123 | 124 | echo "-------------------------------------" 125 | done 126 | 127 | echo "#####################################" 128 | done 129 | -------------------------------------------------------------------------------- /Stencil/jacobi2D/scripts/weak_scale_bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=stencil-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition hgx2q 7 | #SBATCH --time=03:00:00 8 | #SBATCH --output=sbatch_output_%j.log 9 | 10 | . ./scripts/modules.sh > /dev/null 11 | 12 | MAX_NUM_GPUS=8 13 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 14 | 15 | declare -A version_name_to_idx_map 16 | 17 | version_name_to_idx_map["Baseline Copy"]=0 18 | version_name_to_idx_map["Baseline Copy Overlap"]=1 19 | version_name_to_idx_map["Baseline P2P"]=2 20 | #version_name_to_idx_map["Baseline Single Copy"]=3 21 | 22 | version_name_to_idx_map["Single Stream 1TB"]=4 23 | version_name_to_idx_map["Single Stream 2TB"]=5 24 | version_name_to_idx_map["Double Stream"]=6 25 | 26 | version_name_to_idx_map["Baseline Copy (No compute)"]=7 27 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8 28 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9 29 | 30 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10 31 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11 32 | version_name_to_idx_map["Double Stream (No Compute)"]=12 33 | 34 | declare -A version_name_to_idx_map_nvshmem 35 | 36 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0 37 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1 38 | 39 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2 40 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3 41 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4 42 | 43 | 
version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5 44 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6 45 | 46 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7 47 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8 48 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9 49 | 50 | BIN="./jacobi -s 1" 51 | NV_BIN="./jacobi_nvshmem -s 1" 52 | 53 | MAX_NX=${MAX_NX:-16384} 54 | MAX_NY=${MAX_NY:-16384} 55 | 56 | STARTING_NX=${STARTING_NX:-4096} 57 | STARTING_NY=${STARTING_NY:-4096} 58 | 59 | NUM_ITER=${NUM_ITER:-1000000} 60 | NUM_RUNS=${NUM_RUNS:-5} 61 | 62 | while [ $# -gt 0 ]; do 63 | 64 | if [[ $1 == *"--"* ]]; then 65 | param="${1/--/}" 66 | declare $param="$2" 67 | fi 68 | 69 | shift 70 | done 71 | 72 | for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do 73 | 74 | for version_name in "${!version_name_to_idx_map[@]}"; do 75 | echo "Running ${version_name}"; echo "" 76 | NX=${STARTING_NX} 77 | NY=${NX} 78 | 79 | version_idx=${version_name_to_idx_map[$version_name]} 80 | 81 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS*=2 )); do 82 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 83 | 84 | echo "Num GPUS: ${NUM_GPUS}" 85 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}" 86 | 87 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 88 | execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 89 | echo "${execution_time} on run ${i}" 90 | done 91 | 92 | printf "\n" 93 | 94 | NY=$((2*NY)) 95 | 96 | done 97 | 98 | echo "-------------------------------------" 99 | done 100 | 101 | for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do 102 | echo "Running ${version_name}"; echo "" 103 | NX=${STARTING_NX} 104 | NY=${NX} 105 | 106 | version_idx=${version_name_to_idx_map_nvshmem[$version_name]} 107 | 108 | for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP*=2 )); do 109 | 110 | echo "Num 
GPUS: ${NP}" 111 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}" 112 | 113 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 114 | execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER}) 115 | echo "${execution_time} on run ${i}" 116 | done 117 | 118 | printf "\n" 119 | 120 | NY=$((2*NY)) 121 | done 122 | 123 | echo "-------------------------------------" 124 | done 125 | 126 | echo "#####################################" 127 | done 128 | -------------------------------------------------------------------------------- /Stencil/jacobi3D/scripts/multi-node.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH -J multi-node-test 4 | #SBATCH --nodes 2 5 | #SBATCH --gres=gpu:2 6 | #SBATCH --ntasks=4 7 | #SBATCH --ntasks-per-node=2 8 | 9 | #SBATCH --cpus-per-task 16 10 | #SBATCH -A proj16 11 | #SBATCH -p palamut-cuda 12 | 13 | #SBATCH --time=1:00:00 14 | #SBATCH -o %x_%j.log 15 | 16 | . ./scripts/modules_truba.sh > /dev/null 17 | 18 | MAX_NUM_GPUS=8 19 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 20 | 21 | declare -A version_name_to_idx_map 22 | 23 | #version_name_to_idx_map["Baseline Copy"]=0 24 | #version_name_to_idx_map["Baseline Copy Overlap"]=1 25 | #version_name_to_idx_map["Baseline P2P"]=2 26 | #version_name_to_idx_map["Baseline Single Copy"]=3 27 | 28 | #version_name_to_idx_map["Single Stream 1TB"]=4 29 | #version_name_to_idx_map["Single Stream 2TB"]=5 30 | #version_name_to_idx_map["Double Stream"]=6 31 | 32 | #version_name_to_idx_map["Baseline Copy (No compute)"]=7 33 | #version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8 34 | #version_name_to_idx_map["Baseline P2P (No Compute)"]=9 35 | 36 | #version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10 37 | #version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11 38 | #version_name_to_idx_map["Double Stream (No 
Compute)"]=12 39 | 40 | declare -A version_name_to_idx_map_nvshmem 41 | 42 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0 43 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1 44 | 45 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2 46 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3 47 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4 48 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream All TB Partitoned"]=5 49 | 50 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=6 51 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=7 52 | 53 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=8 54 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=9 55 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=10 56 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream All TB Partitoned (No Compute)"]=11 57 | 58 | 59 | BIN="./jacobi -s 1" 60 | NV_BIN="./jacobi_nvshmem -s 1" 61 | 62 | 63 | NUM_ITER=${NUM_ITER:-10000} 64 | NUM_RUNS=${NUM_RUNS:-5} 65 | 66 | while [ $# -gt 0 ]; do 67 | 68 | if [[ $1 == *"--"* ]]; then 69 | param="${1/--/}" 70 | declare $param="$2" 71 | fi 72 | 73 | shift 74 | done 75 | 76 | 77 | for (( STARTING_NX=512; STARTING_NX<=512; STARTING_NX*=2 )); do 78 | 79 | NX=${STARTING_NX} 80 | NY=${NX} 81 | NZ=${NX} 82 | 83 | for version_name in "${!version_name_to_idx_map[@]}"; do 84 | echo "Running ${version_name}"; echo "" 85 | 86 | version_idx=${version_name_to_idx_map[$version_name]} 87 | 88 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 89 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 90 | 91 | echo "Num GPUS: ${NUM_GPUS}" 92 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}" 93 | 94 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 95 | execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz ${NZ} -niter ${NUM_ITER}) 96 
| echo "${execution_time} on run ${i}" 97 | done 98 | 99 | printf "\n" 100 | 101 | done 102 | 103 | echo "-------------------------------------" 104 | done 105 | 106 | 107 | for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do 108 | echo "Running ${version_name}"; echo "" 109 | 110 | version_idx=${version_name_to_idx_map_nvshmem[$version_name]} 111 | 112 | for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP+=1 )); do 113 | 114 | echo "Num GPUS: ${NP}" 115 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}" 116 | 117 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 118 | execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz ${NZ} -niter ${NUM_ITER}) 119 | echo "${execution_time} on run ${i}" 120 | done 121 | 122 | printf "\n" 123 | 124 | done 125 | 126 | echo "-------------------------------------" 127 | done 128 | 129 | echo "#####################################" 130 | done -------------------------------------------------------------------------------- /Stencil/jacobi3D/scripts/strong_scale_bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #SBATCH --job-name=stencil-bench 4 | #SBATCH --ntasks=8 5 | #SBATCH --gres=gpu:8 6 | #SBATCH --partition hgx2q 7 | #SBATCH --time=06:00:00 8 | #SBATCH --output=sbatch_output_%j.log 9 | 10 | . 
./scripts/modules.sh > /dev/null 11 | 12 | MAX_NUM_GPUS=8 13 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" ) 14 | 15 | declare -A version_name_to_idx_map 16 | 17 | #version_name_to_idx_map["Baseline Copy"]=0 18 | version_name_to_idx_map["Baseline Copy Overlap"]=1 19 | version_name_to_idx_map["Baseline P2P"]=2 20 | #version_name_to_idx_map["Baseline Single Copy"]=3 21 | 22 | #version_name_to_idx_map["Single Stream 1TB"]=4 23 | #version_name_to_idx_map["Single Stream 2TB"]=5 24 | #version_name_to_idx_map["Double Stream"]=6 25 | #version_name_to_idx_map["PERKS"]=7 26 | 27 | #version_name_to_idx_map["Baseline Copy (No compute)"]=8 28 | #version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=9 29 | #version_name_to_idx_map["Baseline P2P (No Compute)"]=10 30 | 31 | #version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10 32 | #version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11 33 | #version_name_to_idx_map["Double Stream (No Compute)"]=12 34 | 35 | declare -A version_name_to_idx_map_nvshmem 36 | 37 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0 38 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1 39 | 40 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2 41 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3 42 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4 43 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream Partitoned"]=5 44 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream Partitoned"]=6 45 | 46 | version_name_to_idx_map_nvshmem["NVSHMEM PERKS"]=13 47 | 48 | #version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=7 49 | #version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=8 50 | 51 | #version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=9 52 | #version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=10 53 | 
#version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=11 54 | #version_name_to_idx_map_nvshmem["NVSHMEM Single Stream Partitoned (No Compute)"]=12 55 | 56 | BIN="./jacobi -s 1" 57 | NV_BIN="./jacobi_nvshmem -s 1" 58 | 59 | NUM_ITER=${NUM_ITER:-100000} 60 | NUM_RUNS=${NUM_RUNS:-5} 61 | 62 | while [ $# -gt 0 ]; do 63 | 64 | if [[ $1 == *"--"* ]]; then 65 | param="${1/--/}" 66 | declare $param="$2" 67 | fi 68 | 69 | shift 70 | done 71 | 72 | 73 | for (( STARTING_NX=512; STARTING_NX<=512; STARTING_NX*=2 )); do 74 | 75 | NX=${STARTING_NX} 76 | NY=${NX} 77 | NZ=${NX} 78 | 79 | for version_name in "${!version_name_to_idx_map[@]}"; do 80 | echo "Running ${version_name}"; echo "" 81 | version_idx=${version_name_to_idx_map[$version_name]} 82 | 83 | for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do 84 | export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]} 85 | 86 | echo "Num GPUS: ${NUM_GPUS}" 87 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}" 88 | 89 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 90 | execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz ${NZ} -niter ${NUM_ITER}) 91 | echo "${execution_time} on run ${i}" 92 | done 93 | 94 | printf "\n" 95 | 96 | done 97 | 98 | echo "-------------------------------------" 99 | done 100 | 101 | 102 | for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do 103 | echo "Running ${version_name}"; echo "" 104 | version_idx=${version_name_to_idx_map_nvshmem[$version_name]} 105 | 106 | for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP+=1 )); do 107 | 108 | echo "Num GPUS: ${NP}" 109 | echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}" 110 | 111 | for (( i=1; i <= ${NUM_RUNS}; i++ )); do 112 | execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz ${NZ} -niter ${NUM_ITER}) 113 | echo "${execution_time} on run ${i}" 114 | done 115 | 116 | printf "\n" 117 | 118 | done 119 | 120 | echo "-------------------------------------" 121 | 
done 122 | 123 | echo "#####################################" 124 | 125 | done -------------------------------------------------------------------------------- /CG/scripts/calculate_speedup.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pandas as pd 4 | from os.path import dirname, realpath 5 | import argparse 6 | 7 | MATRIX_NAMES = [ 8 | # 'tridiagonal', 9 | 'ecology2', 10 | # 'shallow_water2', Too few non-zeros 11 | # 'Trefethen_2000', Too few non-zeros 12 | 'hood', 13 | 'bmwcra_1', 14 | 'consph', 15 | 'thermomech_dM', 16 | 'tmt_sym', 17 | 'crankseg_1', 18 | 'crankseg_2', 19 | 'Queen_4147', 20 | 'Bump_2911', 21 | 'G3_circuit', 22 | 'StocF-1465', 23 | 'Flan_1565', 24 | 'audikw_1', 25 | 'Serena', 26 | 'Geo_1438', 27 | 'Hook_1498', 28 | # 'bone010', Multi-part matrix, don't handle those for now 29 | 'ldoor' 30 | ] 31 | 32 | VERSIONS_TO_KEEP = [ 33 | 'CPU-Controlled Standard CG (Baseline)', 34 | 'CPU-Controlled Pipelined CG (Baseline)', 35 | 'CPU-Free Standard CG (Ours)', 36 | 'CPU-Free Pipelined CG (Ours)' 37 | ] 38 | 39 | dir_path = dirname(realpath(__file__)) 40 | 41 | # First file should be the full CSV file 42 | # Second should be the single-GPU runtimes CSV file 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument('files', type=argparse.FileType('r'), nargs='+') 45 | files = parser.parse_args().files 46 | 47 | full_runtimes_csv = files[0] 48 | single_gpu_runtimes_csv = files[1] 49 | 50 | if __name__ == "__main__": 51 | dir_path = dirname(realpath(__file__)) 52 | 53 | SAVE_RESULT_TO_DIR_PATH = dir_path + '/../results' 54 | 55 | # Skip first line 56 | full_runtimes_csv.readline() 57 | 58 | data = pd.read_csv(full_runtimes_csv, index_col='Matrix') 59 | data = data.sort_index() 60 | 61 | single_gpu_baseline_standard_runtimes = pd.read_csv( 62 | single_gpu_runtimes_csv, index_col='Matrix')['Runtime'] 63 | single_gpu_baseline_standard_runtimes = 
single_gpu_baseline_standard_runtimes.sort_index() 64 | 65 | matrix_names = np.unique( 66 | [matrix_name for matrix_name, _ in data.iterrows()]) 67 | 68 | gpu_num_column_labels = [column_label 69 | for column_label in data.columns if 'GPU' in column_label] 70 | 71 | for matrix_name in matrix_names: 72 | if matrix_name not in MATRIX_NAMES: 73 | data.drop(matrix_name, inplace=True) 74 | single_gpu_baseline_standard_runtimes.drop( 75 | matrix_name, inplace=True) 76 | 77 | for gpu_num_column_label in gpu_num_column_labels: 78 | per_gpu_num_data = data[['Version', gpu_num_column_label]] 79 | per_gpu_num_data = per_gpu_num_data.pivot_table( 80 | gpu_num_column_label, 'Matrix', 'Version') 81 | 82 | per_gpu_num_data = pd.DataFrame( 83 | per_gpu_num_data, columns=VERSIONS_TO_KEEP) 84 | 85 | per_gpu_num_speedup = 1 / per_gpu_num_data.div( 86 | single_gpu_baseline_standard_runtimes, axis=0) 87 | 88 | pipelined_cg_speedup = per_gpu_num_speedup['CPU-Free Pipelined CG (Ours)'] / \ 89 | per_gpu_num_speedup['CPU-Controlled Pipelined CG (Baseline)'] 90 | standard_cg_speedup = per_gpu_num_speedup['CPU-Free Standard CG (Ours)'] / \ 91 | per_gpu_num_speedup['CPU-Controlled Standard CG (Baseline)'] 92 | 93 | pipelined_cg_geo_mean_spedup = np.exp( 94 | np.log(pipelined_cg_speedup).mean()) 95 | standard_cg_geo_mean_spedup = np.exp( 96 | np.log(standard_cg_speedup).mean()) 97 | 98 | pipelined_speedup_file_path = SAVE_RESULT_TO_DIR_PATH + \ 99 | '/pipelined_speedup/pipelined_cg_speedup_' + gpu_num_column_label + '.txt' 100 | standard_speedup_file_path = SAVE_RESULT_TO_DIR_PATH + \ 101 | '/standard_speedup/pipelined_cg_speedup_' + gpu_num_column_label + '.txt' 102 | 103 | with open(pipelined_speedup_file_path, 'w') as pipelined_speedup_file: 104 | pipelined_cg_speedup.to_string( 105 | pipelined_speedup_file, header=False) 106 | pipelined_speedup_file.write('\n') 107 | pipelined_speedup_file.write( 108 | f'Persistent vs Discrete Pipelined CG geo mean speedup on {gpu_num_column_label}: 
{pipelined_cg_geo_mean_spedup}') 109 | 110 | with open(standard_speedup_file_path, 'w') as standard_speedup_file: 111 | standard_cg_speedup.to_string( 112 | standard_speedup_file, header=False) 113 | standard_speedup_file.write('\n') 114 | standard_speedup_file.write( 115 | f'Persistent vs Discrete Standard CG geo mean speedup on {gpu_num_column_label}: {standard_cg_geo_mean_spedup}') 116 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | BasedOnStyle: Google 4 | AccessModifierOffset: -1 5 | AlignAfterOpenBracket: Align 6 | AlignConsecutiveAssignments: false 7 | AlignConsecutiveDeclarations: false 8 | AlignEscapedNewlines: Left 9 | AlignOperands: true 10 | AlignTrailingComments: true 11 | AllowAllParametersOfDeclarationOnNextLine: true 12 | AllowShortBlocksOnASingleLine: false 13 | AllowShortCaseLabelsOnASingleLine: false 14 | AllowShortFunctionsOnASingleLine: All 15 | AllowShortIfStatementsOnASingleLine: true 16 | AllowShortLoopsOnASingleLine: true 17 | AlwaysBreakAfterDefinitionReturnType: None 18 | AlwaysBreakAfterReturnType: None 19 | AlwaysBreakBeforeMultilineStrings: true 20 | AlwaysBreakTemplateDeclarations: Yes 21 | BinPackArguments: true 22 | BinPackParameters: true 23 | BraceWrapping: 24 | AfterClass: false 25 | AfterControlStatement: false 26 | AfterEnum: false 27 | AfterFunction: false 28 | AfterNamespace: false 29 | AfterObjCDeclaration: false 30 | AfterStruct: false 31 | AfterUnion: false 32 | AfterExternBlock: false 33 | BeforeCatch: false 34 | BeforeElse: false 35 | IndentBraces: false 36 | SplitEmptyFunction: true 37 | SplitEmptyRecord: true 38 | SplitEmptyNamespace: true 39 | BreakBeforeBinaryOperators: None 40 | BreakBeforeBraces: Attach 41 | BreakBeforeInheritanceComma: false 42 | BreakInheritanceList: BeforeColon 43 | BreakBeforeTernaryOperators: true 44 | 
BreakConstructorInitializersBeforeComma: false 45 | BreakConstructorInitializers: BeforeColon 46 | BreakAfterJavaFieldAnnotations: false 47 | BreakStringLiterals: true 48 | ColumnLimit: 100 49 | CommentPragmas: '^ IWYU pragma:' 50 | CompactNamespaces: false 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true 52 | ConstructorInitializerIndentWidth: 4 53 | ContinuationIndentWidth: 4 54 | Cpp11BracedListStyle: true 55 | DerivePointerAlignment: true 56 | DisableFormat: false 57 | ExperimentalAutoDetectBinPacking: false 58 | FixNamespaceComments: true 59 | ForEachMacros: 60 | - foreach 61 | - Q_FOREACH 62 | - BOOST_FOREACH 63 | IncludeBlocks: Preserve 64 | IncludeCategories: 65 | - Regex: '^<ext/.*\.h>' 66 | Priority: 2 67 | - Regex: '^<.*\.h>' 68 | Priority: 1 69 | - Regex: '^<.*' 70 | Priority: 2 71 | - Regex: '.*' 72 | Priority: 3 73 | IncludeIsMainRegex: '([-_](test|unittest))?$' 74 | IndentCaseLabels: true 75 | IndentPPDirectives: None 76 | IndentWidth: 4 77 | IndentWrappedFunctionNames: false 78 | JavaScriptQuotes: Leave 79 | JavaScriptWrapImports: true 80 | KeepEmptyLinesAtTheStartOfBlocks: false 81 | MacroBlockBegin: '' 82 | MacroBlockEnd: '' 83 | MaxEmptyLinesToKeep: 1 84 | NamespaceIndentation: None 85 | ObjCBinPackProtocolList: Never 86 | ObjCBlockIndentWidth: 4 87 | ObjCSpaceAfterProperty: false 88 | ObjCSpaceBeforeProtocolList: true 89 | PenaltyBreakAssignment: 2 90 | PenaltyBreakBeforeFirstCallParameter: 1 91 | PenaltyBreakComment: 300 92 | PenaltyBreakFirstLessLess: 120 93 | PenaltyBreakString: 1000 94 | PenaltyBreakTemplateDeclaration: 10 95 | PenaltyExcessCharacter: 1000000 96 | PenaltyReturnTypeOnItsOwnLine: 200 97 | PointerAlignment: Left 98 | RawStringFormats: 99 | - Language: Cpp 100 | Delimiters: 101 | - cc 102 | - CC 103 | - cpp 104 | - Cpp 105 | - CPP 106 | - 'c++' 107 | - 'C++' 108 | CanonicalDelimiter: '' 109 | BasedOnStyle: Google 110 | - Language: TextProto 111 | Delimiters: 112 | - pb 113 | - PB 114 | - proto 115 | - PROTO 116 | 
EnclosingFunctions: 117 | - EqualsProto 118 | - EquivToProto 119 | - PARSE_PARTIAL_TEXT_PROTO 120 | - PARSE_TEST_PROTO 121 | - PARSE_TEXT_PROTO 122 | - ParseTextOrDie 123 | - ParseTextProtoOrDie 124 | CanonicalDelimiter: '' 125 | BasedOnStyle: Google 126 | ReflowComments: true 127 | SortIncludes: true 128 | SortUsingDeclarations: true 129 | SpaceAfterCStyleCast: false 130 | SpaceAfterTemplateKeyword: true 131 | SpaceBeforeAssignmentOperators: true 132 | SpaceBeforeCpp11BracedList: false 133 | SpaceBeforeCtorInitializerColon: true 134 | SpaceBeforeInheritanceColon: true 135 | SpaceBeforeParens: ControlStatements 136 | SpaceBeforeRangeBasedForLoopColon: true 137 | SpaceInEmptyParentheses: false 138 | SpacesBeforeTrailingComments: 2 139 | SpacesInAngles: false 140 | SpacesInContainerLiterals: true 141 | SpacesInCStyleCastParentheses: false 142 | SpacesInParentheses: false 143 | SpacesInSquareBrackets: false 144 | Standard: Auto 145 | TabWidth: 8 146 | UseTab: Never --------------------------------------------------------------------------------