├── CG
    ├── img
    │   ├── .gitkeep
    │   ├── Strong Scaling
    │   │   ├── cg_runtime_8A100.pdf
    │   │   └── cg_runtime_8A100.png
    │   ├── Constant Number of GPUs
    │   │   ├── matrix_speedup_table_1 GPU.pdf
    │   │   ├── matrix_speedup_table_2 GPUs.pdf
    │   │   ├── matrix_speedup_table_3 GPUs.pdf
    │   │   ├── matrix_speedup_table_4 GPUs.pdf
    │   │   ├── matrix_speedup_table_5 GPUs.pdf
    │   │   ├── matrix_speedup_table_6 GPUs.pdf
    │   │   ├── matrix_speedup_table_7 GPUs.pdf
    │   │   └── matrix_speedup_table_8 GPUs.pdf
    │   └── Operation Breakdown
    │   │   └── Discrete Pipelined Operation Breakdown.png
    ├── results
    │   ├── .gitkeep
    │   ├── cg_runtime_single_gpu-A100.csv
    │   ├── standard_speedup
    │   │   ├── pipelined_cg_speedup_1 GPU.txt
    │   │   ├── pipelined_cg_speedup_2 GPUs.txt
    │   │   ├── pipelined_cg_speedup_3 GPUs.txt
    │   │   ├── pipelined_cg_speedup_6 GPUs.txt
    │   │   ├── pipelined_cg_speedup_8 GPUs.txt
    │   │   ├── pipelined_cg_speedup_4 GPUs.txt
    │   │   ├── pipelined_cg_speedup_5 GPUs.txt
    │   │   └── pipelined_cg_speedup_7 GPUs.txt
    │   ├── pipelined_speedup
    │   │   ├── pipelined_cg_speedup_1 GPU.txt
    │   │   ├── pipelined_cg_speedup_2 GPUs.txt
    │   │   ├── pipelined_cg_speedup_3 GPUs.txt
    │   │   ├── pipelined_cg_speedup_4 GPUs.txt
    │   │   ├── pipelined_cg_speedup_5 GPUs.txt
    │   │   ├── pipelined_cg_speedup_6 GPUs.txt
    │   │   ├── pipelined_cg_speedup_7 GPUs.txt
    │   │   └── pipelined_cg_speedup_8 GPUs.txt
    │   ├── cg_operation_breakdown_8A100_discrete_pipelined.csv
    │   ├── cg_operation_breakdown_8A100_discrete_standard.csv
    │   ├── cg_operation_breakdown_4A100.txt
    │   └── cg_operation_breakdown_8A100.txt
    ├── nsys_reports
    │   └── .gitkeep
    ├── scripts
    │   ├── .gitignore
    │   ├── requirements.txt
    │   ├── calculate_nnz_num_rows_ratio.py
    │   ├── plots
    │   │   ├── plot_operation_breakdown.py
    │   │   └── common.py
    │   ├── download_matrices.py
    │   └── calculate_speedup.py
    ├── batch
    │   ├── Karolina
    │   │   ├── _load_karolina_modules.sh
    │   │   ├── _measure_single_gpu_runtime.sh
    │   │   └── _measure_total_runtime.sh
    │   ├── Simula
    │   │   ├── _load_simula_modules.sh
    │   │   ├── _measure_operation_breakdown.sh
    │   │   └── _measure_total_runtime.sh
    │   ├── A100-machine
    │   │   └── _load_A100-machine_modules.sh
    │   └── Truba
    │   │   ├── _load_truba_modules.sh
    │   │   ├── _measure_operation_breakdown.sh
    │   │   └── _measure_total_runtime.sh
    ├── include
    │   ├── single-stream
    │   │   ├── standard.cuh
    │   │   ├── pipelined.cuh
    │   │   ├── pipelined-gather.cuh
    │   │   ├── standard-saxpy-overlap.cuh
    │   │   └── pipelined-multi-overlap.cuh
    │   ├── baseline
    │   │   ├── discrete-standard.cuh
    │   │   └── discrete-pipelined.cuh
    │   └── profiling
    │   │   ├── discrete-standard.cuh
    │   │   └── discrete-pipelined.cuh
    ├── CMakeLists.txt
    ├── Makefile2
    └── src
    │   └── single-gpu
    │       └── discrete-standard.cu
├── Plots
    ├── Images
    │   ├── .gitkeep
    │   ├── 8 GPUs (2048x4096).png
    │   ├── 2D_Weak_Scaling_256x256.png
    │   ├── 3D_Weak_Scaling_256x256x256.png
    │   ├── matrix_speedup_table_8 GPUs.png
    │   └── 2D_Weak_Scaling_No_Compute_16384x16384__.png
    ├── data
    │   ├── .gitkeep
    │   ├── comp.csv
    │   ├── no-comp.csv
    │   ├── fig5
    │   │   ├── 2D_Weak_Scaling_8192x4096.csv
    │   │   ├── 2D_Weak_Scaling_2048x1024.csv
    │   │   └── 2D_Weak_Scaling_256x256.csv
    │   ├── fig6
    │   │   ├── 3D_Weak_Scaling_256x256x256.csv
    │   │   ├── 3D_Strong_Scaling_No_Compute_512x512x512.csv
    │   │   └── 3D_Strong_Scaling_256x256x256.csv
    │   ├── 2d-weak-scaling-small.csv
    │   ├── 2d-weak-scaling-medium.csv
    │   ├── 2d-weak-scaling-large.csv
    │   └── 2d-comp.csv
    ├── .gitignore
    ├── requirements.txt
    ├── README.md
    ├── common.py
    ├── scaling-bar.py
    ├── comp-vs-comm.py
    ├── weak-scaling.py
    └── weak-scaling-2.py
├── Stencil
    ├── jacobi2D
    │   ├── scripts
    │   │   ├── .gitignore
    │   │   ├── requirements.txt
    │   │   ├── bench.sh
    │   │   ├── constant_num_gpus_bench.sh
    │   │   ├── plot.py
    │   │   ├── strong_scale_bench_truba.sh
    │   │   ├── strong_scale_bench.sh
    │   │   ├── weak_scale_comm_bench_truba.sh
    │   │   ├── weak_scale_comp_bench_truba.sh
    │   │   └── weak_scale_bench.sh
    │   ├── PERKS
    │   │   ├── genconfig.cuh
    │   │   ├── common
    │   │   │   ├── jacobi_reference.hpp
    │   │   │   ├── jacobi_cuda.cuh
    │   │   │   └── types.hpp
    │   │   ├── config.cuh
    │   │   └── jacobi-general-wrapper.cu
    │   ├── include
    │   │   ├── PERKS
    │   │   │   └── multi-stream-perks.cuh
    │   │   ├── multi-stream
    │   │   │   └── multi-gpu-peer-tiling.cuh
    │   │   ├── baseline
    │   │   │   ├── multi-threaded-p2p.cuh
    │   │   │   ├── multi-threaded-copy.cuh
    │   │   │   ├── single-threaded-copy.cuh
    │   │   │   └── multi-threaded-copy-overlap.cuh
    │   │   ├── single-stream
    │   │   │   ├── multi-threaded-one-block-comm.cuh
    │   │   │   ├── multi-threaded-two-block-comm.cuh
    │   │   │   └── multi-threaded-one-block-comm-layer.cuh
    │   │   └── no-compute
    │   │   │   ├── multi-gpu-peer-tiling-no-compute.cuh
    │   │   │   ├── multi-threaded-p2p-no-compute.cuh
    │   │   │   ├── multi-threaded-copy-no-compute.cuh
    │   │   │   ├── multi-threaded-two-block-comm-no-compute.cuh
    │   │   │   ├── multi-threaded-one-block-comm-no-compute.cuh
    │   │   │   ├── multi-threaded-copy-overlap-no-compute.cuh
    │   │   │   └── multi-threaded-one-block-comm-layer-no-compute.cuh
    │   ├── include_nvshmem
    │   │   ├── PERKS
    │   │   │   └── multi-stream-perks.cuh
    │   │   ├── multi-stream
    │   │   │   ├── multi-gpu-peer-tiling.cuh
    │   │   │   └── multi-gpu-multi-block-tiling.cuh
    │   │   ├── baseline
    │   │   │   ├── multi-threaded-nvshmem.cuh
    │   │   │   └── multi-threaded-nvshmem-opt.cuh
    │   │   ├── no-compute
    │   │   │   ├── design-1-multi-block-no-compute.cuh
    │   │   │   ├── multi-gpu-peer-tiling-no-compute.cuh
    │   │   │   ├── multi-threaded-nvshmem-no-compute.cuh
    │   │   │   ├── multi-threaded-nvshmem-opt-no-compute.cuh
    │   │   │   ├── multi-threaded-one-block-comm-no-compute.cuh
    │   │   │   └── multi-threaded-two-block-comm-no-compute.cuh
    │   │   └── single-stream
    │   │   │   ├── multi-threaded-two-block-comm.cuh
    │   │   │   ├── multi-threaded-one-block-comm.cuh
    │   │   │   └── multi-threaded-multi-block-comm.cuh
    │   ├── CMakeLists.txt
    │   ├── src
    │   │   └── main.cu
    │   └── src_nvshmem
    │   │   └── main.cu
    ├── jacobi3D
    │   ├── scripts
    │   │   ├── .gitignore
    │   │   ├── requirements.txt
    │   │   ├── run-bench.sh
    │   │   ├── bench.sh
    │   │   ├── constant_num_gpus_bench.sh
    │   │   ├── plot.py
    │   │   ├── multi-node.sh
    │   │   └── strong_scale_bench.sh
    │   ├── src
    │   │   ├── PERKS
    │   │   │   ├── genconfig.cuh
    │   │   │   ├── common
    │   │   │   │   ├── jacobi_cuda.cuh
    │   │   │   │   ├── jacobi_reference.hpp
    │   │   │   │   └── types.hpp
    │   │   │   └── config.cuh
    │   │   └── main.cu
    │   ├── src_nvshmem
    │   │   ├── PERKS-nvshmem
    │   │   │   ├── genconfig.cuh
    │   │   │   ├── common
    │   │   │   │   ├── jacobi_cuda.cuh
    │   │   │   │   ├── jacobi_reference.hpp
    │   │   │   │   └── types.hpp
    │   │   │   └── config.cuh
    │   │   └── main.cu
    │   ├── include
    │   │   ├── PERKS
    │   │   │   └── multi-stream-perks.cuh
    │   │   ├── PERKS-nvshmem
    │   │   │   └── multi-stream-perks-nvshmem.h
    │   │   ├── multi-stream
    │   │   │   └── multi-gpu-peer-tiling.cuh
    │   │   ├── baseline
    │   │   │   ├── multi-threaded-p2p.cuh
    │   │   │   ├── multi-threaded-copy.cuh
    │   │   │   ├── single-threaded-copy.cuh
    │   │   │   └── multi-threaded-copy-overlap.cuh
    │   │   ├── single-stream
    │   │   │   ├── multi-threaded-one-block-comm.cuh
    │   │   │   ├── multi-threaded-two-block-comm.cuh
    │   │   │   └── multi-threaded-multi-block-comm.cuh
    │   │   └── no-compute
    │   │   │   ├── multi-gpu-peer-tiling-no-compute.cuh
    │   │   │   ├── multi-threaded-p2p-no-compute.cuh
    │   │   │   ├── multi-threaded-copy-no-compute.cuh
    │   │   │   ├── multi-threaded-two-block-comm-no-compute.cuh
    │   │   │   ├── multi-threaded-one-block-comm-no-compute.cuh
    │   │   │   ├── multi-threaded-multi-block-comm-no-compute.cuh
    │   │   │   └── multi-threaded-copy-overlap-no-compute.cuh
    │   ├── include_nvshmem
    │   │   ├── PERKS-nvshmem
    │   │   │   ├── multi-stream-perks-nvshmem.h
    │   │   │   └── multi-stream-perks-nvshmem-block.h
    │   │   ├── multi-stream
    │   │   │   ├── multi-gpu-peer-tiling.cuh
    │   │   │   └── multi-gpu-multi-block-tiling.cuh
    │   │   ├── baseline
    │   │   │   ├── multi-threaded-nvshmem.cuh
    │   │   │   └── multi-threaded-nvshmem-opt.cuh
    │   │   ├── single-stream
    │   │   │   ├── multi-threaded-one-block-comm.cuh
    │   │   │   ├── multi-threaded-two-block-comm.cuh
    │   │   │   └── multi-threaded-multi-block-comm.cuh
    │   │   └── no-compute
    │   │   │   ├── multi-gpu-peer-tiling-no-compute.cuh
    │   │   │   ├── multi-threaded-nvshmem-no-compute.cuh
    │   │   │   ├── multi-threaded-nvshmem-opt-no-compute.cuh
    │   │   │   ├── multi-threaded-one-block-comm-no-compute.cuh
    │   │   │   ├── multi-threaded-two-block-comm-no-compute.cuh
    │   │   │   └── multi-threaded-multi-block-comm-no-compute.cuh
    │   └── CMakeLists.txt
    ├── CMakeLists.txt
    └── Makefile2
├── .gitignore
├── Makefile2
├── CMakeLists.txt
├── LICENSE
├── common.mk
├── Scripts
    └── full_bench.py
└── .clang-format


/CG/img/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/CG/results/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Plots/Images/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Plots/data/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/CG/nsys_reports/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Plots/.gitignore:
--------------------------------------------------------------------------------
1 | Images/
2 | data/
3 | 


--------------------------------------------------------------------------------
/CG/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | __pycache__


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | venv


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/.gitignore:
--------------------------------------------------------------------------------
1 | venv


--------------------------------------------------------------------------------
/Plots/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | matplotlib
4 | 


--------------------------------------------------------------------------------
/CG/scripts/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | pandas
3 | matplotlib
4 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/PERKS/genconfig.cuh:
--------------------------------------------------------------------------------
1 | 
2 | // NOTE(review): the commented-out define below is identical to the active one and could be dropped.
3 | // #define REG_FOLDER_Y (0)
4 | #define REG_FOLDER_Y (0)


--------------------------------------------------------------------------------
/Plots/Images/8 GPUs (2048x4096).png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/8 GPUs (2048x4096).png


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src/PERKS/genconfig.cuh:
--------------------------------------------------------------------------------
1 | // #define REG_FOLDER_Z (8)
2 | // #define REG_FOLDER_Z (12)
3 | #define REG_FOLDER_Z (0)  // active setting; the (8) and (12) variants above are commented out


--------------------------------------------------------------------------------
/Plots/Images/2D_Weak_Scaling_256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/2D_Weak_Scaling_256x256.png


--------------------------------------------------------------------------------
/CG/img/Strong Scaling/cg_runtime_8A100.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Strong Scaling/cg_runtime_8A100.pdf


--------------------------------------------------------------------------------
/CG/img/Strong Scaling/cg_runtime_8A100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Strong Scaling/cg_runtime_8A100.png


--------------------------------------------------------------------------------
/Stencil/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(jacobi2D)
2 | add_subdirectory(jacobi3D)
3 | # NOTE(review): arguments after the target name are parsed by add_custom_target() as a command line (`jacobi2D jacobi2D_nvshmem`), not as dependencies, and nothing from jacobi3D is referenced -- confirm intent; add_dependencies() is the usual way to form an aggregate target.
4 | add_custom_target(jacobi jacobi2D jacobi2D_nvshmem)
5 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/genconfig.cuh:
--------------------------------------------------------------------------------
1 | // #define REG_FOLDER_Z (8)
2 | // #define REG_FOLDER_Z (12)
3 | #define REG_FOLDER_Z (0)  // active setting; the (8) and (12) variants above are commented out


--------------------------------------------------------------------------------
/Plots/Images/3D_Weak_Scaling_256x256x256.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/3D_Weak_Scaling_256x256x256.png


--------------------------------------------------------------------------------
/Plots/Images/matrix_speedup_table_8 GPUs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/matrix_speedup_table_8 GPUs.png


--------------------------------------------------------------------------------
/CG/batch/Karolina/_load_karolina_modules.sh:
--------------------------------------------------------------------------------
1 | ml NVSHMEM/2.9.0-gompi-2022a-CUDA-11.7.0  # presumably Lmod's `ml` shorthand for `module load` -- confirm on the target cluster
2 | ml Python/3.10.4-GCCcore-11.3.0
3 | export NVSHMEM_IB_ENABLE_IBGDA=true  # exported so processes started from this shell inherit it; see the NVSHMEM environment-variable docs for IBGDA


--------------------------------------------------------------------------------
/Plots/README.md:
--------------------------------------------------------------------------------
 1 | ## Figure 5:
 2 | ```bash
 3 | ./weak-scaling.py fig5/*.csv
 4 | ```
 5 | 
 6 | ## Figure 6:
 7 | ```bash
 8 | ./weak-scaling-2.py fig6/*.csv
 9 | ```
10 | 


--------------------------------------------------------------------------------
/Plots/Images/2D_Weak_Scaling_No_Compute_16384x16384__.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/Plots/Images/2D_Weak_Scaling_No_Compute_16384x16384__.png


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_1 GPU.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_1 GPU.pdf


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_2 GPUs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_2 GPUs.pdf


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_3 GPUs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_3 GPUs.pdf


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_4 GPUs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_4 GPUs.pdf


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_5 GPUs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_5 GPUs.pdf


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_6 GPUs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_6 GPUs.pdf


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_7 GPUs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_7 GPUs.pdf


--------------------------------------------------------------------------------
/CG/img/Constant Number of GPUs/matrix_speedup_table_8 GPUs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Constant Number of GPUs/matrix_speedup_table_8 GPUs.pdf


--------------------------------------------------------------------------------
/CG/img/Operation Breakdown/Discrete Pipelined Operation Breakdown.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ParCoreLab/CPU-Free-model/HEAD/CG/img/Operation Breakdown/Discrete Pipelined Operation Breakdown.png


--------------------------------------------------------------------------------
/Plots/data/comp.csv:
--------------------------------------------------------------------------------
1 | Version,1 GPU (256x256x256),2 GPUs (256x256x512),4 GPUs (256x256x1024),8 GPUs (256x256x2048)
2 | Baseline Overlap,12.4439,12.1929,12.2302,12.3219
3 | Baseline P2P,11.9452,13.3371,13.5145,13.6295
4 | Ours,15.1575,15.2379,15.2893,15.2893


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/requirements.txt:
--------------------------------------------------------------------------------
 1 | cycler==0.11.0
 2 | fonttools==4.29.1
 3 | kiwisolver==1.3.2
 4 | numpy==1.22.2
 5 | packaging==21.3
 6 | pandas==1.4.1
 7 | Pillow==9.0.1
 8 | pyparsing==3.0.7
 9 | python-dateutil==2.8.2
10 | pytz==2021.3
11 | six==1.16.0
12 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/requirements.txt:
--------------------------------------------------------------------------------
 1 | cycler==0.11.0
 2 | fonttools==4.29.1
 3 | kiwisolver==1.3.2
 4 | numpy==1.22.2
 5 | packaging==21.3
 6 | pandas==1.4.1
 7 | Pillow==9.0.1
 8 | pyparsing==3.0.7
 9 | python-dateutil==2.8.2
10 | pytz==2021.3
11 | six==1.16.0
12 | 


--------------------------------------------------------------------------------
/Plots/data/no-comp.csv:
--------------------------------------------------------------------------------
1 | Version,1 GPU (256x256x256),2 GPUs (256x256x512),4 GPUs (256x256x1024),8 GPUs (256x256x2048)
2 | Baseline Copy Overlap (No Compute),2.5735,6.952,7.8268,10.0821
3 | Baseline P2P (No Compute),3.9442,5.209,5.3338,5.9174
4 | NVSHMEM Double Stream (No Compute),1.7782,1.9406,1.9862,2.0168
5 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/PERKS/multi-stream-perks.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_STREAM_PERKS_CUH
 2 | #define INC_2D_STENCIL_MULTI_STREAM_PERKS_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiStreamPERKS {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiStreamPERKS
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_STREAM_PERKS_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/PERKS/multi-stream-perks.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_PERKS_H
 2 | #define INC_3D_STENCIL_MULTI_THREADED_PERKS_H
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiStreamPERKS {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiStreamPERKS
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_THREADED_PERKS_H
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | .idea/
 3 | venv/
 4 | __pycache__/
 5 | 
 6 | CMakeFiles/
 7 | CMakeCache.txt
 8 | Makefile
 9 | 
10 | *.cmake
11 | *.o
12 | *.out
13 | 
14 | bin/
15 | obj/
16 | build/
17 | 
18 | cg
19 | jacobi
20 | jacobi_nvshmem
21 | */obj/*
22 | !obj/.gitkeep
23 | */obj_nvshmem/
24 | # NOTE(review): `!obj/.gitkeep` above cannot take effect -- the earlier `obj/` rule ignores the directory itself, and git does not re-include files whose parent directory is excluded.
25 | # Benchmarking artifacts
26 | *.log
27 | 


--------------------------------------------------------------------------------
/Makefile2:
--------------------------------------------------------------------------------
 1 | include common.mk
 2 | 
 3 | # NOTE(review): the repository tree lists Stencil/Makefile2 and CG/Makefile2, not
 4 | # Stencil/Makefile and CG/Makefile -- confirm these include paths still resolve.
 5 | include Stencil/Makefile
 6 | include CG/Makefile
 7 | 
 8 | all: stencil cg
 9 | 
10 | # Every regular file below the repo root with a C/C++/CUDA extension. The name
11 | # tests are grouped so that `-type f` applies to all of them, not only to '*.cu'.
12 | SOURCES := $(shell find . -type f \( -name '*.cu' -o -name '*.c' -o -name '*.cuh' -o -name '*.h' -o -name '*.cpp' \))
13 | 
14 | # `format` names an action, not a file, so it has to be declared phony;
15 | # otherwise a file called `format` would make the rule look up to date.
16 | .PHONY: format
17 | # Rewrite every source file in place according to the repo's .clang-format.
18 | format: $(SOURCES)
19 | 	clang-format --style=file:.clang-format -i $^
20 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/multi-stream/multi-gpu-peer-tiling.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_PEER_TILING_CUH
 2 | #define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTiling {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiGPUPeerTiling
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/PERKS-nvshmem/multi-stream-perks-nvshmem.h:
--------------------------------------------------------------------------------
 1 | #ifndef JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H
 2 | #define JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiStreamPERKSNvshmem {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiStreamPERKSNvshmem
 9 | 
10 | #endif  // JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/multi-stream/multi-gpu-peer-tiling.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_CUH
 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTiling {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiGPUPeerTiling
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem.h:
--------------------------------------------------------------------------------
 1 | #ifndef JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H
 2 | #define JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiStreamPERKSNvshmem {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiStreamPERKSNvshmem
 9 | 
10 | #endif  // JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_H
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/baseline/multi-threaded-p2p.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedP2P {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace BaselineMultiThreadedP2P
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/baseline/multi-threaded-p2p.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedP2P {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace BaselineMultiThreadedP2P
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/baseline/multi-threaded-copy.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopy {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace BaselineMultiThreadedCopy
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/PERKS/multi-stream-perks.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_STREAM_PERKS_NVSHMEM_CUH
 2 | #define INC_2D_STENCIL_MULTI_STREAM_PERKS_NVSHMEM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiStreamPERKSNVSHMEM {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiStreamPERKSNVSHMEM
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_STREAM_PERKS_NVSHMEM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/baseline/multi-threaded-copy.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopy {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace BaselineMultiThreadedCopy
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/baseline/single-threaded-copy.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH
 2 | #define INC_2D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineSingleThreadedCopy {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace BaselineSingleThreadedCopy
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/baseline/single-threaded-copy.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH
 2 | #define INC_3D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineSingleThreadedCopy {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace BaselineSingleThreadedCopy
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_SINGLE_THREADED_COPY_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem-block.h:
--------------------------------------------------------------------------------
 1 | #ifndef JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_BLOCK_H
 2 | #define JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_BLOCK_H
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiStreamPERKSNvshmemBlock {
 7 | int init(int argc, char **argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiStreamPERKSNvshmemBlock
 9 | 
10 | #endif  // JACOBI3D_MULTI_STREAM_PERKS_NVSHMEM_BLOCK_H
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/single-stream/multi-threaded-one-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockComm {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace SSMultiThreadedOneBlockComm
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/single-stream/multi-threaded-two-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 2 | #define INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockComm {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace SSMultiThreadedTwoBlockComm
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH
 2 | #define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTilingNvshmem {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace MultiGPUPeerTilingNvshmem
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/single-stream/multi-threaded-one-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 2 | #define INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockComm {
 7 | int init(int argc, char** argv);  // declaration only; the definition is not in this header
 8 | }  // namespace SSMultiThreadedOneBlockComm
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/single-stream/multi-threaded-two-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 2 | #define INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 3 | // Guard carries the 3D prefix so it cannot collide with the jacobi2D header of the same file name.
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockComm {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH
 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTilingNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/baseline/multi-threaded-nvshmem.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmem {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/baseline/multi-threaded-nvshmem.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmem {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_CUH
11 | 


--------------------------------------------------------------------------------
/Plots/data/fig5/2D_Weak_Scaling_8192x4096.csv:
--------------------------------------------------------------------------------
1 | Version,1 GPU (8192x4096),2 GPUs (8192x8192),4 GPUs (16384x8192),8 GPUs (16384x16384)
2 | Baseline Copy,2.2274,2.3971,2.4163,2.4221
3 | Baseline Overlap,2.1595,2.1635,2.1902,2.2097
4 | Baseline P2P,2.1315,2.2578,2.2941,2.3263
5 | Baseline NVSHMEM,2.2502,2.3054,2.3134,2.3307
6 | Ours,2.6494,2.6536,2.6965,2.6987
7 | PERKS + Ours,1.7114,1.7167,1.7254,1.7278
8 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/no-compute/multi-gpu-peer-tiling-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTilingNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/no-compute/multi-gpu-peer-tiling-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTilingNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/single-stream/multi-threaded-multi-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
 2 | #define INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedMultiBlockComm {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/baseline/multi-threaded-copy-overlap.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopyOverlap {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/no-compute/design-1-multi-block-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_DESIGN_1_MULTI_BLOCK_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_DESIGN_1_MULTI_BLOCK_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace Design1MultiBlockNoComputation {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_DESIGN_1_MULTI_BLOCK_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/baseline/multi-threaded-copy-overlap.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopyOverlap {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmemOpt {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmemOpt {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/no-compute/multi-threaded-p2p-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedP2PNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/single-stream/multi-threaded-one-block-comm-layer.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_CUH
 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommLayer {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/no-compute/multi-threaded-p2p-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedP2PNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_P2P_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/no-compute/multi-threaded-copy-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopyNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/CG/batch/Simula/_load_simula_modules.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | # Loads the NVHPC, GCC and Python modules, then exports tool/library paths derived from NVHPC_ROOT.
 3 | module load nvidia/nvhpc/22.3
 4 | module load gcc/11.2.0
 5 | module load python-3.7.4
 6 | # NOTE(review): NVHPC_ROOT is presumably defined by the nvhpc module loaded above -- confirm.
 7 | COMM_LIBS_PATH="$NVHPC_ROOT"/comm_libs
 8 | MATH_LIBS_PATH="$NVHPC_ROOT"/math_libs/lib64
 9 | 
10 | export NVCC="$NVHPC_ROOT"/cuda/bin/nvcc
11 | export MPI_HOME="$COMM_LIBS_PATH"/mpi
12 | export NVSHMEM_HOME="$COMM_LIBS_PATH"/nvshmem
13 | export MATH_LIBS_PATH="$MATH_LIBS_PATH"


--------------------------------------------------------------------------------
/Plots/data/fig5/2D_Weak_Scaling_2048x1024.csv:
--------------------------------------------------------------------------------
1 | Version,1 GPU (2048x1024),2 GPUs (2048x2048),4 GPUs (4096x2048),8 GPUs (4096x4096)
2 | Baseline Copy,18.0521,42.88,67.3238,83.8113
3 | Baseline Overlap,19.9917,73.2787,86.0336,102.6413
4 | Baseline P2P,11.9066,33.1121,48.0276,63.7284
5 | Baseline NVSHMEM,20.2334,24.1339,23.9423,24.2244
6 | CPU-Free (Ours),12.095,12.3177,12.2255,12.3814
7 | PERKS + Ours,14.3985,14.5411,14.5367,14.5505
8 | 


--------------------------------------------------------------------------------
/Plots/data/fig5/2D_Weak_Scaling_256x256.csv:
--------------------------------------------------------------------------------
1 | Version,1 GPU (256x256),2 GPUs (256x512),4 GPUs (512x512),8 GPUs (512x1024)
2 | Baseline Copy,12.7818,42.7438,64.9559,81.4121
3 | Baseline Overlap,18.2912,63.732,75.788,100.9295
4 | Baseline P2P,4.0037,28.105,39.5831,61.4564
5 | Baseline NVSHMEM,13.5165,17.3821,14.4411,14.6838
6 | CPU-Free (Ours),11.0862,11.0697,11.4963,11.4811
7 | CPU-Free (Ours + PERKS),16.1236,15.291,16.6956,16.73
8 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/no-compute/multi-threaded-copy-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopyNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockCommNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockCommNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/no-compute/multi-threaded-two-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockCommNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH
 2 | #define INC_2D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUMultiBlockPeerTilingNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTilingNvshmemNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedMultiBlockCommNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/no-compute/multi-threaded-two-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockCommNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH
 2 | #define INC_3D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUMultiBlockPeerTilingNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_GPU_MULTI_BLOCK_PEER_TILING_NVSHMEM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace MultiGPUPeerTilingNvshmemNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_GPU_PEER_TILING_NVSHMEM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedMultiBlockCommNvshmem {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/no-compute/multi-threaded-one-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/no-compute/multi-threaded-one-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Plots/data/fig6/3D_Weak_Scaling_256x256x256.csv:
--------------------------------------------------------------------------------
1 | Version,1 GPU (256x256x256),2 GPUs (256x256x512),4 GPUs (256x512x512),8 GPUs (512x512x512)
2 | Baseline Copy,12.8902,14.339,15.2628,15.5506
3 | Baseline Overlap,12.3302,12.3777,12.5602,12.7043
4 | Baseline P2P,12.1258,13.4727,14.257,14.2942
5 | Baseline NVSHMEM,33.7791,36.2173,37.692,26.6532
6 | CPU-Free (Ours),21.6456,21.5007,21.6423,21.3508
7 | CPU-Free (Ours + PERKS),12.3902,12.6408,13.7867,24.335
8 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmemNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/no-compute/multi-threaded-multi-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedMultiBlockCommNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmemNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/no-compute/multi-threaded-copy-overlap-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopyOverlapNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include/no-compute/multi-threaded-copy-overlap-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedCopyOverlapNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_COPY_OVERLAP_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include/no-compute/multi-threaded-one-block-comm-layer-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommLayerNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_LAYER_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH
 2 | #define INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmemOptNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_2D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH
 2 | #define INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace BaselineMultiThreadedNvshmemOptNoCompute {
 7 | int init(int argc, char **argv);
 8 | }
 9 | 
10 | #endif  // INC_3D_STENCIL_BASELINE_MULTI_THREADED_NVSHMEM_OPT_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Plots/data/2d-weak-scaling-small.csv:
--------------------------------------------------------------------------------
1 | Small Domain Weak Scaling
2 | Version,1 GPU (256x256),2 GPUs (256x512),4 GPUs (512x512),8 GPUs (512x1024)
3 | Single Stream 2TB,2.4132,3.2832,3.4327,3.8099
4 | Single Stream 1TB,3.0105,4.6844,4.734,5.0006
5 | Baseline Copy Overlap,12.3663,31.1103,69.8461,96.4217
6 | Baseline Copy,7.9615,23.6853,55.2348,74.4993
7 | Baseline P2P,3.224,18.2069,33.2891,58.6196
8 | Design 1,4.4098,5.0639,5.0658,5.9583
9 | PERKS,1.5574,1.5892,1.6003,1.616


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommNvshmemNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockCommNvshmemNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_2D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedOneBlockCommNvshmemNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_ONE_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | 
 6 | namespace SSMultiThreadedTwoBlockCommNvshmemNoCompute {
 7 | int init(int argc, char** argv);
 8 | }
 9 | 
10 | #endif  // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_TWO_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/Plots/data/2d-weak-scaling-medium.csv:
--------------------------------------------------------------------------------
1 | Medium Domain Weak Scaling
2 | Version,1 GPU (1024x1024),2 GPUs (1024x2048),4 GPUs (2048x2048),8 GPUs (2048x4096)
3 | Single Stream 2TB,7.192,8.612,8.8903,8.9654
4 | Single Stream 1TB,7.2055,8.8046,9.127,15.5033
5 | Baseline Copy Overlap,12.2322,31.4607,71.3725,95.137
6 | Baseline Copy,12.7136,25.9365,55.0721,73.8571
7 | Baseline P2P,7.3627,18.3491,44.5818,60.2386
8 | Design 1,9.5494,10.8306,11.1018,10.747
9 | PERKS,1.5574,1.5892,1.6003,1.616


--------------------------------------------------------------------------------
/Plots/data/fig6/3D_Strong_Scaling_No_Compute_512x512x512.csv:
--------------------------------------------------------------------------------
1 | Version,1,2,3,4,5,6,7,8
2 | Baseline Copy,16.4393,11.0126,8.7214,7.1926,7.2223,7.5639,8.391,9.038
3 | Baseline Overlap,16.1371,8.0562,7.4151,7.8871,8.2864,9.4964,9.9443,10.0324
4 | Baseline P2P,29.6104,17.0617,12.298,9.706,8.3281,7.3623,6.6295,6.5249
5 | Baseline NVSHMEM (No Computation),57.0715,32.6568,23.7486,18.723,16.4555,14.1128,13.8677,12.7757
6 | CPU-Free (Ours),3.8826,4.1155,4.119,4.1262,4.1281,4.1229,4.126,4.1301
7 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/include_nvshmem/no-compute/multi-threaded-multi-block-comm-no-compute.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH
 2 | #define INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only. init takes main()-style (argc, argv); NOTE(review): meaning of the int result is not visible here.
 6 | namespace SSMultiThreadedMultiBlockCommNvshmemNoCompute {
 7 | int init(int argc, char** argv);
 8 | }  // namespace SSMultiThreadedMultiBlockCommNvshmemNoCompute
 9 | 
10 | #endif  // INC_NVSHMEM_3D_STENCIL_MULTI_THREADED_MULTI_BLOCK_COMM_NO_COMPUTE_CUH
11 | 


--------------------------------------------------------------------------------
/CG/results/cg_runtime_single_gpu-A100.csv:
--------------------------------------------------------------------------------
 1 | Matrix,Runtime
 2 | tridiagonal,18.6203
 3 | ecology2,1.651
 4 | hood,3.8152
 5 | bmwcra_1,3.7064
 6 | consph,1.7955
 7 | thermomech_dM,0.9367
 8 | tmt_sym,1.5808
 9 | crankseg_1,8.6219
10 | crankseg_2,10.0012
11 | Queen_4147,65.3094
12 | Bump_2911,31.9607
13 | G3_circuit,2.2177
14 | StocF-1465,3.8258
15 | Flan_1565,26.1217
16 | audikw_1,25.5873
17 | Serena,16.2381
18 | Geo_1438,16.3255
19 | Hook_1498,16.2435
20 | ldoor,14.3055
21 | 


--------------------------------------------------------------------------------
/Plots/data/2d-weak-scaling-large.csv:
--------------------------------------------------------------------------------
1 | Large Domain Weak Scaling
2 | Version,1 GPU (8192x4096),2 GPUs (8192x8192),4 GPUs (8192x16384),8 GPUs (16384x16384)
3 | Single Stream 2TB,26.7276,26.6932,26.9133,26.9483
4 | Single Stream 1TB,26.8677,26.8376,27.0564,27.0928
5 | Baseline Copy Overlap,22.0284,21.7194,21.7976,21.8375
6 | Baseline Copy,22.2609,23.5586,23.6619,23.7858
7 | Baseline P2P,21.5407,22.2303,22.8034,22.9545
8 | Design 1,26.8585,26.8136,27.058,27.1067
9 | PERKS,20.9525,21.0912,21.3203,21.8428


--------------------------------------------------------------------------------
/Plots/data/fig6/3D_Strong_Scaling_256x256x256.csv:
--------------------------------------------------------------------------------
1 | Version,1,2,3,4,5,6,7,8
2 | Baseline Copy,1.2854,0.8522,0.6712,0.5699,0.6781,0.7493,0.8319,0.8308
3 | Baseline Overlap,1.2599,0.6687,0.671,0.661,0.8533,0.8479,0.9338,0.9649
4 | Baseline P2P,1.2433,0.7529,0.5684,0.4499,0.499,0.5614,0.5165,0.5842
5 | Baseline NVSHMEM,3.3892,1.9805,1.4403,1.2186,1.0555,0.8368,0.7347,0.7
6 | CPU-Free (Ours),2.1729,0.9247,0.6057,0.4807,0.3626,0.3254,0.2766,0.2388
7 | CPU-Free (Ours + PERKS),1.2591,0.6948,0.5312,0.5198,0.4622,0.4687,0.4612,0.4659
8 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/run-bench.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --job-name=stencil-bench
 4 | #SBATCH --ntasks=8
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --partition hgx2q
 7 | #SBATCH --time=01:30:00
 8 | #SBATCH --output=sbatch_output_%j.log
 9 | # Runs $SCRIPT once per entry of ARGS; the relative path is resolved against the job's working directory.
10 | SCRIPT="./scripts/weak_scale_bench.sh"
11 | # One argument string per run. NOTE(review): looks like "<nx> <ny> <nz> <iterations>" - confirm against $SCRIPT.
12 | ARGS=(
13 |     "256 256 128 10000"
14 |     "256 256 256 10000"
15 |     "512 512 32 1000"
16 |     "512 512 512 100"
17 | )
18 | # $i is deliberately left unquoted below so each entry is word-split into separate arguments.
19 | for i in "${ARGS[@]}"
20 | do
21 |     "$SCRIPT" $i;
22 |     printf '\n\n'
23 | done
24 | 


--------------------------------------------------------------------------------
/CG/include/single-stream/standard.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_SINGLE_STREAM_STANDARD_CUH
 2 | #define INC_CG_SINGLE_STREAM_STANDARD_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace SingleStreamStandard {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace SingleStreamStandard
12 | 
13 | #endif  // INC_CG_SINGLE_STREAM_STANDARD_CUH
14 | 


--------------------------------------------------------------------------------
/CG/include/single-stream/pipelined.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_SINGLE_STREAM_PIPELINED_CUH
 2 | #define INC_CG_SINGLE_STREAM_PIPELINED_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace SingleStreamPipelined {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace SingleStreamPipelined
12 | 
13 | #endif  // INC_CG_SINGLE_STREAM_PIPELINED_CUH
14 | 


--------------------------------------------------------------------------------
/CG/include/baseline/discrete-standard.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_BASELINE_DISCRETE_STANDARD_CUH
 2 | #define INC_CG_BASELINE_DISCRETE_STANDARD_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace BaselineDiscreteStandard {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace BaselineDiscreteStandard
12 | 
13 | #endif  // INC_CG_BASELINE_DISCRETE_STANDARD_CUH
14 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src/PERKS/common/jacobi_cuda.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef PERKS_CUDA_HEADER
 2 | #define PERKS_CUDA_HEADER
 3 | // template<class REAL>
 4 | 
 5 | // This is where the target implementation is declared (13 parameters; only the last is named and defaulted).
 6 | template <class REAL>
 7 | int j3d_iterative(REAL*, int, int, int, REAL*, int, int, int, bool, bool, int, bool,
 8 |                   bool getminHeight = false);
 9 | // Expands to the j3d_iterative signature with REAL replaced by _type, all 13 parameters explicit, plus ';'.
10 | #define PERKS_DECLARE_INITIONIZATION_ITERATIVE(_type) \
11 |     int j3d_iterative(_type*, int, int, int, _type*, int, int, int, bool, bool, int, bool, bool);
12 | // Templated on the element type REAL; isDoubleTile defaults to false.
13 | template <class REAL>
14 | int getMinWidthY(int, int, int, bool isDoubleTile = false);
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/CG/include/baseline/discrete-pipelined.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_BASELINE_DISCRETE_PIPELINED_CUH
 2 | #define INC_CG_BASELINE_DISCRETE_PIPELINED_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace BaselineDiscretePipelined {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace BaselineDiscretePipelined
12 | 
13 | #endif  // INC_CG_BASELINE_DISCRETE_PIPELINED_CUH
14 | 


--------------------------------------------------------------------------------
/CG/include/profiling/discrete-standard.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_PROFILING_DISCRETE_STANDARD_CUH
 2 | #define INC_CG_PROFILING_DISCRETE_STANDARD_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace ProfilingDiscreteStandard {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace ProfilingDiscreteStandard
12 | 
13 | #endif  // INC_CG_PROFILING_DISCRETE_STANDARD_CUH
14 | 


--------------------------------------------------------------------------------
/CG/include/profiling/discrete-pipelined.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_PROFILING_DISCRETE_PIPELINED_CUH
 2 | #define INC_CG_PROFILING_DISCRETE_PIPELINED_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace ProfilingDiscretePipelined {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace ProfilingDiscretePipelined
12 | 
13 | #endif  // INC_CG_PROFILING_DISCRETE_PIPELINED_CUH
14 | 


--------------------------------------------------------------------------------
/CG/include/single-stream/pipelined-gather.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_SINGLE_STREAM_PIPELINED_GATHER_CUH
 2 | #define INC_CG_SINGLE_STREAM_PIPELINED_GATHER_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace SingleStreamPipelinedGather {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace SingleStreamPipelinedGather
12 | 
13 | #endif  // INC_CG_SINGLE_STREAM_PIPELINED_GATHER_CUH
14 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/common/jacobi_cuda.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef PERKS_CUDA_HEADER
 2 | #define PERKS_CUDA_HEADER
 3 | // template<class REAL>
 4 | 
 5 | // This is where the target implementation is declared (13 parameters; only the last is named and defaulted).
 6 | template <class REAL>
 7 | int j3d_iterative(REAL*, int, int, int, REAL*, int, int, int, bool, bool, int, bool,
 8 |                   bool getminHeight = false);
 9 | // Expands to the j3d_iterative signature with REAL replaced by _type, all 13 parameters explicit, plus ';'.
10 | #define PERKS_DECLARE_INITIONIZATION_ITERATIVE(_type) \
11 |     int j3d_iterative(_type*, int, int, int, _type*, int, int, int, bool, bool, int, bool, bool);
12 | // Templated on the element type REAL; isDoubleTile defaults to false.
13 | template <class REAL>
14 | int getMinWidthY(int, int, int, bool isDoubleTile = false);
15 | 
16 | #endif
17 | 


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_1 GPU.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        4.426455
 2 | Flan_1565        5.112643
 3 | G3_circuit       2.494318
 4 | Geo_1438         4.623393
 5 | Hook_1498        3.594672
 6 | Queen_4147       3.705777
 7 | Serena           3.341596
 8 | StocF-1465       2.237801
 9 | audikw_1         4.304354
10 | bmwcra_1         2.411268
11 | consph           2.783882
12 | crankseg_1       1.820689
13 | crankseg_2       1.742189
14 | ecology2         2.929725
15 | hood             4.932044
16 | ldoor            4.913723
17 | thermomech_dM    5.425409
18 | tmt_sym          3.102883
19 | Persistent vs Discrete Standard CG geo mean speedup on 1 GPU: 3.3515941736836967


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_2 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        7.994366
 2 | Flan_1565        9.729552
 3 | G3_circuit       5.990779
 4 | Geo_1438         9.328254
 5 | Hook_1498        7.015388
 6 | Queen_4147       7.065026
 7 | Serena           6.570781
 8 | StocF-1465       4.973564
 9 | audikw_1         6.662346
10 | bmwcra_1         4.332335
11 | consph           5.633529
12 | crankseg_1       3.272440
13 | crankseg_2       3.270840
14 | ecology2         6.744991
15 | hood             5.337812
16 | ldoor            7.827719
17 | thermomech_dM    7.555006
18 | tmt_sym          6.046260
19 | Persistent vs Discrete Standard CG geo mean speedup on 2 GPUs: 6.148600489496772


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_3 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.596158
 2 | Flan_1565        3.203920
 3 | G3_circuit       1.603297
 4 | Geo_1438         2.490385
 5 | Hook_1498        1.937174
 6 | Queen_4147       3.445680
 7 | Serena           1.830493
 8 | StocF-1465       1.493452
 9 | audikw_1         1.998953
10 | bmwcra_1         1.196542
11 | consph           1.373582
12 | crankseg_1       1.000050
13 | crankseg_2       1.005493
14 | ecology2         1.731788
15 | hood             1.398411
16 | ldoor            2.288784
17 | thermomech_dM    1.791408
18 | tmt_sym          1.774122
19 | Persistent vs Discrete Standard CG geo mean speedup on 3 GPUs: 1.789093534511638


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_6 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.334338
 2 | Flan_1565        2.720973
 3 | G3_circuit       1.660865
 4 | Geo_1438         2.323742
 5 | Hook_1498        1.862936
 6 | Queen_4147       3.157614
 7 | Serena           1.773258
 8 | StocF-1465       1.477720
 9 | audikw_1         1.178450
10 | bmwcra_1         1.194367
11 | consph           1.332771
12 | crankseg_1       0.997814
13 | crankseg_2       1.007675
14 | ecology2         1.801915
15 | hood             1.356637
16 | ldoor            1.420461
17 | thermomech_dM    1.662827
18 | tmt_sym          1.744974
19 | Persistent vs Discrete Standard CG geo mean speedup on 6 GPUs: 1.637689605919504


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_8 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.078453
 2 | Flan_1565        2.489724
 3 | G3_circuit       1.736853
 4 | Geo_1438         1.885539
 5 | Hook_1498        1.607650
 6 | Queen_4147       3.275249
 7 | Serena           1.400108
 8 | StocF-1465       1.399548
 9 | audikw_1         0.959414
10 | bmwcra_1         1.181162
11 | consph           1.360530
12 | crankseg_1       0.998168
13 | crankseg_2       0.998363
14 | ecology2         1.831156
15 | hood             1.370214
16 | ldoor            1.152638
17 | thermomech_dM    1.713865
18 | tmt_sym          1.872217
19 | Persistent vs Discrete Standard CG geo mean speedup on 8 GPUs: 1.544213236325518


--------------------------------------------------------------------------------
/CG/include/single-stream/standard-saxpy-overlap.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_SINGLE_STREAM_STANDARD_SAXPY_OVERLAP_CUH
 2 | #define INC_CG_SINGLE_STREAM_STANDARD_SAXPY_OVERLAP_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace SingleStreamStandardSaxpyOverlap {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace SingleStreamStandardSaxpyOverlap
12 | 
13 | #endif  // INC_CG_SINGLE_STREAM_STANDARD_SAXPY_OVERLAP_CUH
14 | 


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_1 GPU.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        4.375819
 2 | Flan_1565        5.052174
 3 | G3_circuit       2.200405
 4 | Geo_1438         4.382149
 5 | Hook_1498        3.484535
 6 | Queen_4147       3.664555
 7 | Serena           3.355091
 8 | StocF-1465       2.192997
 9 | audikw_1         4.212806
10 | bmwcra_1         2.354507
11 | consph           2.495528
12 | crankseg_1       1.872879
13 | crankseg_2       1.814216
14 | ecology2         2.404500
15 | hood             4.038322
16 | ldoor            4.695067
17 | thermomech_dM    4.532305
18 | tmt_sym          2.508903
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 1 GPU: 3.1385564179041943


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_2 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        7.341344
 2 | Flan_1565        8.908230
 3 | G3_circuit       5.056324
 4 | Geo_1438         8.948598
 5 | Hook_1498        6.752378
 6 | Queen_4147       5.812224
 7 | Serena           6.488967
 8 | StocF-1465       4.721093
 9 | audikw_1         5.705003
10 | bmwcra_1         4.057007
11 | consph           4.543359
12 | crankseg_1       2.812483
13 | crankseg_2       2.760180
14 | ecology2         5.792145
15 | hood             2.951552
16 | ldoor            7.557363
17 | thermomech_dM    8.461633
18 | tmt_sym          5.995432
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 2 GPUs: 5.467063971899593


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_3 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.569157
 2 | Flan_1565        3.232169
 3 | G3_circuit       1.550415
 4 | Geo_1438         2.510828
 5 | Hook_1498        1.961816
 6 | Queen_4147       3.457249
 7 | Serena           1.944080
 8 | StocF-1465       1.491928
 9 | audikw_1         1.781032
10 | bmwcra_1         1.147545
11 | consph           1.299778
12 | crankseg_1       1.005540
13 | crankseg_2       1.021906
14 | ecology2         1.791548
15 | hood             1.303192
16 | ldoor            2.333952
17 | thermomech_dM    2.034179
18 | tmt_sym          1.777085
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 3 GPUs: 1.7860187368560174


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_4 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.439995
 2 | Flan_1565        2.875764
 3 | G3_circuit       1.676666
 4 | Geo_1438         2.302183
 5 | Hook_1498        1.723814
 6 | Queen_4147       3.396676
 7 | Serena           1.658784
 8 | StocF-1465       1.383142
 9 | audikw_1         1.595776
10 | bmwcra_1         1.125087
11 | consph           1.307853
12 | crankseg_1       1.048909
13 | crankseg_2       1.027968
14 | ecology2         1.925187
15 | hood             1.307692
16 | ldoor            2.154850
17 | thermomech_dM    1.899112
18 | tmt_sym          2.364267
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 4 GPUs: 1.7447072391975165


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_5 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.344577
 2 | Flan_1565        3.025135
 3 | G3_circuit       1.857594
 4 | Geo_1438         2.514926
 5 | Hook_1498        1.965359
 6 | Queen_4147       3.355499
 7 | Serena           1.821757
 8 | StocF-1465       1.460045
 9 | audikw_1         1.469713
10 | bmwcra_1         1.168740
11 | consph           1.305694
12 | crankseg_1       1.012210
13 | crankseg_2       1.027019
14 | ecology2         2.465786
15 | hood             1.314367
16 | ldoor            1.853238
17 | thermomech_dM    1.907120
18 | tmt_sym          2.337850
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 5 GPUs: 1.7919301746521321


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_6 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.376280
 2 | Flan_1565        2.821397
 3 | G3_circuit       1.867283
 4 | Geo_1438         2.417959
 5 | Hook_1498        1.851615
 6 | Queen_4147       3.216714
 7 | Serena           1.864264
 8 | StocF-1465       1.486899
 9 | audikw_1         1.182397
10 | bmwcra_1         1.183182
11 | consph           1.306239
12 | crankseg_1       1.004176
13 | crankseg_2       1.030190
14 | ecology2         2.443718
15 | hood             1.343569
16 | ldoor            1.401290
17 | thermomech_dM    1.918803
18 | tmt_sym          2.366743
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 6 GPUs: 1.7323316068961419


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_7 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.330349
 2 | Flan_1565        2.519700
 3 | G3_circuit       1.996439
 4 | Geo_1438         2.481387
 5 | Hook_1498        2.051455
 6 | Queen_4147       3.143436
 7 | Serena           1.801486
 8 | StocF-1465       1.571447
 9 | audikw_1         1.016005
10 | bmwcra_1         1.181831
11 | consph           1.303826
12 | crankseg_1       0.996751
13 | crankseg_2       1.039673
14 | ecology2         2.513164
15 | hood             1.330792
16 | ldoor            1.144547
17 | thermomech_dM    1.918175
18 | tmt_sym          2.742812
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 7 GPUs: 1.7199580346245422


--------------------------------------------------------------------------------
/CG/results/pipelined_speedup/pipelined_cg_speedup_8 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.153395
 2 | Flan_1565        2.587598
 3 | G3_circuit       2.235581
 4 | Geo_1438         1.912613
 5 | Hook_1498        1.624379
 6 | Queen_4147       3.338698
 7 | Serena           1.403080
 8 | StocF-1465       1.298828
 9 | audikw_1         0.943741
10 | bmwcra_1         1.149462
11 | consph           1.306150
12 | crankseg_1       1.003387
13 | crankseg_2       1.009850
14 | ecology2         2.563682
15 | hood             1.337760
16 | ldoor            1.043089
17 | thermomech_dM    1.941090
18 | tmt_sym          2.803437
19 | Persistent vs Discrete Pipelined CG geo mean speedup on 8 GPUs: 1.62977591970162


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_4 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.438446
 2 | Flan_1565        2.840313
 3 | G3_circuit       1.733877
 4 | Geo_1438         2.287746
 5 | Hook_1498        1.738808
 6 | Queen_4147       3.387336
 7 | Serena           1.655311
 8 | StocF-1465       1.412219
 9 | audikw_1         1.622042
10 | bmwcra_1         1.217728
11 | consph           1.379057
12 | crankseg_1       1.020158
13 | crankseg_2       1.002734
14 | ecology2         1.993398
15 | hood             1.404928
16 | ldoor            2.103503
17 | thermomech_dM    1.790069
18 | tmt_sym          1.996205
19 | Persistent vs Discrete Standard CG geo mean speedup on 4 GPUs: 1.7436150261665724


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_5 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.351825
 2 | Flan_1565        2.944306
 3 | G3_circuit       1.718156
 4 | Geo_1438         2.421919
 5 | Hook_1498        1.923354
 6 | Queen_4147       3.291462
 7 | Serena           1.788636
 8 | StocF-1465       1.535666
 9 | audikw_1         1.435445
10 | bmwcra_1         1.189787
11 | consph           1.348844
12 | crankseg_1       1.002568
13 | crankseg_2       1.007855
14 | ecology2         1.822290
15 | hood             1.358805
16 | ldoor            1.813006
17 | thermomech_dM    1.698135
18 | tmt_sym          1.735853
19 | Persistent vs Discrete Standard CG geo mean speedup on 5 GPUs: 1.7088051164079698


--------------------------------------------------------------------------------
/CG/results/standard_speedup/pipelined_cg_speedup_7 GPUs.txt:
--------------------------------------------------------------------------------
 1 | Bump_2911        2.283016
 2 | Flan_1565        2.522607
 3 | G3_circuit       1.700237
 4 | Geo_1438         2.366187
 5 | Hook_1498        1.966283
 6 | Queen_4147       3.089906
 7 | Serena           1.794262
 8 | StocF-1465       1.503945
 9 | audikw_1         1.022986
10 | bmwcra_1         1.199006
11 | consph           1.340948
12 | crankseg_1       0.995924
13 | crankseg_2       1.008379
14 | ecology2         1.810112
15 | hood             1.349423
16 | ldoor            1.201471
17 | thermomech_dM    1.663449
18 | tmt_sym          1.835639
19 | Persistent vs Discrete Standard CG geo mean speedup on 7 GPUs: 1.6155862324379113


--------------------------------------------------------------------------------
/CG/include/single-stream/pipelined-multi-overlap.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef INC_CG_SINGLE_STREAM_PIPELINED_MULTI_OVERLAP_CUH
 2 | #define INC_CG_SINGLE_STREAM_PIPELINED_MULTI_OVERLAP_CUH
 3 | 
 4 | #include "../common.h"
 5 | // Declaration only; defined elsewhere. NOTE(review): csr* args look like a device-side CSR matrix - confirm.
 6 | namespace SingleStreamPipelinedMultiOverlap {
 7 | int init(int *device_csrRowIndices, int *device_csrColIndices, real *device_csrVal,
 8 |          const int num_rows, const int nnz, bool matrix_is_zero_indexed, const int iter_max,
 9 |          real *x_final_result, const double single_gpu_runtime, bool compare_to_single_gpu,
10 |          bool compare_to_cpu, real *x_ref_single_gpu, real *x_ref_cpu);
11 | }  // namespace SingleStreamPipelinedMultiOverlap
12 | 
13 | #endif  // INC_CG_SINGLE_STREAM_PIPELINED_MULTI_OVERLAP_CUH
14 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/PERKS/common/jacobi_reference.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #ifndef PERKS_REFERENCE_HEADER
 4 | #define PERKS_REFERENCE_HEADER
 5 | //template<class REAL>
 6 | //void jacobi(REAL*, int, int, REAL*);
 7 | 
 8 | // Single-step reference (declaration only; templated on the element type REAL).
 9 | template<class REAL>
10 | void jacobi_gold(REAL*, int, int, REAL*);
11 | // Iterative reference: same leading parameters as jacobi_gold plus one trailing int.
12 | template<class REAL>
13 | void jacobi_gold_iterative(REAL*, int, int, REAL*, int );
14 | 
15 | // Each macro below expands to the matching signature above with REAL replaced by _type, plus ';'.
16 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE(_type) \
17 |     void jacobi_gold(_type*,int,int,_type*);
18 | 
19 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE_ITERATIVE(_type) \
20 |     void jacobi_gold_iterative(_type*,int,int,_type*, int);
21 | 
22 | #endif


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src/PERKS/common/jacobi_reference.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #ifndef PERKS_REFERENCE_HEADER
 4 | #define PERKS_REFERENCE_HEADER
 5 | // template<class REAL>
 6 | // void jacobi(REAL*, int, int, REAL*);
 7 | 
 8 | // Single-step reference (declaration only; templated on the element type REAL).
 9 | template <class REAL>
10 | void j3d_gold(REAL*, int, int, int, REAL*);
11 | // Iterative reference: same leading parameters as j3d_gold plus one trailing int.
12 | template <class REAL>
13 | void j3d_gold_iterative(REAL*, int, int, int, REAL*, int);
14 | // Each macro below expands to the matching signature above with REAL replaced by _type, plus ';'.
15 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE(_type) void j3d_gold(_type*, int, int, int, _type*);
16 | 
17 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE_ITERATIVE(_type) \
18 |     void j3d_gold_iterative(_type*, int, int, int, _type*, int);
19 | 
20 | #endif


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/common/jacobi_reference.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | #ifndef PERKS_REFERENCE_HEADER
 4 | #define PERKS_REFERENCE_HEADER
 5 | //template<class REAL>
 6 | //void jacobi(REAL*, int, int, REAL*);
 7 | 
 8 | // Single-step reference (declaration only; templated on the element type REAL).
 9 | template<class REAL>
10 | void j3d_gold(REAL*, int, int, int, REAL*);
11 | // Iterative reference: same leading parameters as j3d_gold plus one trailing int.
12 | template<class REAL>
13 | void j3d_gold_iterative(REAL*, int, int, int, REAL*, int );
14 | 
15 | // Each macro below expands to the matching signature above with REAL replaced by _type, plus ';'.
16 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE(_type) \
17 |     void j3d_gold(_type*,int,int,int,_type*);
18 | 
19 | #define PERKS_DECLARE_INITIONIZATION_REFERENCE_ITERATIVE(_type) \
20 |     void j3d_gold_iterative(_type*,int,int,int,_type*, int);
21 | 
22 | #endif


--------------------------------------------------------------------------------
/CG/batch/A100-machine/_load_A100-machine_modules.sh:
--------------------------------------------------------------------------------
 1 | # Environment setup for the A100 machine. The exports only reach the caller's shell if this
 2 | # file is sourced rather than executed.
 3 | source /home/iismayilov21/spack/share/spack/setup-env.sh
 4 | 
 5 | spack load nvshmem@2.7.0-6
 6 | 
 7 | export UCX_WARN_UNUSED_ENV_VARS=n
 8 | # The *_HOME paths point into this machine's Spack tree; the hash suffixes are install-specific.
 9 | export UCX_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/ucx-1.13.1-cv37hs5p3lpknxhmuhucbsjotdn653vn/
10 | export NVSHMEM_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/nvshmem-2.7.0-6-svccom42hd6t6fmfru3txongtfpvuynm/
11 | export MPI_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/openmpi-4.1.4-cgf2kyjuumewmbove7jagikdbpo42s6q/
12 | export CUDA_HOME=$SPACK_ROOT/opt/spack/linux-ubuntu20.04-zen2/gcc-11.1.0/cuda-11.8.0-vb4kpzvmja7a3pinvxpbschaqo4jkalp/
13 | # Append the previous value only when it is non-empty: a trailing ':' would add an empty entry,
14 | # which the dynamic loader interprets as the current working directory.
15 | export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$UCX_HOME/lib:$CUDA_HOME/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}
16 | 
17 | export CXX=/usr/bin/g++-11
18 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.23)
 2 | 
 3 | set(CMAKE_VERBOSE_MAKEFILE ON)
 4 | 
 5 | project(cpu-free-model VERSION 1.0
 6 |         DESCRIPTION "TBD"
 7 |         LANGUAGES C CXX)
 8 | 
 9 | set(CMAKE_CXX_STANDARD 17)
10 | set(NVCC_VER_MIN 11.6)
11 | 
12 | # Archives, shared libraries and executables all land in <build>/bin.
13 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
14 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
15 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
16 | 
17 | # CUDA
18 | # Default to compute capability 8.0, but honour -DCMAKE_CUDA_ARCHITECTURES=... from the command
19 | # line. Set before enable_language(CUDA) so compiler detection sees the value.
20 | if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
21 |     set(CMAKE_CUDA_ARCHITECTURES "80")
22 | endif()
23 | find_package(CUDAToolkit ${NVCC_VER_MIN} REQUIRED)
24 | set(CMAKE_CUDA_STANDARD 17)
25 | set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
26 | 
27 | enable_language(CUDA)
28 | 
29 | # nvcc does not accept -fopenmp directly; -Xcompiler forwards it to the host compiler so host
30 | # code inside CUDA sources is built with OpenMP.
31 | # NOTE(review): presumably the multi-threaded variants use OpenMP in .cu files - confirm.
32 | find_package(OpenMP REQUIRED)
33 | # CMAKE_CUDA_FLAGS is a space-separated string, not a list: list(APPEND) would insert a ';'
34 | # whenever the variable is already non-empty (e.g. set via -DCMAKE_CUDA_FLAGS or CUDAFLAGS).
35 | string(APPEND CMAKE_CUDA_FLAGS " -Xcompiler -fopenmp")
36 | 
37 | add_subdirectory(CG)
38 | add_subdirectory(Stencil)
39 | 


--------------------------------------------------------------------------------
/Plots/data/2d-comp.csv:
--------------------------------------------------------------------------------
 1 | Weak Scaling 	Start => 1024x1024	End => 2048x4096	1 Million iterations
 2 | 
 3 | 	1 GPU (1024x1024)	2 GPUs (1024x2048)	4 GPUs (2048x2048)	8 GPUs (2048x4096)
 4 | Single Stream 2TB (No Compute)	2.4382	4.7672	4.9184	8.1736
 5 | Baseline Copy (No compute)	7.9792	23.7511	55.5864	78.1668
 6 | Single Stream 1TB (No Compute)	3.1543	7.5508	7.6506	13.9486
 7 | Single Stream 2TB	7.192	8.612	8.8903	8.9654
 8 | Single Stream 1TB	7.2055	8.8046	9.127	15.5033
 9 | Baseline Copy Overlap	12.2322	31.4607	71.3725	95.137
10 | Baseline Copy Overlap (No Compute)	12.2878	30.9587	70.8682	96.8606
11 | Baseline Copy	12.7136	25.9365	55.0721	73.8571
12 | Baseline P2P	7.3627	18.3491	44.5818	60.2386
13 | Baseline P2P (No Compute)	4.4936	18.1663	39.4434	56.9321
14 | Design 1	9.5494	10.8306	11.1018	10.747
15 | Design 1 (no compute)	3.4976	4.1581	4.2056	5.4061
16 | PERKS	1.5574	1.5892	1.6003	1.616


--------------------------------------------------------------------------------
/Stencil/jacobi2D/PERKS/common/jacobi_cuda.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef PERKS_CUDA_HEADER
 2 | #define PERKS_CUDA_HEADER
 3 | // Declarations only: the jacobi_iterative and getMinWidthY function templates are not defined in this header.
 4 | 
 5 | // Declaration of the target ("aimed") implementation; the parameters are unnamed here.
 6 | template <class REAL>
 7 | int jacobi_iterative(REAL*, int, int, REAL*, int, int, int, bool, bool, bool, int, bool);
 8 | // Spells the jacobi_iterative<_type> signature for use in an explicit instantiation; the expansion already ends with ';'.
 9 | #define PERKS_DECLARE_INITIONIZATION_ITERATIVE(_type)                                           \
10 |     int jacobi_iterative<_type>(_type*, int, int, _type*, int, int, int, bool, bool, bool, int, \
11 |                                 bool);
12 | // getMinWidthY overloads: one takes halo/isstar/arch as template arguments, the other two only REAL.
13 | template <int halo, bool isstar, int arch, class REAL>
14 | int getMinWidthY(int, int, int);
15 | 
16 | template <class REAL>
17 | int getMinWidthY(int, int, int, bool, int, bool);
18 | // template<class REAL>int getMinWidthY(int , int , int );
19 | template <class REAL>
20 | int getMinWidthY(int, int, bool);
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/CG/batch/Truba/_load_truba_modules.sh:
--------------------------------------------------------------------------------
 1 | . /truba/home/dsagbili/spack/share/spack/setup-env.sh
 2 | # This file is sourced by the Truba batch scripts, so the exports below land in the calling shell.
 3 | # spack load nvshmem@2.7.0-6
 4 | spack load nvshmem@2.7.0-6/pdl77w7
 5 | # Install prefixes inside the spack tree; their library directories are prepended to LD_LIBRARY_PATH below.
 6 | export UCX_WARN_UNUSED_ENV_VARS=n
 7 | #export NVSHMEM_IB_ENABLE_GPUINITIATED=1
 8 | export UCX_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/ucx-1.13.1-tc7ltbeqjfzr4sdwbv5jgppl4p62q5mu
 9 | export NVSHMEM_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/nvshmem-2.7.0-6-pdl77w7adu5dm334pezvemvt5tjxsowg
10 | export MPI_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/openmpi-4.1.4-ycvxffyzzonogvqycd4gpp7aholtkss5
11 | export CUDA_HOME=/truba/home/dsagbili/spack/opt/spack/linux-rhel8-zen/gcc-8.5.0/cuda-11.8.0-37xn6z7age2zvgrmug5jad7l34sizzkp
12 | export LD_LIBRARY_PATH=$NVSHMEM_HOME/lib:$UCX_HOME/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH
13 | # NOTE(review): presumably provides the python3 used by the measurement scripts - confirm.
14 | module load centos7.3/comp/python/3.7.7-openmpi-1.8.8-gcc-4.8.5-GOLD


--------------------------------------------------------------------------------
/Stencil/jacobi2D/PERKS/common/types.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef PERKS_TYPES
 3 | #define PERKS_TYPES
 4 | // Each helper expands to "template _macro(float, ...); template _macro(double, ...)".
 5 | #define PERKS_INITIALIZE_ALL_TYPE(_macro) \
 6 |     template _macro(float);\
 7 |     template _macro(double)
 8 | // The _NARG variants forward N extra arguments to _macro unchanged.
 9 | #define PERKS_INITIALIZE_ALL_TYPE_1ARG(_macro,halo) \
10 |     template _macro(float,halo);\
11 |     template _macro(double,halo)
12 | 
13 | #define PERKS_INITIALIZE_ALL_TYPE_2ARG(_macro,a,b) \
14 |     template _macro(float,a,b);\
15 |     template _macro(double,a,b)
16 | 
17 | #define PERKS_INITIALIZE_ALL_TYPE_3ARG(_macro,a,b,c) \
18 |     template _macro(float,a,b,c);\
19 |     template _macro(double,a,b,c)
20 | 
21 | 
22 | #define PERKS_INITIALIZE_ALL_TYPE_4ARG(_macro,a,b,c,d) \
23 |     template _macro(float,a,b,c,d);\
24 |     template _macro(double,a,b,c,d)
25 | 
26 | 
27 | #define PERKS_INITIALIZE_ALL_TYPE_5ARG(_macro,a,b,c,d,e) \
28 |     template _macro(float,a,b,c,d,e);\
29 |     template _macro(double,a,b,c,d,e)
30 | 
31 | // No expansion ends with ';' - the use site supplies the final one.
32 | 
33 | 
34 | #endif
35 | 


--------------------------------------------------------------------------------
/CG/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Resolve the external packages first, then declare the target.
 2 | find_package(OpenMP REQUIRED)
 3 | find_package(NVSHMEM REQUIRED)
 4 | find_package(MPI REQUIRED)
 5 | 
 6 | # cg executable: shared sources followed by one source file per solver variant.
 7 | add_executable(cg)
 8 | target_sources(cg PRIVATE
 9 |         src/main.cu
10 |         src/common.cu
11 |         src/mmio.c
12 |         src/mmio_wrapper.cpp
13 |         src/baseline/discrete-pipelined.cu
14 |         src/baseline/discrete-standard.cu
15 |         src/profiling/discrete-pipelined.cu
16 |         src/profiling/discrete-standard.cu
17 |         src/single-gpu/discrete-standard.cu
18 |         src/single-stream/pipelined.cu
19 |         src/single-stream/pipelined-gather.cu
20 |         src/single-stream/pipelined-multi-overlap.cu
21 |         src/single-stream/standard.cu
22 |         src/single-stream/standard-saxpy-overlap.cu)
23 | 
24 | # Project headers are registered as SYSTEM includes.
25 | target_include_directories(cg SYSTEM PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
26 | 
27 | target_link_libraries(cg CUDA::cudart OpenMP::OpenMP_CXX nvshmem::nvshmem MPI::MPI_CXX)
28 | 
29 | # _FORCE_INLINES is defined for CUDA translation units only.
30 | target_compile_options(cg PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-D_FORCE_INLINES>)
31 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/PERKS/config.cuh:
--------------------------------------------------------------------------------
 1 | #ifndef TILE_X  // keep a TILE_X defined earlier (e.g. by the build), otherwise default to 256
 2 | #define TILE_X (256)
 3 | #endif
 4 | #ifndef RTILE_Y  // same override pattern as TILE_X
 5 | // #define RTILE_Y (8)
 6 | #define RTILE_Y (8)
 7 | #endif
 8 | 
 9 | // minimal architecture is 600
10 | 
11 | // #if defined(js2d5pt)
12 | #define HALO (1)  // unconditional: the per-stencil #if/#elif selection around it is commented out
13 | // #define REG_FOLDER_Y (5)
14 | 
15 | // #elif defined(js2d9pt)
16 | //     #define HALO (2)
17 | //     #define REG_FOLDER_Y (10)
18 | // #elif defined(js2d13pt)
19 | //     #define HALO (3)
20 | //     #define REG_FOLDER_Y (10)
21 | // #elif defined(js2d17pt)
22 | //     #define HALO (4)
23 | //     #define REG_FOLDER_Y (10)
24 | // #elif defined(js2d21pt)
25 | //     #define HALO (5)
26 | //     #define REG_FOLDER_Y (10)
27 | // #elif defined(js2d25pt)
28 | //     #define HALO (6)
29 | //     #define REG_FOLDER_Y (10)
30 | // #elif defined(jb2d9pt)
31 | //     #define HALO (1)
32 | //     #define BOX
33 | //     #define REG_FOLDER_Y (0)
34 | // #elif defined(jb2d25pt)
35 | //     #define HALO (2)
36 | //     #define BOX
37 | //     #define REG_FOLDER_Y (0)
38 | 
39 | // #endif
40 | 
41 | #ifndef Halo  // Halo falls back to HALO unless it was defined beforehand
42 | #define Halo HALO
43 | #endif


--------------------------------------------------------------------------------
/CG/scripts/calculate_nnz_num_rows_ratio.py:
--------------------------------------------------------------------------------
 1 | # Maps matrix name to tuple (num_rows, num_nnz)
 2 | MATRIX_NAMES_TO_INFO = {
 3 |     'ecology2': (999999, 4995991),
 4 |     'hood': (220542, 10768436),
 5 |     'bmwcra_1': (148770, 10644002),
 6 |     'consph': (83334, 6010480),
 7 |     'thermomech_dM': (204316, 1423116),
 8 |     'tmt_sym': (726713, 5080961),
 9 |     'crankseg_1': (52804, 10614210),
10 |     'crankseg_2': (63838, 14148858),
11 |     'Queen_4147': (4147110, 329499284),
12 |     'Bump_2911': (2911419, 127729899),
13 |     'G3_circuit': (1585478, 7660826),
14 |     'StocF-1465': (1465137, 21005389),
15 |     'Flan_1565': (1564794, 117406044),
16 |     'audikw_1': (943695, 77651847),
17 |     'Serena': (1391349, 64531701),
18 |     'Geo_1438': (1437960, 63156690),
19 |     'Hook_1498': (1498023, 60917445),
20 |     'ldoor': (952203, 46522475)
21 | }
22 | 
23 | if __name__ == "__main__":
24 |     for matrix_name, (num_rows, num_nnz) in MATRIX_NAMES_TO_INFO.items():
25 |         nnz_to_num_rows_ratio = num_nnz / num_rows
26 |         print(
27 |             f'Sparsity for matrix {matrix_name} is {nnz_to_num_rows_ratio:.2f}')
28 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src/PERKS/config.cuh:
--------------------------------------------------------------------------------
 1 | // #ifndef TILE_X
 2 | //     #define TILE_X (256)
 3 | // #endif
 4 | // #ifndef RTILE_Y
 5 | //     // #define RTILE_Y (8)
 6 | //     #define RTILE_Y (8)
 7 | // #endif
 8 | 
 9 | // minimal architecture is 600
10 | 
11 | // #if defined(js2d5pt)
12 | #define HALO (1)  // the only active line of this file; the per-stencil alternatives below are commented out
13 | // #define REG_FOLDER_Y (5)
14 | 
15 | // #elif defined(js2d9pt)
16 | //     #define HALO (2)
17 | //     #define REG_FOLDER_Y (10)
18 | // #elif defined(js2d13pt)
19 | //     #define HALO (3)
20 | //     #define REG_FOLDER_Y (10)
21 | // #elif defined(js2d17pt)
22 | //     #define HALO (4)
23 | //     #define REG_FOLDER_Y (10)
24 | // #elif defined(js2d21pt)
25 | //     #define HALO (5)
26 | //     #define REG_FOLDER_Y (10)
27 | // #elif defined(js2d25pt)
28 | //     #define HALO (6)
29 | //     #define REG_FOLDER_Y (10)
30 | // #elif defined(jb2d9pt)
31 | //     #define HALO (1)
32 | //     #define BOX
33 | //     #define REG_FOLDER_Y (0)
34 | // #elif defined(jb2d25pt)
35 | //     #define HALO (2)
36 | //     #define BOX
37 | //     #define REG_FOLDER_Y (0)
38 | 
39 | // #endif
40 | 
41 | // #ifndef Halo
42 | //     #define Halo HALO
43 | // #endif


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/config.cuh:
--------------------------------------------------------------------------------
 1 | // #ifndef TILE_X
 2 | //     #define TILE_X (256)
 3 | // #endif
 4 | // #ifndef RTILE_Y
 5 | //     // #define RTILE_Y (8)
 6 | //     #define RTILE_Y (8)
 7 | // #endif
 8 | 
 9 | // minimal architecture is 600
10 | 
11 | // #if defined(js2d5pt)
12 | #define HALO (1)  // the only active line of this file; the per-stencil alternatives below are commented out
13 | // #define REG_FOLDER_Y (5)
14 | 
15 | // #elif defined(js2d9pt)
16 | //     #define HALO (2)
17 | //     #define REG_FOLDER_Y (10)
18 | // #elif defined(js2d13pt)
19 | //     #define HALO (3)
20 | //     #define REG_FOLDER_Y (10)
21 | // #elif defined(js2d17pt)
22 | //     #define HALO (4)
23 | //     #define REG_FOLDER_Y (10)
24 | // #elif defined(js2d21pt)
25 | //     #define HALO (5)
26 | //     #define REG_FOLDER_Y (10)
27 | // #elif defined(js2d25pt)
28 | //     #define HALO (6)
29 | //     #define REG_FOLDER_Y (10)
30 | // #elif defined(jb2d9pt)
31 | //     #define HALO (1)
32 | //     #define BOX
33 | //     #define REG_FOLDER_Y (0)
34 | // #elif defined(jb2d25pt)
35 | //     #define HALO (2)
36 | //     #define BOX
37 | //     #define REG_FOLDER_Y (0)
38 | 
39 | // #endif
40 | 
41 | // #ifndef Halo
42 | //     #define Halo HALO
43 | // #endif


--------------------------------------------------------------------------------
/CG/batch/Karolina/_measure_single_gpu_runtime.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Stages the cg binary and measure_single_gpu_runtime.py in the project scratch
 4 | # directory and runs the measurement from there.
 5 | # Every default can be overridden from the environment or as "--NAME value",
 6 | # where NAME is the exact variable name (e.g. --NUM_ITER 500).
 7 | NUM_ITER=${NUM_ITER:-1000}
 8 | NUM_RUNS=${NUM_RUNS:-5}
 9 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME}
10 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER}
11 | GPU_MODEL=${GPU_MODEL:-A100}
12 | 
13 | while [ $# -gt 0 ]; do
14 |     # Only arguments that start with "--" are option names; a substring match
15 |     # would also treat a value that merely contains "--" as one.
16 |     if [[ $1 == --* ]]; then
17 |         param="${1#--}"
18 |         declare "$param=$2"
19 |     fi
20 | 
21 |     shift
22 | done
23 | 
24 | WORK_DIR=~/multi-perks/CG
25 | SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/multi-perks-runs
26 | # SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/${USER}/multi-perks-runs
27 | 
28 | # Abort on a failed cd: everything below uses paths relative to the current directory.
29 | cd "$WORK_DIR" || exit 1
30 | 
31 | . ./batch/Karolina/_load_karolina_modules.sh > /dev/null
32 | 
33 | cd "$SCRATCH_DIR" || exit 1
34 | 
35 | # Refresh the staged copies so the run uses the current build and script.
36 | mkdir -p ./bin ./scripts
37 | cp "$WORK_DIR/bin/cg" ./bin/cg || exit 1
38 | cp "$WORK_DIR/scripts/measure_single_gpu_runtime.py" ./scripts/measure_single_gpu_runtime.py || exit 1
39 | 
40 | echo "--- RUNNING ---"
41 | date
42 | 
43 | python3 ./scripts/measure_single_gpu_runtime.py --num_iter "$NUM_ITER" --num_runs "$NUM_RUNS" --filename "$FILENAME" --matrices_folder "$MATRICES_FOLDER" --gpu_model "$GPU_MODEL"
44 | 
45 | echo ""
46 | 
47 | echo "--- DONE ---"
48 | date
49 | 
50 | cd "$WORK_DIR"


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 ParCore Lab, Koç University (Istanbul), and all contributors listed in AUTHORS All rights reserved. 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/CG/Makefile2:
--------------------------------------------------------------------------------
 1 | # Builds the cg binary from every .cu file found under src/ plus mmio.c and
 2 | # mmio_wrapper.cpp. rwildcard, COMPILE_NVSHMEM, LINK_NVSHMEM and NVCC are not
 3 | # defined here; they are expected to come from ../common.mk.
 4 | SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
 5 | include $(SELF_DIR)/../common.mk
 6 | 
 7 | BUILD_ROOT ?= bin
 8 | OBJ_ROOT ?= $(BUILD_ROOT)/obj
 9 | 
10 | NV_SRCDIR := $(SELF_DIR)/src
11 | NV_OBJDIR := $(OBJ_ROOT)
12 | NV_DEPDIR := $(NV_OBJDIR)/.deps
13 | 
14 | .DEFAULT_GOAL := cg
15 | 
16 | # PROFILE=1 embeds source line information in the device code.
17 | # -lineinfo is nvcc's short spelling of --generate-line-info, so pass it once.
18 | ifdef PROFILE
19 | 	NVCC_NV_FLAGS += -lineinfo
20 | endif
21 | 
22 | # USE_NVTX=1 defines USE_NVTX for the sources and links nvToolsExt.
23 | ifdef USE_NVTX
24 | 	NVCC_NV_FLAGS += -DUSE_NVTX
25 | 	NVCC_NV_LDFLAGS += -lnvToolsExt
26 | endif
27 | 
28 | NV_SRCS = $(call rwildcard,$(NV_SRCDIR),*.cu)
29 | NV_OBJS := $(patsubst $(NV_SRCDIR)/%.cu, $(NV_OBJDIR)/%.o, $(NV_SRCS))
30 | NV_DEPS := $(patsubst $(NV_SRCDIR)/%.cu, $(NV_DEPDIR)/%.d, $(NV_SRCS))
31 | 
32 | cg: $(NV_OBJS) $(NV_OBJDIR)/mmio.c.o $(NV_OBJDIR)/mmio_wrapper.o
33 | 	$(LINK_NVSHMEM)
34 | 
35 | $(NV_OBJS) : $(NV_OBJDIR)/%.o : $(NV_SRCDIR)/%.cu $(NV_DEPDIR)/%.d | $(NV_DEPDIR)
36 | 	$(call COMPILE_NVSHMEM, $(NV_DEPDIR))
37 | 
38 | # The order-only prerequisite makes sure the object directory exists first:
39 | # creating $(NV_DEPDIR) with mkdir -p also creates its parent $(NV_OBJDIR).
40 | $(NV_OBJDIR)/mmio.c.o: $(NV_SRCDIR)/mmio.c | $(NV_DEPDIR)
41 | 	$(NVCC) -o $@ -c $<
42 | 
43 | $(NV_OBJDIR)/mmio_wrapper.o: $(NV_SRCDIR)/mmio_wrapper.cpp | $(NV_DEPDIR)
44 | 	$(NVCC) -o $@ -c $<
45 | 
46 | # run produces no file, so it must not be shadowed by a file named "run".
47 | .PHONY: run
48 | run: cg
49 | 	./cg
50 | 
51 | $(NV_DEPDIR):
52 | 	@mkdir -p $(NV_DEPDIR)
53 | 
54 | # Empty rule: a missing .d file is not an error.
55 | $(NV_DEPS):
56 | 
57 | include $(wildcard $(NV_DEPS))
58 | 


--------------------------------------------------------------------------------
/CG/batch/Karolina/_measure_total_runtime.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Stages the cg binary and measure_runtime.py in the project scratch directory
 4 | # and runs the runtime measurement on NUM_NODES * 8 GPUs.
 5 | # Every default can be overridden from the environment or as "--NAME value",
 6 | # where NAME is the exact variable name (e.g. --NUM_NODES 2).
 7 | NUM_ITER=${NUM_ITER:-1000}
 8 | NUM_RUNS=${NUM_RUNS:-5}
 9 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME}
10 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER}
11 | GPU_MODEL=${GPU_MODEL:-A100}
12 | NUM_NODES=${NUM_NODES:-1}
13 | 
14 | # Runs all versions by default
15 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS}
16 | 
17 | while [ $# -gt 0 ]; do
18 |     # Only arguments that start with "--" are option names; a substring match
19 |     # would also treat a value that merely contains "--" as one.
20 |     if [[ $1 == --* ]]; then
21 |         param="${1#--}"
22 |         declare "$param=$2"
23 |     fi
24 | 
25 |     shift
26 | done
27 | 
28 | # Computed after option parsing so that --NUM_NODES is taken into account.
29 | NUM_GPUS=$((NUM_NODES * 8))
30 | 
31 | WORK_DIR=~/multi-perks/CG
32 | SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/multi-perks-runs
33 | # SCRATCH_DIR=/scratch/project/${PBS_ACCOUNT,,}/${USER}/multi-perks-runs
34 | 
35 | # Abort on a failed cd: everything below uses paths relative to the current directory.
36 | cd "$WORK_DIR" || exit 1
37 | 
38 | . ./batch/Karolina/_load_karolina_modules.sh > /dev/null
39 | 
40 | cd "$SCRATCH_DIR" || exit 1
41 | 
42 | # Refresh the staged copies so the run uses the current build and script.
43 | mkdir -p ./bin ./scripts
44 | cp "$WORK_DIR/bin/cg" ./bin/cg || exit 1
45 | cp "$WORK_DIR/scripts/measure_runtime.py" ./scripts/measure_runtime.py || exit 1
46 | 
47 | echo "--- RUNNING ---"
48 | date
49 | 
50 | python3 ./scripts/measure_runtime.py --num_iter "$NUM_ITER" --num_runs "$NUM_RUNS" --filename "$FILENAME" --matrices_folder "$MATRICES_FOLDER" --num_gpus "$NUM_GPUS" --versions_to_run "$VERSIONS_TO_RUN" --gpu_model "$GPU_MODEL"
51 | 
52 | echo ""
53 | 
54 | echo "--- DONE ---"
55 | date
56 | 
57 | cd "$WORK_DIR"


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src_nvshmem/PERKS-nvshmem/common/types.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef PERKS_TYPES
 3 | #define PERKS_TYPES
 4 | // Each helper expands to "template _macro(float, ...); template _macro(double, ...)".
 5 | #define PERKS_INITIALIZE_ALL_TYPE(_macro) \
 6 |     template _macro(float);\
 7 |     template _macro(double)
 8 | // The _NARG variants forward N extra arguments to _macro unchanged.
 9 | #define PERKS_INITIALIZE_ALL_TYPE_1ARG(_macro,halo) \
10 |     template _macro(float,halo);\
11 |     template _macro(double,halo)
12 | 
13 | #define PERKS_INITIALIZE_ALL_TYPE_2ARG(_macro,a,b) \
14 |     template _macro(float,a,b);\
15 |     template _macro(double,a,b)
16 | 
17 | #define PERKS_INITIALIZE_ALL_TYPE_3ARG(_macro,a,b,c) \
18 |     template _macro(float,a,b,c);\
19 |     template _macro(double,a,b,c)
20 | 
21 | // No expansion ends with ';' - the use site supplies the final one.
22 | #define PERKS_INITIALIZE_ALL_TYPE_4ARG(_macro,a,b,c,d) \
23 |     template _macro(float,a,b,c,d);\
24 |     template _macro(double,a,b,c,d)
25 | 
26 | 
27 | #define PERKS_INITIALIZE_ALL_TYPE_5ARG(_macro,a,b,c,d,e) \
28 |     template _macro(float,a,b,c,d,e);\
29 |     template _macro(double,a,b,c,d,e)
30 | 
31 | #define PERKS_INITIALIZE_ALL_TYPE_6ARG(_macro,a,b,c,d,e,f) \
32 |     template _macro(float,a,b,c,d,e,f);\
33 |     template _macro(double,a,b,c,d,e,f)
34 | 
35 | #define PERKS_INITIALIZE_ALL_TYPE_7ARG(_macro,a,b,c,d,e,f,g) \
36 |     template _macro(float,a,b,c,d,e,f,g);\
37 |     template _macro(double,a,b,c,d,e,f,g)
38 | 
39 | #define PERKS_INITIALIZE_ALL_TYPE_8ARG(_macro,a,b,c,d,e,f,g,h) \
40 |     template _macro(float,a,b,c,d,e,f,g,h);\
41 |     template _macro(double,a,b,c,d,e,f,g,h)    
42 | #endif
43 | 


--------------------------------------------------------------------------------
/CG/batch/Simula/_measure_operation_breakdown.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --job-name=cg-bench
 4 | #SBATCH --ntasks=8
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --partition=dgx2q
 7 | #SBATCH --time=06:00:00
 8 | #SBATCH --output=sbatch_output_%j.log
 9 | 
10 | # Runs scripts/measure_operation_breakdown.py with the settings below and then
11 | # empties ./nsys_reports.
12 | # Every default can be overridden from the environment or as "--NAME value",
13 | # where NAME is the exact variable name (e.g. --NUM_GPUS 2,4,8).
14 | NUM_ITER=${NUM_ITER:-1000}
15 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME}
16 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER}
17 | GPU_MODEL=${GPU_MODEL:-V100}
18 | 
19 | # This will be a comma delimited list of number of GPUs to run on
20 | # No spaces between numbers
21 | # Single numbers also work
22 | # (Example => 2,3,4,8)
23 | NUM_GPUS=${NUM_GPUS:-8}
24 | 
25 | # This will be a comma delimited list of version indices
26 | # No spaces between numbers
27 | # Single numbers also work
28 | # (Example => 0,1,2,4)
29 | # Runs all versions by default
30 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS}
31 | 
32 | while [ $# -gt 0 ]; do
33 |     # Only arguments that start with "--" are option names; a substring match
34 |     # would also treat a value that merely contains "--" as one.
35 |     if [[ $1 == --* ]]; then
36 |         param="${1#--}"
37 |         declare "$param=$2"
38 |     fi
39 | 
40 |     shift
41 | done
42 | 
43 | # Abort on a failed cd: the module script, the Python script and the rm below
44 | # are all addressed relative to this directory.
45 | cd ~/multi-perks/CG || exit 1
46 | 
47 | . ./batch/Simula/_load_simula_modules.sh > /dev/null
48 | 
49 | echo "--- RUNNING ---"
50 | date
51 | 
52 | python3 ./scripts/measure_operation_breakdown.py --num_iter "$NUM_ITER" --filename "$FILENAME" --matrices_folder "$MATRICES_FOLDER" --num_gpus "$NUM_GPUS" --versions_to_run "$VERSIONS_TO_RUN" --gpu_model "$GPU_MODEL"
53 | # -f: an already empty report directory is not an error.
54 | rm -f ./nsys_reports/*
55 | 
56 | echo ""
57 | 
58 | echo "--- DONE ---"
59 | date


--------------------------------------------------------------------------------
/CG/batch/Simula/_measure_total_runtime.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --job-name=cg-bench
 4 | #SBATCH --ntasks=8
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --partition=dgx2q
 7 | #SBATCH --time=06:00:00
 8 | #SBATCH --output=sbatch_output_%j.log
 9 | 
10 | # Runs scripts/measure_runtime.py with the settings below.
11 | # Every default can be overridden from the environment or as "--NAME value",
12 | # where NAME is the exact variable name (e.g. --NUM_RUNS 10).
13 | NUM_ITER=${NUM_ITER:-1000}
14 | NUM_RUNS=${NUM_RUNS:-5}
15 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME}
16 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER}
17 | GPU_MODEL=${GPU_MODEL:-V100}
18 | 
19 | # This will be a comma delimited list of number of GPUs to run on
20 | # No spaces between numbers
21 | # Single numbers also work
22 | # (Example => 2,3,4,8)
23 | NUM_GPUS=${NUM_GPUS:-8}
24 | 
25 | # This will be a comma delimited list of version indices
26 | # No spaces between numbers
27 | # Single numbers also work
28 | # (Example => 0,1,2,4)
29 | # Runs all versions by default
30 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS}
31 | 
32 | while [ $# -gt 0 ]; do
33 |     # Only arguments that start with "--" are option names; a substring match
34 |     # would also treat a value that merely contains "--" as one.
35 |     if [[ $1 == --* ]]; then
36 |         param="${1#--}"
37 |         declare "$param=$2"
38 |     fi
39 | 
40 |     shift
41 | done
42 | 
43 | # Abort on a failed cd: the module script and the Python script are addressed
44 | # relative to this directory.
45 | cd ~/multi-perks/CG || exit 1
46 | 
47 | . ./batch/Simula/_load_simula_modules.sh > /dev/null
48 | 
49 | echo "--- RUNNING ---"
50 | date
51 | 
52 | python3 ./scripts/measure_runtime.py --num_iter "$NUM_ITER" --num_runs "$NUM_RUNS" --filename "$FILENAME" --matrices_folder "$MATRICES_FOLDER" --num_gpus "$NUM_GPUS" --versions_to_run "$VERSIONS_TO_RUN" --gpu_model "$GPU_MODEL"
53 | 
54 | echo ""
55 | 
56 | echo "--- DONE ---"
57 | date


--------------------------------------------------------------------------------
/Plots/common.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import textwrap
 3 | from pathlib import Path
 4 | 
 5 | BASE_DIR = Path('Images')
 6 | BASE_DIR.mkdir(exist_ok=True)
 7 | 
 8 | 
 9 | def rotate(l, n):
10 |     return l[-n:] + l[:-n]
11 | 
12 | 
13 | def get_files():
14 |     parser = argparse.ArgumentParser()
15 |     parser.add_argument('files', type=argparse.FileType('r'), nargs='+')
16 |     return parser.parse_args().files
17 | 
18 | 
19 | def get_module_dir(dir_name):
20 |     module_dir = BASE_DIR / dir_name
21 |     module_dir.mkdir(exist_ok=True)
22 |     return module_dir
23 | 
24 | 
25 | def wrap_labels(ax, width, break_long_words=False):
26 |     labels = []
27 |     for label in ax.get_xticklabels():
28 |         text = label.get_text()
29 |         labels.append(textwrap.fill(text, width=width,
30 |                                     break_long_words=break_long_words))
31 |     ax.set_xticklabels(labels, rotation=0)
32 | 
33 | 
34 | markers = [
35 |     '.',  # point
36 |     ',',  # pixel
37 |     'o',  # circle
38 |     'v',  # triangle down
39 |     '^',  # triangle up
40 |     '<',  # triangle_left
41 |     '>',  # triangle_right
42 |     '1',  # tri_down
43 |     '2',  # tri_up
44 |     '3',  # tri_left
45 |     '4',  # tri_right
46 |     '8',  # octagon
47 |     's',  # square
48 |     'p',  # pentagon
49 |     '*',  # star
50 |     'h',  # hexagon1
51 |     'H',  # hexagon2
52 |     '+',  # plus
53 |     'x',  # x
54 |     'D',  # diamond
55 |     'd',  # thin_diamond
56 |     '|',  # vline
57 | ]
58 | 


--------------------------------------------------------------------------------
/CG/batch/Truba/_measure_operation_breakdown.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -J cg-operation-breakdown
 4 | #SBATCH -N 1
 5 | #SBATCH -n 8
 6 | #SBATCH -c 16
 7 | #SBATCH -A proj16
 8 | #SBATCH -p palamut-cuda
 9 | #SBATCH --gres=gpu:8
10 | #SBATCH --time=24:00:00
11 | #SBATCH -o cg-operation-breakdown_output_%j.log
12 | 
13 | # Runs scripts/measure_operation_breakdown.py with the settings below and then
14 | # empties ./nsys_reports.
15 | # Every default can be overridden from the environment or as "--NAME value",
16 | # where NAME is the exact variable name (e.g. --NUM_GPUS 2,4,8).
17 | NUM_ITER=${NUM_ITER:-1000}
18 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME}
19 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER}
20 | GPU_MODEL=${GPU_MODEL:-A100}
21 | 
22 | # This will be a comma delimited list of number of GPUs to run on
23 | # No spaces between numbers
24 | # Single numbers also work
25 | # (Example => 2,3,4,8)
26 | NUM_GPUS=${NUM_GPUS:-8}
27 | 
28 | # This will be a comma delimited list of version indices
29 | # No spaces between numbers
30 | # Single numbers also work
31 | # (Example => 0,1,2,4)
32 | # Runs all versions by default
33 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS}
34 | 
35 | while [ $# -gt 0 ]; do
36 |     # Only arguments that start with "--" are option names; a substring match
37 |     # would also treat a value that merely contains "--" as one.
38 |     if [[ $1 == --* ]]; then
39 |         param="${1#--}"
40 |         declare "$param=$2"
41 |     fi
42 | 
43 |     shift
44 | done
45 | 
46 | # Abort on a failed cd: the module script, the Python script and the rm below
47 | # are all addressed relative to this directory.
48 | cd ~/ismayil/multi-perks/CG || exit 1
49 | 
50 | . ./batch/Truba/_load_truba_modules.sh > /dev/null
51 | 
52 | echo "--- RUNNING ---"
53 | date
54 | 
55 | # FILENAME is forwarded as well; it used to be parsed above but never passed on,
56 | # unlike in the Simula variant of this script.
57 | python3 ./scripts/measure_operation_breakdown.py --num_iter "$NUM_ITER" --filename "$FILENAME" --matrices_folder "$MATRICES_FOLDER" --num_gpus "$NUM_GPUS" --versions_to_run "$VERSIONS_TO_RUN" --gpu_model "$GPU_MODEL"
58 | # -f: an already empty report directory is not an error.
59 | rm -f ./nsys_reports/*
60 | 
61 | echo ""
62 | 
63 | echo "--- DONE ---"
64 | date


--------------------------------------------------------------------------------
/CG/batch/Truba/_measure_total_runtime.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH -J cg-runtime_benchmark
 4 | #SBATCH -N 1
 5 | #SBATCH -n 8
 6 | #SBATCH -c 16
 7 | #SBATCH -A proj16
 8 | #SBATCH -p palamut-cuda
 9 | #SBATCH --gres=gpu:8
10 | #SBATCH --time=24:00:00
11 | #SBATCH -o cg-runtime_benchmark_output_%j.log
12 | 
13 | # Runs scripts/measure_runtime.py with the settings below.
14 | # Every default can be overridden from the environment or as "--NAME value",
15 | # where NAME is the exact variable name (e.g. --NUM_RUNS 10).
16 | NUM_ITER=${NUM_ITER:-1000}
17 | NUM_RUNS=${NUM_RUNS:-5}
18 | FILENAME=${FILENAME:-USE_DEFAULT_FILENAME}
19 | MATRICES_FOLDER=${MATRICES_FOLDER:-USE_DEFAULT_MATRICES_FOLDER}
20 | GPU_MODEL=${GPU_MODEL:-A100}
21 | 
22 | # This will be a comma delimited list of number of GPUs to run on
23 | # No spaces between numbers
24 | # Single numbers also work
25 | # (Example => 2,3,4,8)
26 | NUM_GPUS=${NUM_GPUS:-8}
27 | 
28 | # This will be a comma delimited list of version indices
29 | # No spaces between numbers
30 | # Single numbers also work
31 | # (Example => 0,1,2,4)
32 | # Runs all versions by default
33 | VERSIONS_TO_RUN=${VERSIONS_TO_RUN:-RUN_ALL_VERSIONS}
34 | 
35 | while [ $# -gt 0 ]; do
36 |     # Only arguments that start with "--" are option names; a substring match
37 |     # would also treat a value that merely contains "--" as one.
38 |     if [[ $1 == --* ]]; then
39 |         param="${1#--}"
40 |         declare "$param=$2"
41 |     fi
42 | 
43 |     shift
44 | done
45 | 
46 | # Abort on a failed cd: the module script and the Python script are addressed
47 | # relative to this directory.
48 | cd ~/ismayil/multi-perks/CG || exit 1
49 | 
50 | . ./batch/Truba/_load_truba_modules.sh > /dev/null
51 | 
52 | echo "--- RUNNING ---"
53 | date
54 | 
55 | python3 ./scripts/measure_runtime.py --num_iter "$NUM_ITER" --num_runs "$NUM_RUNS" --filename "$FILENAME" --matrices_folder "$MATRICES_FOLDER" --num_gpus "$NUM_GPUS" --versions_to_run "$VERSIONS_TO_RUN" --gpu_model "$GPU_MODEL"
56 | 
57 | echo ""
58 | 
59 | echo "--- DONE ---"
60 | date


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/bench.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --job-name=stencil-bench
 4 | #SBATCH --ntasks=8
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --partition hgx2q
 7 | #SBATCH --time=01:00:00
 8 | #SBATCH --output=sbatch_output_%j.log
 9 | 
10 | # Runs version V_OURS and version V_BASELINE of ./jacobi for every GPU
11 | # configuration listed below and writes one CSV row per (version, configuration)
12 | # holding the smallest execution time seen in NUM_RUNS runs.
13 | 
14 | . ./scripts/modules.sh > /dev/null
15 | 
16 | BIN="./jacobi -s 1"
17 | NUM_RUNS=5
18 | V_OURS=1
19 | V_BASELINE=6
20 | 
21 | #OUT_CSV="./results.csv"
22 | OUT_CSV="/dev/stdout"
23 | echo "version,nx,ny,niter,num_gpus,execution_time" >> "$OUT_CSV"
24 | 
25 | # Largest number of GPUs to benchmark; configurations that need more are skipped.
26 | # The former value 4 was used as an index into the arrays below and therefore
27 | # already selected the 8-GPU configuration; 8 keeps the same set of runs.
28 | #MAX_NUM_GPUS=$(nvidia-smi --list-gpus | wc -l)
29 | MAX_NUM_GPUS=8
30 | 
31 | # Entry i of both arrays describes configuration i: the visible devices and the
32 | # domain size ("nx ny") to run on them.
33 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0,1" "0,1,2,3" "0,1,2,3,4,5,6,7")
34 | # 16384 (= 2 * 8192) replaces the previous 16348, a transposed-digit typo.
35 | DOMAIN_SIZES=(
36 |     "8192 4096"
37 |     "8192 8192"
38 |     "8192 16384"
39 |     "16384 16384"
40 | )
41 | NUM_ITERS=100000
42 | 
43 | # runp VERSION NX NY NITER NUM_GPUS
44 | # Runs the binary NUM_RUNS times and appends the minimum reported time to OUT_CSV.
45 | function runp() {
46 |     local cmd="$BIN -v $1 -nx $2 -ny $3 -niter $4"
47 |     local min_execution_time=9223372036854775807
48 |     local execution_time i
49 | 
50 |     for ((i = 0; i < NUM_RUNS; i += 1)); do
51 |         # The dot is escaped: unescaped it matched any character between digits.
52 |         execution_time=$($cmd | grep -o -E "[0-9]+\.?[0-9]+")
53 | 
54 |         # Skip runs whose output does not contain exactly one number instead of
55 |         # feeding garbage into the comparison below.
56 |         if ! [[ $execution_time =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
57 |             echo "warning: no single execution time in the output of: $cmd" >&2
58 |             continue
59 |         fi
60 | 
61 |         # awk does the floating point comparison (bash arithmetic is integer only).
62 |         min_execution_time=$(awk -v a="$execution_time" -v b="$min_execution_time" \
63 |             'BEGIN { if (a + 0 < b + 0) print a; else print b }')
64 |     done
65 | 
66 |     echo "$1,$2,$3,$4,$5,$min_execution_time" >> "$OUT_CSV"
67 | }
68 | 
69 | for cfg in "${!CUDA_VISIBLE_DEVICES_SETTING[@]}"; do
70 |     # Report the real number of visible devices in the CSV, not the
71 |     # configuration index (which labelled the 4- and 8-GPU runs as 3 and 4).
72 |     IFS=',' read -r -a devices <<< "${CUDA_VISIBLE_DEVICES_SETTING[$cfg]}"
73 |     NUM_GPUS=${#devices[@]}
74 |     if ((NUM_GPUS > MAX_NUM_GPUS)); then
75 |         break
76 |     fi
77 | 
78 |     export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[$cfg]}
79 | 
80 |     read -r nx ny <<< "${DOMAIN_SIZES[$cfg]}"
81 | 
82 |     # Our version
83 |     runp "$V_OURS" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS"
84 | 
85 |     # Baseline
86 |     runp "$V_BASELINE" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS"
87 | done
88 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/bench.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --job-name=stencil-bench
 4 | #SBATCH --ntasks=8
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --partition hgx2q
 7 | #SBATCH --time=01:00:00
 8 | #SBATCH --output=sbatch_output_%j.log
 9 | 
10 | # Runs version V_OURS and version V_BASELINE of ./jacobi for every GPU
11 | # configuration listed below and writes one CSV row per (version, configuration)
12 | # holding the smallest execution time seen in NUM_RUNS runs.
13 | # NOTE(review): only -nx and -ny are passed, exactly as in the 2D benchmark;
14 | # confirm whether the 3D binary also needs a z extent.
15 | 
16 | . ./scripts/modules.sh > /dev/null
17 | 
18 | BIN="./jacobi -s 1"
19 | NUM_RUNS=5
20 | V_OURS=1
21 | V_BASELINE=6
22 | 
23 | #OUT_CSV="./results.csv"
24 | OUT_CSV="/dev/stdout"
25 | echo "version,nx,ny,niter,num_gpus,execution_time" >> "$OUT_CSV"
26 | 
27 | # Largest number of GPUs to benchmark; configurations that need more are skipped.
28 | # The former value 4 was used as an index into the arrays below and therefore
29 | # already selected the 8-GPU configuration; 8 keeps the same set of runs.
30 | #MAX_NUM_GPUS=$(nvidia-smi --list-gpus | wc -l)
31 | MAX_NUM_GPUS=8
32 | 
33 | # Entry i of both arrays describes configuration i: the visible devices and the
34 | # domain size ("nx ny") to run on them.
35 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0,1" "0,1,2,3" "0,1,2,3,4,5,6,7")
36 | # 16384 (= 2 * 8192) replaces the previous 16348, a transposed-digit typo.
37 | DOMAIN_SIZES=(
38 |     "8192 4096"
39 |     "8192 8192"
40 |     "8192 16384"
41 |     "16384 16384"
42 | )
43 | NUM_ITERS=100000
44 | 
45 | # runp VERSION NX NY NITER NUM_GPUS
46 | # Runs the binary NUM_RUNS times and appends the minimum reported time to OUT_CSV.
47 | function runp() {
48 |     local cmd="$BIN -v $1 -nx $2 -ny $3 -niter $4"
49 |     local min_execution_time=9223372036854775807
50 |     local execution_time i
51 | 
52 |     for ((i = 0; i < NUM_RUNS; i += 1)); do
53 |         # The dot is escaped: unescaped it matched any character between digits.
54 |         execution_time=$($cmd | grep -o -E "[0-9]+\.?[0-9]+")
55 | 
56 |         # Skip runs whose output does not contain exactly one number instead of
57 |         # feeding garbage into the comparison below.
58 |         if ! [[ $execution_time =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
59 |             echo "warning: no single execution time in the output of: $cmd" >&2
60 |             continue
61 |         fi
62 | 
63 |         # awk does the floating point comparison (bash arithmetic is integer only).
64 |         min_execution_time=$(awk -v a="$execution_time" -v b="$min_execution_time" \
65 |             'BEGIN { if (a + 0 < b + 0) print a; else print b }')
66 |     done
67 | 
68 |     echo "$1,$2,$3,$4,$5,$min_execution_time" >> "$OUT_CSV"
69 | }
70 | 
71 | for cfg in "${!CUDA_VISIBLE_DEVICES_SETTING[@]}"; do
72 |     # Report the real number of visible devices in the CSV, not the
73 |     # configuration index (which labelled the 4- and 8-GPU runs as 3 and 4).
74 |     IFS=',' read -r -a devices <<< "${CUDA_VISIBLE_DEVICES_SETTING[$cfg]}"
75 |     NUM_GPUS=${#devices[@]}
76 |     if ((NUM_GPUS > MAX_NUM_GPUS)); then
77 |         break
78 |     fi
79 | 
80 |     export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[$cfg]}
81 | 
82 |     read -r nx ny <<< "${DOMAIN_SIZES[$cfg]}"
83 | 
84 |     # Our version
85 |     runp "$V_OURS" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS"
86 | 
87 |     # Baseline
88 |     runp "$V_BASELINE" "$nx" "$ny" "$NUM_ITERS" "$NUM_GPUS"
89 | done
90 | 


--------------------------------------------------------------------------------
/Plots/scaling-bar.py:
--------------------------------------------------------------------------------
 1 | from itertools import cycle
 2 | 
 3 | import matplotlib.pyplot as plt
 4 | import pandas as pd
 5 | 
 6 | import matplotlib as mpl
 7 | mpl.rcParams['hatch.linewidth'] = 0.3
 8 | 
 9 | import common
10 | from common import get_files, markers, get_module_dir, wrap_labels
11 | 
12 | plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')
13 | 
14 | MODULE_DIR = get_module_dir('Bar Scaling')
15 | 
16 | files = get_files()
17 | 
18 | for file in files:
19 |     data = pd.read_csv(file, index_col='Version')
20 |     data = data.sort_index()
21 | 
22 |     data = data.T
23 | 
24 |     colors = [
25 |         '#c6c9cb', '#64b8e5', '#ee7fb2'
26 |     ]
27 | 
28 |     axes = data.plot.bar(colormap='Paired', color=colors, edgecolor='black')
29 | 
30 |     bars = axes.patches
31 |     patterns = ('///', '\\\\\\', 'xxx')
32 |     hatches = [p for p in patterns for i in range(len(data))]
33 |     for bar, hatch in zip(bars, hatches):
34 |         bar.set_hatch(hatch)
35 | 
36 |     # for line in axes.get_lines():
37 |     #     line.set_hatch(next(markers_cycle))
38 |     #     line.set_linewidth(2)
39 |     #     line.set(alpha=0.5)
40 |     #     line.set(color=next(colors))
41 |     #
42 |     #     # If our versions
43 |     #     if line.get_label().lower().startswith('baseline'):
44 |     #         # line.set(alpha=0.5)
45 |     #         line.set_linestyle('dashed')
46 |     #
47 |     # # axes.legend(axes.get_lines(), data.columns, loc='best')
48 |     # wrap_labels(axes, 10)
49 |     #
50 |     # # plt.xticks(fontsize=15)
51 |     # plt.title(title)
52 |     # plt.savefig(MODULE_DIR / title)
53 | 
54 |     # plt.grid(axis='x')
55 | 
56 |     plt.show()
57 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src/PERKS/common/types.hpp:
--------------------------------------------------------------------------------
 1 | 
 2 | #ifndef PERKS_TYPES
 3 | #define PERKS_TYPES
 4 | // Each helper expands to "template _macro(float, ...); template _macro(double, ...)".
 5 | #define PERKS_INITIALIZE_ALL_TYPE(_macro) \
 6 |     template _macro(float);               \
 7 |     template _macro(double)
 8 | // The _NARG variants forward N extra arguments to _macro unchanged.
 9 | #define PERKS_INITIALIZE_ALL_TYPE_1ARG(_macro, halo) \
10 |     template _macro(float, halo);                    \
11 |     template _macro(double, halo)
12 | // No expansion ends with ';' - the use site supplies the final one.
13 | #define PERKS_INITIALIZE_ALL_TYPE_2ARG(_macro, a, b) \
14 |     template _macro(float, a, b);                    \
15 |     template _macro(double, a, b)
16 | 
17 | #define PERKS_INITIALIZE_ALL_TYPE_3ARG(_macro, a, b, c) \
18 |     template _macro(float, a, b, c);                    \
19 |     template _macro(double, a, b, c)
20 | 
21 | #define PERKS_INITIALIZE_ALL_TYPE_4ARG(_macro, a, b, c, d) \
22 |     template _macro(float, a, b, c, d);                    \
23 |     template _macro(double, a, b, c, d)
24 | 
25 | #define PERKS_INITIALIZE_ALL_TYPE_5ARG(_macro, a, b, c, d, e) \
26 |     template _macro(float, a, b, c, d, e);                    \
27 |     template _macro(double, a, b, c, d, e)
28 | 
29 | #define PERKS_INITIALIZE_ALL_TYPE_6ARG(_macro, a, b, c, d, e, f) \
30 |     template _macro(float, a, b, c, d, e, f);                    \
31 |     template _macro(double, a, b, c, d, e, f)
32 | 
33 | #define PERKS_INITIALIZE_ALL_TYPE_7ARG(_macro, a, b, c, d, e, f, g) \
34 |     template _macro(float, a, b, c, d, e, f, g);                    \
35 |     template _macro(double, a, b, c, d, e, f, g)
36 | 
37 | #define PERKS_INITIALIZE_ALL_TYPE_8ARG(_macro, a, b, c, d, e, f, g, h) \
38 |     template _macro(float, a, b, c, d, e, f, g, h);                    \
39 |     template _macro(double, a, b, c, d, e, f, g, h)
40 | #endif
41 | 


--------------------------------------------------------------------------------
/CG/results/cg_operation_breakdown_8A100_discrete_pipelined.csv:
--------------------------------------------------------------------------------
 1 | Discrete Pipelined Operation Breakdown
 2 | Matrix,Global Reductions (+Barrier),Memcpy Dots To Host,Merged Dots (+Reset),NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,Saxpy 4,Saxpy 5,Saxpy 6,SpMV
 3 | tridiagonal,0.1818,0.0681,0.2480,0.1142,0.1210,0.1980,0.2435,0.2461,0.2417,0.2439,0.2443,0.5201
 4 | ecology2,0.1751,0.0658,0.0788,0.1014,0.1104,0.0590,0.0588,0.0589,0.0586,0.0584,0.0584,0.0987
 5 | hood,0.1740,0.0659,0.0724,0.1124,0.1051,0.0558,0.0558,0.0558,0.0553,0.0550,0.0552,0.4796
 6 | bmwcra_1,0.1703,0.0669,0.0718,0.2503,0.1036,0.0552,0.0550,0.0549,0.0547,0.0544,0.0544,0.9414
 7 | consph,0.1735,0.0667,0.0722,0.1561,0.1060,0.0556,0.0553,0.0554,0.0551,0.0549,0.0548,0.4441
 8 | thermomech_dM,0.1741,0.0665,0.0726,0.0938,0.1028,0.0562,0.0561,0.0560,0.0557,0.0555,0.0556,0.2408
 9 | tmt_sym,0.1746,0.0667,0.0764,0.0988,0.1040,0.0584,0.0582,0.0583,0.0577,0.0576,0.0578,0.1144
10 | crankseg_1,0.1733,0.0690,0.0765,7.5716,0.1040,0.0555,0.0550,0.0552,0.0548,0.0544,0.0546,1.7455
11 | crankseg_2,0.1758,0.0682,0.0777,10.8521,0.1061,0.0553,0.0550,0.0549,0.0546,0.0544,0.0545,1.8073
12 | Queen_4147,0.1791,0.0697,0.1071,0.2459,0.1102,0.0801,0.0792,0.0817,0.0730,0.0685,0.0717,15.1174
13 | Bump_2911,0.1799,0.0692,0.0949,0.1052,0.1058,0.0735,0.0728,0.0742,0.0676,0.0646,0.0646,4.9992
14 | G3_circuit,0.1746,0.0663,0.0823,0.0891,0.1030,0.0607,0.0605,0.0610,0.0604,0.0601,0.0601,0.2353
15 | StocF-1465,0.1827,0.0681,0.0828,0.0944,0.1040,0.0626,0.0625,0.0665,0.0621,0.0600,0.0596,0.6804
16 | Flan_1565,0.1787,0.0685,0.0846,0.3012,0.1051,0.0644,0.0653,0.0662,0.0623,0.0597,0.0598,4.4129
17 | audikw_1,0.1788,0.0699,0.0811,2.9359,0.1025,0.0606,0.0601,0.0620,0.0598,0.0578,0.0579,1.9612
18 | Serena,0.1805,0.0688,0.0828,0.3760,0.1068,0.0633,0.0625,0.0649,0.0609,0.0586,0.0588,2.5207
19 | Geo_1438,0.1797,0.0693,0.0825,0.1363,0.1035,0.0641,0.0633,0.0657,0.0613,0.0592,0.0590,2.3228
20 | Hook_1498,0.1791,0.0675,0.0827,0.1411,0.1048,0.0639,0.0634,0.0663,0.0615,0.0592,0.0593,1.8353
21 | ldoor,0.1811,0.0679,0.0798,0.1083,0.1043,0.0602,0.0608,0.0621,0.0597,0.0578,0.0578,0.9961


--------------------------------------------------------------------------------
/CG/scripts/plots/plot_operation_breakdown.py:
--------------------------------------------------------------------------------
 1 | 
 2 | from itertools import cycle
 3 | from os.path import dirname, realpath
 4 | 
 5 | from common import get_files, markers, get_module_dir, wrap_labels
 6 | 
 7 | import matplotlib.pyplot as plt
 8 | import pandas as pd
 9 | import numpy as np
10 | 
11 | MATRIX_NAMES = [
12 |     'tridiagonal',
13 |     'ecology2',
14 |     #   'shallow_water2', Too few non-zeros
15 |     #   'Trefethen_2000', Too few non-zeros
16 |     'hood',
17 |     'bmwcra_1',
18 |     'consph',
19 |     'thermomech_dM',
20 |     'tmt_sym',
21 |     'crankseg_1',
22 |     'crankseg_2',
23 |     'Queen_4147',
24 |     'Bump_2911',
25 |     'G3_circuit',
26 |     'StocF-1465',
27 |     'Flan_1565',
28 |     'audikw_1',
29 |     'Serena',
30 |     'Geo_1438',
31 |     'Hook_1498',
32 |     #   'bone010', Multi-part matrix, don't handle those for now
33 |     'ldoor'
34 | ]
35 | 
36 | MODULE_DIR = get_module_dir('Operation Breakdown')
37 | 
38 | dir_path = dirname(realpath(__file__))
39 | 
40 | # plt.style.use(dir_path + '/default.mplstyle')
41 | plt.style.use('fivethirtyeight')
42 | 
43 | files = get_files()
44 | 
45 | for file in files:
46 |     title = file.readline().strip()
47 | 
48 |     operation_breakdowns = pd.read_csv(file, index_col='Matrix')
49 |     operation_breakdowns = operation_breakdowns.T
50 |     operation_breakdowns = operation_breakdowns[MATRIX_NAMES]
51 |     operation_breakdowns = operation_breakdowns.T
52 | 
53 |     # Is this necessary?
54 |     # operation_breakdowns = operation_breakdowns.sort_index()
55 | 
56 |     # Get percentages of operations
57 |     per_operation_percentages = operation_breakdowns.div(
58 |         operation_breakdowns.sum(axis=1), axis=0) * 100
59 | 
60 |     ax = per_operation_percentages.plot.barh(stacked=True)
61 |     ax.invert_yaxis()
62 |     ax.xaxis.set_visible(False)
63 |     ax.set_xlim(0, np.sum(per_operation_percentages, axis=1).max())
64 | 
65 |     for container in ax.containers:
66 |         ax.bar_label(container)
67 | 
68 |     wrap_labels(ax, 10, break_long_words=True)
69 | 
70 |     plt.title(title)
71 |     plt.savefig(MODULE_DIR / title)
72 | 
73 |     plt.show()
74 | 


--------------------------------------------------------------------------------
/CG/results/cg_operation_breakdown_8A100_discrete_standard.csv:
--------------------------------------------------------------------------------
 1 | Discrete Standard Operation Breakdown
 2 | Matrix,Dot 1 (+Reset),Dot 2 (+Reset),Global Reduction 1 (+Barrier),Global Reduction 2 (+Barrier),Memcpy Dot to Host 1,Memcpy Dot to Host 2,NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,SpMV
 3 | tridiagonal,0.2094,0.1695,0.2585,0.2599,0.0680,0.0673,0.1108,0.1102,0.1827,0.2416,0.1814,0.5409
 4 | ecology2,0.0746,0.0736,0.2211,0.2304,0.0657,0.0653,0.0952,0.0971,0.0593,0.0588,0.0605,0.1004
 5 | hood,0.0705,0.0700,0.2160,0.2223,0.0652,0.0642,0.1086,0.0973,0.0553,0.0549,0.0559,0.4870
 6 | bmwcra_1,0.0700,0.0696,0.2146,0.2280,0.0650,0.0640,0.2559,0.0973,0.0549,0.0546,0.0557,0.9622
 7 | consph,0.0698,0.0694,0.2164,0.2261,0.0653,0.0648,0.2090,0.0971,0.0545,0.0544,0.0554,0.3745
 8 | thermomech_dM,0.0701,0.0699,0.2182,0.2286,0.0649,0.0645,0.1132,0.0987,0.0553,0.0550,0.0561,0.2324
 9 | tmt_sym,0.0735,0.0731,0.2180,0.2266,0.0662,0.0657,0.0945,0.0967,0.0580,0.0577,0.0590,0.1137
10 | crankseg_1,0.0702,0.0687,0.2249,0.2288,0.0669,0.0650,7.6064,0.0960,0.0543,0.0540,0.0554,1.6466
11 | crankseg_2,0.0717,0.0699,0.2342,0.2338,0.0684,0.0658,10.8982,0.0978,0.0547,0.0544,0.0556,1.8452
12 | Queen_4147,0.1025,0.0889,0.2284,0.2247,0.0685,0.0650,0.2232,0.0959,0.0722,0.0720,0.0681,15.1167
13 | Bump_2911,0.0957,0.0850,0.2273,0.2234,0.0686,0.0654,0.3731,0.0966,0.0672,0.0676,0.0653,4.7050
14 | G3_circuit,0.0763,0.0751,0.2168,0.2256,0.0654,0.0648,0.1236,0.0975,0.0600,0.0596,0.0608,0.2145
15 | StocF-1465,0.0835,0.0780,0.2241,0.2272,0.0659,0.0638,0.1718,0.0955,0.0616,0.0609,0.0605,0.5684
16 | Flan_1565,0.0857,0.0781,0.2274,0.2240,0.0676,0.0644,0.1596,0.0962,0.0621,0.0622,0.0606,4.5419
17 | audikw_1,0.0834,0.0769,0.2250,0.2208,0.0675,0.0643,0.0936,0.0959,0.0599,0.0595,0.0586,4.8515
18 | Serena,0.0836,0.0779,0.2228,0.2217,0.0669,0.0640,0.2073,0.0955,0.0613,0.0615,0.0600,2.6818
19 | Geo_1438,0.0842,0.0781,0.2225,0.2195,0.0674,0.0648,0.1707,0.0960,0.0620,0.0623,0.0605,2.2589
20 | Hook_1498,0.0831,0.0775,0.2219,0.2201,0.0665,0.0640,0.1343,0.0964,0.0617,0.0623,0.0606,1.8148
21 | ldoor,0.0808,0.0755,0.2255,0.2252,0.0667,0.0644,0.2791,0.0954,0.0593,0.0595,0.0585,0.8216


--------------------------------------------------------------------------------
/common.mk:
--------------------------------------------------------------------------------
 1 | ifeq ($(_COMMON_),)
 2 | _COMMON_ := defined
 3 | 
 4 | NVCC ?= nvcc
 5 | MPIRUN ?= mpirun
 6 | MPICCX ?= mpic++
 7 | CXX ?= g++
 8 | 
 9 | BUILD_ROOT := bin
10 | OBJ_ROOT := $(BUILD_ROOT)/obj
11 | 
12 | ifndef NVSHMEM_HOME
13 | $(warning NVSHMEM_HOME is not set)
14 | endif
15 | ifndef MPI_HOME
16 | $(warning MPI_HOME is not set)
17 | endif
18 | ifndef UCX_HOME
19 | $(warning UCX_HOME is not set)
20 | endif
21 | 
22 | MAKEFLAGS += -j
23 | 
24 | # Can't compile CUDA with -Wpedantic
25 | WARN_FLAGS = "-Wall -Wno-comment -Werror -Wextra"
26 | 
27 | rwildcard=$(foreach d,$(wildcard $(1:=/*)),$(call rwildcard,$d,$2) $(filter $(subst *,%,$2),$d))
28 | 
29 | GENCODE_SM70    := -gencode 'arch=compute_70,code=sm_70'
30 | GENCODE_SM80    := -gencode 'arch=compute_80,code=sm_80' -gencode 'arch=compute_80,code=compute_80'
31 | GENCODE_FLAGS	:= $(GENCODE_SM70) $(GENCODE_SM80)
32 | 
33 | DEP_FLAGS = -MT $@ -MMD -MP -MF
34 | 
35 | NVCC_FLAGS_GENERIC = -O2 -dc -Xcompiler $(WARN_FLAGS) -Xcompiler -fopenmp $(GENCODE_FLAGS) -std=c++17
36 | 
37 | # Regular
38 | NVCC_FLAGS = $(NVCC_FLAGS_GENERIC) -ccbin=$(CXX) -I./include
39 | NVCC_LDFLAGS = -ccbin=$(CXX) -lgomp -L$(CUDA_HOME)/lib64 -lcuda -lcudart
40 | 
41 | # NVSHMEM
42 | NVCC_NV_FLAGS = $(NVCC_FLAGS_GENERIC) -ccbin=$(MPICCX) -isystem $(NVSHMEM_HOME)/include -isystem $(MPI_HOME)/include -I./include_nvshmem
43 | NVCC_NV_LDFLAGS = -ccbin=$(MPICCX) -lgomp -L$(CUDA_HOME)/lib64 -lcuda -lcudart -lnvidia-ml -L$(NVSHMEM_HOME)/lib -lnvshmem -L$(MPI_HOME)/lib -lmpi -L$(UCX_HOME)/lib -lucp -lucs -luct -lucm -lmlx5
44 | 
45 | # Example
46 | #$(OBJS_2D) : $(OBJ_DIR_2D)/%.o : $(SRC_DIR_2D)/%.cu $(DEP_DIR_2D)/%.d | $(DEP_DIR_2D)
47 | #	$(call COMPILE, $(DEP_DIR_2D))
48 | 
49 | define LINK =
50 | 	@mkdir -p $(BUILD_ROOT)
51 | 	$(NVCC) $(GENCODE_FLAGS) -o $(BUILD_ROOT)/$@ $^ $(NVCC_LDFLAGS)
52 | endef
53 | 
54 | define COMPILE =
55 | 	@mkdir -p "$(dir $(1)/$*)"
56 | 	@mkdir -p $(@D)
57 | 	$(NVCC) $(NVCC_FLAGS) $(DEP_FLAGS) $(1)/$*.d -o $@ $<
58 | endef
59 | 
60 | define LINK_NVSHMEM =
61 | 	@mkdir -p $(BUILD_ROOT)
62 | 	$(NVCC) $(GENCODE_FLAGS) -o $(BUILD_ROOT)/$@ $^ $(NVCC_NV_LDFLAGS)
63 | endef
64 | 
65 | define COMPILE_NVSHMEM =
66 | 	@mkdir -p "$(dir $(1)/$*)"
67 | 	@mkdir -p $(@D)
68 | 	$(NVCC) $(NVCC_NV_FLAGS) $(DEP_FLAGS) $(1)/$*.d -o $@ $<
69 | endef
70 | 
71 | # Remove all build output. Declared phony so that a file named `clean`
72 | # can never make this target look up to date.
73 | .PHONY: clean
74 | clean:
75 | 	$(RM) -rd $(BUILD_ROOT)
76 | 
77 | endif
78 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/constant_num_gpus_bench.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --job-name=stencil-bench
 4 | #SBATCH --ntasks=8
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --time=03:00:00
 7 | 
 8 | . ./scripts/modules.sh > /dev/null
 9 | 
10 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
11 | 
12 | declare -A version_name_to_idx_map
13 | 
14 | version_name_to_idx_map["Single Stream 1TB"]=0
15 | version_name_to_idx_map["Single Stream 2TB"]=1
16 | 
17 | version_name_to_idx_map["Baseline Copy"]=3
18 | version_name_to_idx_map["Baseline Copy Overlap"]=4
19 | version_name_to_idx_map["Baseline P2P"]=5
20 | 
21 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=9
22 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=10
23 | version_name_to_idx_map["Baseline Copy (No compute)"]=12
24 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=13
25 | version_name_to_idx_map["Baseline P2P (No Compute)"]=14
26 | 
27 | BIN="./jacobi -s 1"
28 | 
29 | 
30 | STARTING_NX=${STARTING_NX:-1024}
31 | STARTING_NY=${STARTING_NY:-1024}
32 | NUM_ITER=${NUM_ITER:-1000000}
33 | NUM_RUNS=${NUM_RUNS:-5}
34 | 
35 | NUM_GPUS=${NUM_GPUS:-4}
36 | MAX_DOMAIN_SIZE=${MAX_DOMAIN_SIZE:-16384}
37 | 
38 | while [ $# -gt 0 ]; do
39 | 
40 |    if [[ $1 == *"--"* ]]; then
41 |         param="${1/--/}"
42 |         declare $param="$2"
43 |    fi
44 | 
45 |   shift
46 | done
47 | 
48 | # Benchmark every registered version on a fixed GPU count, doubling the grid
49 | # edge from STARTING_NX/STARTING_NY until NX reaches MAX_DOMAIN_SIZE.
50 | for version_name in "${!version_name_to_idx_map[@]}"; do
51 |     echo "Running ${version_name}"; echo ""
52 | 
53 |     version_idx=${version_name_to_idx_map[$version_name]}
54 | 
55 |     NX=${STARTING_NX}
56 |     NY=${STARTING_NY}
57 | 
58 |     # Device list for the requested GPU count (looked up by NUM_GPUS)
59 |     export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
60 | 
61 |     while : ; do
62 | 
63 |         echo "Num GPUS: ${NUM_GPUS}"
64 |         echo "${NUM_ITER} iterations on grid ${NY}x${NX}"
65 | 
66 |         for (( i=1; i <= ${NUM_RUNS}; i++ )); do
67 |             execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
68 |             echo "${execution_time} on run ${i}"
69 |         done
70 | 
71 |         printf "\n"
72 | 
73 |         # Stop once the limit is reached *or passed*: with `-ne`, a start size
74 |         # that never lands exactly on MAX_DOMAIN_SIZE kept doubling forever.
75 |         if [[ ${NX} -lt ${MAX_DOMAIN_SIZE} ]]; then
76 |             NX=$((2*NX))
77 |             NY=$((2*NY))
78 |         else
79 |             break
80 |         fi
81 |     done
82 | 
83 |     # Delimiter line that closes this version's section of the output
84 |     echo "-------------------------------------"
85 | done
86 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/constant_num_gpus_bench.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | #SBATCH --job-name=stencil-bench
 4 | #SBATCH --ntasks=8
 5 | #SBATCH --gres=gpu:8
 6 | #SBATCH --time=03:00:00
 7 | 
 8 | . ./scripts/modules.sh > /dev/null
 9 | 
10 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
11 | 
12 | declare -A version_name_to_idx_map
13 | 
14 | version_name_to_idx_map["Single Stream 1TB"]=0
15 | version_name_to_idx_map["Single Stream 2TB"]=1
16 | 
17 | version_name_to_idx_map["Baseline Copy"]=3
18 | version_name_to_idx_map["Baseline Copy Overlap"]=4
19 | version_name_to_idx_map["Baseline P2P"]=5
20 | 
21 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=9
22 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=10
23 | version_name_to_idx_map["Baseline Copy (No compute)"]=12
24 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=13
25 | version_name_to_idx_map["Baseline P2P (No Compute)"]=14
26 | 
27 | BIN="./jacobi -s 1"
28 | 
29 | 
30 | STARTING_NX=${STARTING_NX:-1024}
31 | STARTING_NY=${STARTING_NY:-1024}
32 | NUM_ITER=${NUM_ITER:-1000000}
33 | NUM_RUNS=${NUM_RUNS:-5}
34 | 
35 | NUM_GPUS=${NUM_GPUS:-4}
36 | MAX_DOMAIN_SIZE=${MAX_DOMAIN_SIZE:-16384}
37 | 
38 | while [ $# -gt 0 ]; do
39 | 
40 |    if [[ $1 == *"--"* ]]; then
41 |         param="${1/--/}"
42 |         declare $param="$2"
43 |    fi
44 | 
45 |   shift
46 | done
47 | 
48 | # Benchmark every registered version on a fixed GPU count, doubling the grid
49 | # edge from STARTING_NX/STARTING_NY until NX reaches MAX_DOMAIN_SIZE.
50 | for version_name in "${!version_name_to_idx_map[@]}"; do
51 |     echo "Running ${version_name}"; echo ""
52 | 
53 |     version_idx=${version_name_to_idx_map[$version_name]}
54 | 
55 |     NX=${STARTING_NX}
56 |     NY=${STARTING_NY}
57 | 
58 |     # Device list for the requested GPU count (looked up by NUM_GPUS)
59 |     export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
60 | 
61 |     while : ; do
62 | 
63 |         echo "Num GPUS: ${NUM_GPUS}"
64 |         echo "${NUM_ITER} iterations on grid ${NY}x${NX}"
65 | 
66 |         for (( i=1; i <= ${NUM_RUNS}; i++ )); do
67 |             execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
68 |             echo "${execution_time} on run ${i}"
69 |         done
70 | 
71 |         printf "\n"
72 | 
73 |         # Stop once the limit is reached *or passed*: with `-ne`, a start size
74 |         # that never lands exactly on MAX_DOMAIN_SIZE kept doubling forever.
75 |         if [[ ${NX} -lt ${MAX_DOMAIN_SIZE} ]]; then
76 |             NX=$((2*NX))
77 |             NY=$((2*NY))
78 |         else
79 |             break
80 |         fi
81 |     done
82 | 
83 |     # Delimiter line that closes this version's section of the output
84 |     echo "-------------------------------------"
85 | done
86 | 


--------------------------------------------------------------------------------
/CG/results/cg_operation_breakdown_4A100.txt:
--------------------------------------------------------------------------------
 1 | Results per version; rows are matrices
 2 | 
 3 | Results for version Baseline Discrete Standard NVSHMEM =>
 4 | ,Dot 1 (+Reset),Dot 2 (+Reset),Global Reduction 1 (+Barrier),Global Reduction 2 (+Barrier),Memcpy Dot to Host 1,Memcpy Dot to Host 2,NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,SpMV
 5 | (generated)_tridiagonal,0.3252,0.2800,0.2109,0.2248,0.0662,0.0657,0.1073,0.0979,0.3663,0.4253,0.3655,1.0031
 6 | ecology2,0.0785,0.0810,0.1890,0.1936,0.0651,0.0646,0.0922,0.0920,0.0611,0.0615,0.0626,0.1194
 7 | hood,0.0721,0.0713,0.1947,0.1960,0.0658,0.0638,0.2090,0.0907,0.0576,0.0576,0.0571,0.4670
 8 | bmwcra_1,0.0721,0.0706,0.1842,0.1850,0.0664,0.0647,0.1759,0.0876,0.0571,0.0572,0.0561,1.2385
 9 | consph,0.0702,0.0698,0.1836,0.1916,0.0653,0.0644,0.1791,0.0909,0.0552,0.0550,0.0557,0.3645
10 | thermomech_dM,0.0722,0.0715,0.1842,0.1895,0.0652,0.0643,0.1041,0.0902,0.0563,0.0561,0.0572,0.2593
11 | tmt_sym,0.0765,0.0755,0.1849,0.1934,0.0653,0.0649,0.0957,0.0907,0.0600,0.0598,0.0609,0.1585
12 | crankseg_1,0.0717,0.0699,0.1904,0.1883,0.0675,0.0642,8.8024,0.0881,0.0564,0.0563,0.0555,2.2143
13 | crankseg_2,0.0795,0.0736,0.1930,0.1872,0.0688,0.0650,0.0941,0.0890,0.0573,0.0572,0.0558,14.0743
14 | 
15 | 
16 | Results for version Baseline Discrete Pipelined NVSHMEM (No Overlap) =>
17 | ,Global Reductions (+Barrier),Memcpy Dots To Host,Merged Dots (+Reset),NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,Saxpy 4,Saxpy 5,Saxpy 6,SpMV
18 | (generated)_tridiagonal,0.1507,0.0681,0.3863,0.1048,0.1055,0.3840,0.4209,0.4236,0.4229,0.4240,0.4234,0.9681
19 | ecology2,0.1508,0.0659,0.0866,0.0975,0.1008,0.0649,0.0637,0.0656,0.0627,0.0613,0.0614,0.1272
20 | hood,0.1506,0.0671,0.0759,0.2009,0.0954,0.0628,0.0579,0.0585,0.0581,0.0563,0.0564,0.4757
21 | bmwcra_1,0.1445,0.0665,0.0736,0.4255,0.0932,0.0576,0.0575,0.0575,0.0572,0.0557,0.0556,1.0926
22 | consph,0.1470,0.0656,0.0721,0.1186,0.0936,0.0554,0.0551,0.0550,0.0549,0.0546,0.0547,0.4386
23 | thermomech_dM,0.1458,0.0667,0.0742,0.1101,0.0933,0.0567,0.0567,0.0566,0.0563,0.0561,0.0561,0.2419
24 | tmt_sym,0.1468,0.0663,0.0817,0.0929,0.0925,0.0606,0.0605,0.0610,0.0603,0.0597,0.0602,0.1614
25 | crankseg_1,0.1479,0.0671,0.0768,9.5558,0.0994,0.0556,0.0555,0.0565,0.0560,0.0548,0.0548,1.5266
26 | crankseg_2,0.1510,0.0693,0.0794,11.0112,0.0934,0.0570,0.0565,0.0573,0.0568,0.0548,0.0548,2.9162
27 | 
28 | 
29 | 


--------------------------------------------------------------------------------
/CG/scripts/download_matrices.py:
--------------------------------------------------------------------------------
 1 | from io import BytesIO
 2 | from urllib.request import urlopen
 3 | import tarfile
 4 | 
 5 | from os.path import dirname, realpath, basename
 6 | import os
 7 | import sys
 8 | 
 9 | SAVE_MATRICES_TO_FOLDER = None
10 | 
11 | SUITE_SPARSE_BASE_URL = 'https://suitesparse-collection-website.herokuapp.com/MM'
12 | 
13 | MATRIX_INDICES = [
14 |     'McRae/ecology2',
15 |     'GHS_psdef/hood',
16 |     'GHS_psdef/bmwcra_1',
17 |     'Williams/consph',
18 |     'Botonakis/thermomech_dM',
19 |     'CEMW/tmt_sym',
20 |     'GHS_psdef/crankseg_1',
21 |     'GHS_psdef/crankseg_2',
22 |     'TKK/cbuckle',
23 |     'BenElechi/BenElechi1',
24 |     'MaxPlanck/shallow_water2',
25 |     'JGD_Trefethen/Trefethen_2000',
26 |     'Janna/Queen_4147',
27 |     'Janna/Bump_2911',
28 |     'AMD/G3_circuit',
29 |     'Janna/StocF-1465',
30 |     'Janna/Flan_1565',
31 |     'GHS_psdef/audikw_1',
32 |     'Janna/Serena',
33 |     'Janna/Geo_1438',
34 |     'Janna/Hook_1498',
35 |     #   'Oberwolfach/bone010', Multi-part matrix, don't handle those for now
36 |     'GHS_psdef/ldoor',
37 | ]
38 | 
39 | 
40 | def download_matrices():
41 |     for matrix_index in MATRIX_INDICES:
42 |         matrix_name = matrix_index.split('/')[-1]
43 | 
44 |         mtx_filename = f'{matrix_name}.mtx'
45 |         mtx_filepath = f'{SAVE_MATRICES_TO_FOLDER}/{mtx_filename}'
46 | 
47 |         if os.path.exists(mtx_filepath):
48 |             print(f'Matrix {matrix_name} is already downloaded')
49 |             continue
50 | 
51 |         matrix_url = f'{SUITE_SPARSE_BASE_URL}/{matrix_index}.tar.gz'
52 | 
53 |         with urlopen(matrix_url) as zip_response:
54 |             zip_file = tarfile.open(fileobj=zip_response, mode='r|gz')
55 | 
56 |             zip_file.extractall(SAVE_MATRICES_TO_FOLDER)
57 | 
58 |             tmp_folder_path = f'{SAVE_MATRICES_TO_FOLDER}/{matrix_name}'
59 |             old_matrix_path = f'{tmp_folder_path}/{matrix_name}.mtx'
60 | 
61 |             os.rename(old_matrix_path, mtx_filepath)
62 | 
63 |             os.rmdir(tmp_folder_path)
64 | 
65 |             print(f'Downloaded matrix {matrix_name}')
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     dir_path = dirname(realpath(__file__))
70 | 
71 |     arg_idx = 1
72 | 
73 |     while arg_idx < len(sys.argv):
74 |         if sys.argv[arg_idx] == '--save_matrices_to_folder':
75 |             arg_idx += 1
76 |             arg_val = sys.argv[arg_idx]
77 | 
78 |             SAVE_MATRICES_TO_FOLDER = arg_val
79 | 
80 |         arg_idx += 1
81 | 
82 |     download_matrices()
83 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/PERKS/jacobi-general-wrapper.cu:
--------------------------------------------------------------------------------
 1 | #include "./config.cuh"
 2 | // #include "./genconfig.cuh"
 3 | #include "./jacobi-general-kernel.cuh"
 4 | #include "./perksconfig.cuh"
 5 | 
 6 | // #ifdef SMASYNC
 7 | //   #include <cooperative_groups/memcpy_async.h>
 8 | //   #include <cuda_pipeline.h>
 9 | // #endif
10 | #include "./common/cuda_common.cuh"
11 | 
12 | // Thread-per-block bound passed to __launch_bounds__ below.
13 | #define MAXTHREAD (256)
14 | // #define MINBLOCK (1)
15 | 
16 | // Kernel entry point: forwards every argument unchanged to inner_general and
17 | // picks inner_general's fourth template argument through regfolder<...>::val.
18 | // `minblocks` is used only as the second __launch_bounds__ argument;
19 | // `registeramount` and `isstar` are used only as inputs to regfolder.
20 | template <class REAL, int LOCAL_TILE_Y, int halo, int registeramount, bool UseSMCache, bool isstar,
21 |           int minblocks>
22 | __launch_bounds__(MAXTHREAD, minblocks) __global__
23 |     void kernel_general_wrapper(REAL *input, int width_y, int width_x, int iy_start, int iy_end,
24 |                                 REAL *__var_4__, REAL *l2_cache_o, REAL *l2_cache_i, int iteration,
25 |                                 int max_sm_flder, volatile int *iteration_done) {
26 |     inner_general<
27 |         REAL, LOCAL_TILE_Y, halo,
28 |         regfolder<halo, isstar, registeramount, PERKS_ARCH, UseSMCache, REAL, LOCAL_TILE_Y>::val,
29 |         // 1,
30 |         UseSMCache>(input, width_y, width_x, iy_start, iy_end, __var_4__, l2_cache_o, l2_cache_i,
31 |                     iteration, max_sm_flder, iteration_done);
32 | }
33 | 
34 | // Wrapper instantiations, emitted for float and double each.
35 | // NOTE(review): the four arguments presumably map to (LOCAL_TILE_Y, halo,
36 | // registeramount, UseSMCache) -- confirm against the definition of
37 | // PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, which is not in this file.
38 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 128, true);
39 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 128, false);
40 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 256, true);
41 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 8, HALO, 256, false);
42 | 
43 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 128, true);
44 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 128, false);
45 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 256, true);
46 | PERKS_INITIALIZE_ALL_TYPE_4ARG(PERKS_DECLARE_INITIONIZATION_GENERAL_WRAPPER, 16, HALO, 256, false);
47 | // #if PERKS_ARCH==800
48 | // #elif PERKS_ARCH==700
49 | // #elif PERKS_ARCH==600
50 | // #error "should not be 600"
51 | // #elif PERKS_ARCH==000
52 | // #error "undefined"
53 | // #else
54 | // #error "wrong architecture"
55 | // #endif
56 | // template<>
57 | // __global__ void kernel_general_wrapper<float,RTILE_Y,HALO,256,true>
58 | // ( float * __restrict__ input, int width_y, int width_x,
59 | //   float * __restrict__ __var_4__,
60 | //   float * __restrict__ l2_cache_o,float * __restrict__ l2_cache_i,
61 | //   int iteration,
62 | //   int max_sm_flder);


--------------------------------------------------------------------------------
/CG/scripts/plots/common.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import textwrap
 3 | from pathlib import Path
 4 | from os.path import dirname, realpath
 5 | 
 6 | dir_path = dirname(realpath(__file__))
 7 | 
 8 | BASE_DIR = Path(dir_path + '/../../img')
 9 | BASE_DIR.mkdir(exist_ok=True)
10 | 
11 | ACM_DOCUMENT_WIDTH = 506.295
12 | 
13 | 
14 | def rotate(l, n):
15 |     return l[-n:] + l[:-n]
16 | 
17 | 
18 | def get_files():
19 |     parser = argparse.ArgumentParser()
20 |     parser.add_argument('files', type=argparse.FileType('r'), nargs='+')
21 |     return parser.parse_args().files
22 | 
23 | 
24 | def get_module_dir(dir_name):
25 |     module_dir = BASE_DIR / dir_name
26 |     module_dir.mkdir(exist_ok=True)
27 |     return module_dir
28 | 
29 | 
30 | def wrap_labels(ax, width, break_long_words=False):
31 |     labels = []
32 |     for label in ax.get_xticklabels():
33 |         text = label.get_text()
34 |         labels.append(textwrap.fill(text, width=width,
35 |                                     break_long_words=break_long_words))
36 |     ax.set_xticklabels(labels, rotation=0)
37 | 
38 | 
39 | def set_size(width, fraction=1):
40 |     """Set figure dimensions to avoid scaling in LaTeX.
41 | 
42 |     Parameters
43 |     ----------
44 |     width: float
45 |             Document textwidth or columnwidth in pts
46 |     fraction: float, optional
47 |             Fraction of the width which you wish the figure to occupy
48 | 
49 |     Returns
50 |     -------
51 |     fig_dim: tuple
52 |             Dimensions of figure in inches
53 |     """
54 |     # Width of figure (in pts)
55 |     fig_width_pt = width * fraction
56 | 
57 |     # Convert from pt to inches
58 |     inches_per_pt = 1 / 72.27
59 | 
60 |     # Golden ratio to set aesthetic figure height
61 |     # https://disq.us/p/2940ij3
62 |     golden_ratio = (5**.5 - 1) / 2
63 | 
64 |     # Figure width in inches
65 |     fig_width_in = fig_width_pt * inches_per_pt
66 |     # Figure height in inches
67 |     fig_height_in = fig_width_in * golden_ratio
68 | 
69 |     fig_dim = (fig_width_in, fig_height_in)
70 | 
71 |     return fig_dim
72 | 
73 | 
74 | markers = [
75 |     '.',  # point
76 |     ',',  # pixel
77 |     # 'o',  # circle
78 |     # 'v',  # triangle down
79 |     '^',  # triangle up
80 |     '<',  # triangle_left
81 |     '>',  # triangle_right
82 |     '1',  # tri_down
83 |     '2',  # tri_up
84 |     '3',  # tri_left
85 |     '4',  # tri_right
86 |     '8',  # octagon
87 |     's',  # square
88 |     'p',  # pentagon
89 |     '*',  # star
90 |     'h',  # hexagon1
91 |     'H',  # hexagon2
92 |     '+',  # plus
93 |     'x',  # x
94 |     'D',  # diamond
95 |     'd',  # thin_diamond
96 |     '|',  # vline
97 | ]
98 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/plot.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import sys
 3 | from collections import defaultdict
 4 | 
 5 | import pandas as pd
 6 | 
 7 | DELIMITER = '-------------------------------------'
 8 | 
 9 | results_path = sys.argv[1]
10 | 
11 | 
12 | def print_data_tabular(version_to_result_map, column_labels):
13 |     row_labels = version_to_result_map.keys()
14 |     full_perf_data = version_to_result_map.values()
15 |     transposed_perf_data = list(zip(*full_perf_data))
16 | 
17 |     df = pd.DataFrame(full_perf_data, columns=column_labels,
18 |                       index=row_labels)
19 | 
20 |     df.to_csv(sys.stdout)
21 | 
22 | 
23 | with open(results_path) as file:
24 |     results = file.read()
25 | 
26 | results_per_version = results.split(DELIMITER)[:-1]
27 | results_per_version = [result.strip() for result in results_per_version]
28 | 
29 | version_to_result_map = defaultdict(list)
30 | num_gpus_grid_size_label = []
31 | 
32 | for version_result in results_per_version:
33 |     chunks = version_result.split('\n\n')
34 |     version_name = ' '.join(chunks[0].split()[1:])
35 |     x_axis_label = []
36 | 
37 |     for data_chunk in chunks[1:]:
38 |         chunk_lines = data_chunk.splitlines()
39 |         num_gpus = int(re.match("Num GPUS: (?P<num_gpus>\d+)",
40 |                                 chunk_lines[0]).group('num_gpus'))
41 | 
42 |         run_parameters_match = re.match(
43 |             "(?P<num_iter>\d+) iterations on grid (?P<nx>\d+)x(?P<ny>\d+)", chunk_lines[1])
44 | 
45 |         num_iterations = int(run_parameters_match.group('num_iter'))
46 |         grid_nx = int(run_parameters_match.group('nx'))
47 |         grid_ny = int(run_parameters_match.group('ny'))
48 | 
49 |         if not num_gpus_grid_size_label:
50 |             label = f"{num_gpus} GPU" + \
51 |                 ("s" if num_gpus > 1 else "") + f" ({grid_nx}x{grid_ny})"
52 |             x_axis_label.append(label)
53 | 
54 |         perf_data_pattern = re.compile(
55 |             "Execution time:\s+(?P<exec_time>[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?) s on run (?P<run_num>\d+)")
56 | 
57 |         execution_times = []
58 |         run_idx = 2
59 | 
60 |         while run_idx < len(chunk_lines) and (perf_data_match := perf_data_pattern.match(chunk_lines[run_idx])):
61 |             exec_time = float(perf_data_match.group('exec_time'))
62 |             execution_times.append(exec_time)
63 | 
64 |             run_idx += 1
65 | 
66 |         min_execution_time = min(execution_times)
67 |         version_to_result_map[version_name].append(min_execution_time)
68 | 
69 |     if not num_gpus_grid_size_label:
70 |         num_gpus_grid_size_label[:] = x_axis_label
71 | 
72 | print_data_tabular(version_to_result_map, num_gpus_grid_size_label)
73 | 


--------------------------------------------------------------------------------
/Plots/comp-vs-comm.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import pandas as pd
 3 | import numpy as np
 4 | from itertools import cycle
 5 | 
 6 | from common import get_module_dir, wrap_labels
 7 | 
 8 | MODULE_DIR = get_module_dir('Comp vs Comm')
 9 | 
10 | plt.style.use('./paper.mplstyle')
11 | 
12 | data_comp = pd.read_csv('data/comp.csv', index_col='Version')
13 | data_no_comp = pd.read_csv('data/no-comp.csv', index_col='Version')
14 | 
15 | # Make sure both have the same version names
16 | data_no_comp.index = data_comp.index.copy()
17 | 
18 | 
19 | def plot_one_gpu(comp, no_comp, title):
20 |     """Plot one comp/no-comp column pair as two bar charts: 'Comp %'/'Comm %' (left) and 'Comp sec.'/'Comm sec.' (right).
21 |     The figure gets `title` as its suptitle, is saved to MODULE_DIR / title and is then shown."""
22 |     no_comp_left = (no_comp / comp) * 100  # no_comp expressed as a percentage of comp
23 |     # Remainder up to 100%
24 |     comp_left = 100 - no_comp_left
25 | 
26 |     # no_comp_left += comp_left
27 | 
28 |     # Right-hand chart uses the raw values; comp is used in full (the subtraction is commented out)
29 |     comp_right = comp #- no_comp
30 | 
31 |     fig, axes = plt.subplots(1, 2)
32 |     fig.set_size_inches(10, 6)
33 | 
34 |     data_left = pd.DataFrame({'Comp %': comp_left, 'Comm %': no_comp_left, 'idx_col': comp_left.index})
35 |     data_right = pd.DataFrame(
36 |         {'Comp sec.': comp_right, 'Comm sec.': no_comp, 'idx_col': comp_right.index}
37 |     )
38 | 
39 |     indices = np.arange(3)  # NOTE(review): only referenced by the commented-out code below
40 | 
41 |     colors = cycle(plt.rcParams['axes.prop_cycle'].by_key()['color'][:2])  # first two colours of the active style
42 |     widths = cycle([0.8, 0.6])  # both cycles have period 2, so each chart gets the same (colour, width) pairs
43 | 
44 |     # data_left.iloc[:, 1].plot.bar(indices, color='r', width=0.8, stacked=True)
45 |     # comp_left.plot.bar(indices, color='b', width=0.6, stacked=True)
46 |     # plt.show()
47 | 
48 |     for ax, data in zip(axes, [data_left, data_right]):
49 | 
50 |         for i in range(2):  # columns 0 and 1 only; 'idx_col' is never plotted
51 |             data.iloc[:, i].plot.bar(ax=ax, color=next(colors), width=next(widths))
52 | 
53 |         # data.plot.bar(ax=ax, width=0.6, stacked=True)
54 | 
55 |         # Hatching: ax.patches is expected to hold the bars of column 0 followed by those of column 1
56 |         bars = ax.patches
57 | 
58 |         bars[0].set_width(0.8)  # NOTE(review): column 0 is already drawn with width 0.8, so these
59 |         bars[1].set_width(0.8)  # two calls look redundant - confirm before removing
60 | 
61 |         hatches = ''.join(h * len(data_left) for h in [' ', '/'])  # one character per bar: blanks, then slashes
62 | 
63 |         for bar, hatch in zip(bars, hatches):
64 |             bar.set_hatch(hatch)
65 | 
66 |         # Cosmetics: no x-axis label, wrap_labels from common (presumably wraps tick labels), legend above the axes
67 |         ax.set(xlabel=None)
68 |         wrap_labels(ax, 10)
69 | 
70 |         ax.legend(loc='upper center',
71 |                   bbox_to_anchor=(0.5,  # horizontal
72 |                                   1.09),  # vertical
73 |                   ncol=3, fancybox=True)
74 | 
75 |     plt.xticks(rotation=0, ha='center')  # pyplot call: acts on the current axes only
76 |     fig.suptitle(title)
77 |     plt.savefig(MODULE_DIR / title)  # no extension given, so the output format is matplotlib's default
78 |     plt.show()
79 | 
80 | 
81 | for (title, comp), (_, no_comp) in zip(data_comp.iteritems(), data_no_comp.iteritems()):
82 |     plot_one_gpu(comp=comp, no_comp=no_comp, title=title)
83 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/plot.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import sys
 3 | from collections import defaultdict
 4 | 
 5 | import pandas as pd
 6 | 
 7 | DELIMITER = '-------------------------------------'
 8 | 
 9 | results_path = sys.argv[1]
10 | 
11 | 
12 | def print_data_tabular(version_to_result_map, column_labels):  # writes the results to stdout as CSV
13 |     row_labels = version_to_result_map.keys()  # one row per key of the map
14 |     full_perf_data = version_to_result_map.values()  # each value supplies the cells of one row
15 | 
16 |     df = pd.DataFrame(full_perf_data, columns=column_labels,
17 |                       index=row_labels)
18 | 
19 |     df.to_csv(sys.stdout)
20 | 
21 | 
22 | with open(results_path) as file:
23 |     results = file.read()
24 | 
25 | results_per_version = results.split(DELIMITER)[:-1]
26 | results_per_version = [result.strip() for result in results_per_version]
27 | 
28 | version_to_result_map = defaultdict(list)
29 | num_gpus_grid_size_label = []
30 | 
31 | for version_result in results_per_version:
32 |     chunks = version_result.split('\n\n')
33 |     version_name = ' '.join(chunks[0].split()[1:])
34 |     x_axis_label = []
35 | 
36 |     for data_chunk in chunks[1:]:
37 |         chunk_lines = data_chunk.splitlines()
38 |         num_gpus = int(re.match("Num GPUS: (?P<num_gpus>\d+)",
39 |                                 chunk_lines[0]).group('num_gpus'))
40 | 
41 |         run_parameters_match = re.match(
42 |             "(?P<num_iter>\d+) iterations on grid (?P<nx>\d+)x(?P<ny>\d+)x(?P<nz>\d+)", chunk_lines[1])
43 | 
44 |         num_iterations = int(run_parameters_match.group('num_iter'))
45 |         grid_nx = int(run_parameters_match.group('nx'))
46 |         grid_ny = int(run_parameters_match.group('ny'))
47 |         grid_nz = int(run_parameters_match.group('nz'))
48 | 
49 |         if not num_gpus_grid_size_label:
50 |             label = f"{num_gpus} GPU" + \
51 |                 ("s" if num_gpus > 1 else "") + f" ({grid_nx}x{grid_ny}x{grid_nz})"
52 |             x_axis_label.append(label)
53 | 
54 |         perf_data_pattern = re.compile(
55 |             "Execution time:\s+(?P<exec_time>[-+]?(\d+(\.\d*)?|\.\d+)([eE][-+]?\d+)?) s on run (?P<run_num>\d+)")
56 | 
57 |         execution_times = []
58 |         run_idx = 2
59 | 
60 |         while run_idx < len(chunk_lines) and (perf_data_match := perf_data_pattern.match(chunk_lines[run_idx])):
61 |             exec_time = float(perf_data_match.group('exec_time'))
62 |             execution_times.append(exec_time)
63 | 
64 |             run_idx += 1
65 | 
66 |         min_execution_time = min(execution_times)
67 |         version_to_result_map[version_name].append(min_execution_time)
68 | 
69 |     if not num_gpus_grid_size_label:
70 |         num_gpus_grid_size_label[:] = x_axis_label
71 | 
72 | print_data_tabular(version_to_result_map, num_gpus_grid_size_label)
73 | 


--------------------------------------------------------------------------------
/Plots/weak-scaling.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from itertools import cycle
 3 | from pathlib import Path
 4 | 
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | from common import get_files, markers, get_module_dir, wrap_labels, rotate
 9 | 
10 | MODULE_DIR = get_module_dir('Weak Scaling')
11 | 
12 | plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')
13 | 
14 | plt.rcParams.update({"axes.facecolor": (0.5, 0.5, 0.5, 0.1)})
15 | 
16 | plt.rcParams['text.usetex'] = True
17 | 
18 | 
19 | MICROSECOND = 1000000
20 | 
21 | #NUM_ITERS = [1_000_000, 1_000_000, 10_000]
22 | NUM_ITERS = [1_000_000, 1_000_000, 1_000_000, 10_000]
23 | 
24 | files = get_files()
25 | 
26 | plots = len(files)
27 | fig, axes = plt.subplots(math.ceil(plots / 3), plots if plots < 3 else 3, layout='constrained')
28 | # fig.set_size_inches(15, 3 * math.ceil(plots / 3))
29 | fig.set_size_inches(13, 3 * math.ceil(plots / 3))
30 | # fig.tight_layout()
31 | 
32 | colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
33 | colors = rotate(list(reversed(colors)), 1)
34 | 
35 | colors[1] = colors[-1]
36 | 
37 | for ax, file, num_iter in zip(axes.flatten(), files, NUM_ITERS):
38 |     data = pd.read_csv(file, index_col='Version')
39 | #    data = data.sort_index()
40 |     data = data.T / num_iter * MICROSECOND
41 | 
42 |     ax = data.plot(ax=ax, color=colors)
43 | 
44 |     markers_cycle = cycle(markers)
45 | 
46 |     for line in ax.get_lines():
47 |         line.set_marker(next(markers_cycle))
48 |         line.set_linewidth(1.5)
49 |         # If our versions
50 |         if line.get_label().lower().startswith('baseline'):
51 | #            line.set_linewidth(1.0)
52 |             # line.set(alpha=0.5)
53 |             line.set_linestyle('dashed')
54 | 
55 |     # axes.legend(axes.get_lines(), data.columns, loc='best')
56 |     ax.get_legend().remove()
57 |     wrap_labels(ax, 10)
58 | 
59 |     # plt.xticks(fontsize=15)
60 |     # plt.title(title, fontsize=15)
61 | 
62 | # handles, labels = axes.get_legend_handles_labels()
63 | 
64 | axes.flatten()[0].legend(loc='best', fancybox=True, prop={'weight': 'bold', 'size': 'large'})
65 | axes.flatten()[0].legend(loc='best', fancybox=True)
66 | 
67 | # legend = fig.legend(handles, labels, loc='upper center',
68 | #                     bbox_to_anchor=(0.5,  # horizontal
69 | #                                     1.1),  # vertical
70 | #                     ncol=6, fancybox=True)
71 | 
72 | #legend = fig.legend(handles, labels, loc='best')
73 | 
74 | fig.supylabel(r'$\mu$ seconds per iteration', weight='normal')
75 | 
76 | title = Path(files[0].name).stem
77 | 
78 | format = 'pdf'
79 | #plt.constrained_layoadia
80 | plt.savefig(MODULE_DIR / f'{title}.{format}', bbox_inches='tight', format=format, transparent=False)
81 | 
82 | plt.show()
83 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(jacobi2D  # non-NVSHMEM executable: linked only against CUDA::cudart and OpenMP below
 2 |         src/main.cu
 3 |         src/common.cu
 4 |         PERKS/jacobi-general-wrapper.cu
 5 |         src/baseline/multi-threaded-copy.cu
 6 |         src/baseline/multi-threaded-copy-overlap.cu
 7 |         src/baseline/multi-threaded-p2p.cu
 8 |         src/baseline/single-threaded-copy.cu
 9 |         src/multi-stream/multi-gpu-peer-tiling.cu
10 |         src/PERKS/multi-stream-perks.cu
11 |         src/single-stream/multi-threaded-one-block-comm.cu
12 |         src/single-stream/multi-threaded-two-block-comm.cu
13 |         src/no-compute/multi-gpu-peer-tiling-no-compute.cu
14 |         src/no-compute/multi-threaded-copy-no-compute.cu
15 |         src/no-compute/multi-threaded-copy-overlap-no-compute.cu
16 |         src/no-compute/multi-threaded-one-block-comm-no-compute.cu
17 |         src/no-compute/multi-threaded-p2p-no-compute.cu
18 |         src/no-compute/multi-threaded-two-block-comm-no-compute.cu)
19 | 
20 | add_executable(jacobi2D_nvshmem  # NVSHMEM executable: additionally linked against nvshmem::nvshmem and MPI below
21 |         src_nvshmem/main.cu
22 |         src_nvshmem/common.cu
23 |         PERKS/jacobi-general-wrapper.cu
24 |         src_nvshmem/baseline/multi-threaded-nvshmem.cu
25 |         src_nvshmem/baseline/multi-threaded-nvshmem-opt.cu
26 |         src_nvshmem/PERKS/multi-stream-perks.cu
27 |         src_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cu
28 |         src_nvshmem/multi-stream/multi-gpu-peer-tiling.cu
29 |         src_nvshmem/single-stream/multi-threaded-multi-block-comm.cu
30 |         src_nvshmem/single-stream/multi-threaded-one-block-comm.cu
31 |         src_nvshmem/single-stream/multi-threaded-two-block-comm.cu
32 |         src_nvshmem/no-compute/design-1-multi-block-no-compute.cu
33 |         src_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cu
34 |         src_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cu
35 |         src_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cu
36 |         src_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cu
37 |         src_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cu)
38 | 
39 | target_include_directories(jacobi2D PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
40 | target_include_directories(jacobi2D PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/PERKS")  # the PERKS directory is shared by both targets
41 | 
42 | target_include_directories(jacobi2D_nvshmem PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include_nvshmem")
43 | target_include_directories(jacobi2D_nvshmem PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/PERKS")
44 | 
45 | find_package(OpenMP REQUIRED)
46 | find_package(NVSHMEM REQUIRED)  # REQUIRED: configuration fails without NVSHMEM, which also blocks the plain jacobi2D target
47 | find_package(MPI REQUIRED)
48 | 
49 | target_link_libraries(jacobi2D
50 |         CUDA::cudart  # presumably provided by a find_package(CUDAToolkit) in a parent CMakeLists - not shown here
51 |         OpenMP::OpenMP_CXX)
52 | 
53 | target_link_libraries(jacobi2D_nvshmem
54 |         CUDA::cudart
55 |         OpenMP::OpenMP_CXX
56 |         nvshmem::nvshmem
57 |         MPI::MPI_CXX)
58 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/src/main.cu:
--------------------------------------------------------------------------------
 1 | #include <array>
 2 | 
 3 | #include "../include/baseline/multi-threaded-copy-overlap.cuh"
 4 | #include "../include/baseline/multi-threaded-copy.cuh"
 5 | #include "../include/baseline/multi-threaded-p2p.cuh"
 6 | #include "../include/baseline/single-threaded-copy.cuh"
 7 | 
 8 | #include "../include/single-stream/multi-threaded-one-block-comm.cuh"
 9 | #include "../include/single-stream/multi-threaded-two-block-comm.cuh"
10 | 
11 | #include "../include/PERKS/multi-stream-perks.cuh"
12 | #include "../include/multi-stream/multi-gpu-peer-tiling.cuh"
13 | 
14 | #include "../include/no-compute/multi-gpu-peer-tiling-no-compute.cuh"
15 | #include "../include/no-compute/multi-threaded-copy-no-compute.cuh"
16 | #include "../include/no-compute/multi-threaded-copy-overlap-no-compute.cuh"
17 | 
18 | #include "../include/no-compute/multi-threaded-one-block-comm-no-compute.cuh"
19 | #include "../include/no-compute/multi-threaded-p2p-no-compute.cuh"
20 | #include "../include/no-compute/multi-threaded-two-block-comm-no-compute.cuh"
21 | 
22 | using std::make_pair;
23 | 
24 | int main(int argc, char *argv[]) {  // runs the version selected with -v; -s suppresses the version listing
25 |     const std::array versions{
26 |         make_pair("Baseline Copy", BaselineMultiThreadedCopy::init),
27 |         make_pair("Baseline Overlap", BaselineMultiThreadedCopyOverlap::init),
28 |         make_pair("Baseline P2P", BaselineMultiThreadedP2P::init),
29 | 
30 |         make_pair("Design 1", MultiGPUPeerTiling::init),
31 |         make_pair("Design 2", SSMultiThreadedTwoBlockComm::init),
32 |         make_pair("PERKS", MultiStreamPERKS::init),
33 | 
34 |         make_pair("Baseline Copy (No computation)", BaselineMultiThreadedCopyNoCompute::init),
35 |         make_pair("Baseline Overlap (No Computation)",
36 |                   BaselineMultiThreadedCopyOverlapNoCompute::init),
37 |         make_pair("Baseline P2P (No Computation)", BaselineMultiThreadedP2PNoCompute::init),
38 | 
39 |         make_pair("Design 1 (No Computation)", MultiGPUPeerTilingNoCompute::init),
40 |         make_pair("Design 2 (No Computation)", SSMultiThreadedTwoBlockCommNoCompute::init),
41 | 
42 |         //        make_pair("Baseline Single Threaded Copy", BaselineSingleThreadedCopy::init),
43 |         //        make_pair("Single stream multi threaded  (one thread block communicates)",
44 |         //        SSMultiThreadedOneBlockComm::init),
45 | 
46 |         //        make_pair("Single stream multi threaded (one thread block communicates; no
47 |         //        computation)",
48 |         //                 SSMultiThreadedOneBlockCommNoCompute::init),
49 |     };
50 | 
51 |     const int selection = get_argval<int>(argv, argv + argc, "-v", 0);
52 |     const bool silent = get_arg(argv, argv + argc, "-s");
53 | 
54 |     if (selection < 0 || static_cast<size_t>(selection) >= versions.size()) {  // indexing past the array is undefined behaviour
55 |         std::cerr << "Invalid version " << selection << " (valid: 0-" << versions.size() - 1 << ")\n";
56 |         return 1;
57 |     }
58 | 
59 |     auto &selected = versions[selection];
60 |     if (!silent) {
61 |         std::cout << "Versions (select with -v):\n";
62 |         for (size_t i = 0; i < versions.size(); ++i) {
63 |             std::cout << i << ":\t" << versions[i].first << "\n";
64 |         }
65 |         std::cout << std::endl << "Running " << selected.first << "\n" << std::endl;
66 |     }
67 | 
68 |     return selected.second(argc, argv);
69 | }
70 | 


--------------------------------------------------------------------------------
/Plots/weak-scaling-2.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | from itertools import cycle
 3 | from pathlib import Path
 4 | 
 5 | import matplotlib.pyplot as plt
 6 | import pandas as pd
 7 | 
 8 | from common import get_files, markers, get_module_dir, wrap_labels, rotate
 9 | 
10 | from matplotlib.ticker import FormatStrFormatter
11 | 
12 | MODULE_DIR = get_module_dir('Weak Scaling')
13 | 
14 | plt.style.use('https://github.com/dhaitz/matplotlib-stylesheets/raw/master/pitayasmoothie-light.mplstyle')
15 | 
16 | plt.rcParams.update({"axes.facecolor": (0.5, 0.5, 0.5, 0.1)})
17 | 
18 | plt.rcParams['text.usetex'] = True
19 | 
20 | MICROSECOND = 1000000
21 | 
22 | #NUM_ITERS = [1_000_000, 1_000_000, 10_000]
23 | NUM_ITERS = [100_000, 100_000, 10_000]
24 | 
25 | files = get_files()
26 | 
27 | plots = len(files)
28 | fig, axes = plt.subplots(math.ceil(plots / 3), plots if plots < 3 else 3, layout='constrained')
29 | # fig.set_size_inches(15, 3 * math.ceil(plots / 3))
30 | fig.set_size_inches(15, 3 * math.ceil(plots / 3))
31 | # fig.tight_layout()
32 | 
33 | titles = ['Weak Scaling', 'Strong Scaling (No Compute) ($512^3$)', 'Strong Scaling ($256^3$)',]
34 | 
35 | logy = [False, True, True]
36 | 
37 | colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
38 | colors = rotate(list(reversed(colors)), 1)
39 | 
40 | colors[1] = colors[-1]
41 | 
42 | ok = False
43 | 
44 | for ax, file, num_iter, title, logy in zip(axes.flatten(), files, NUM_ITERS, titles, logy):
45 | #    ax.margins(x=0)
46 | 
47 |     data = pd.read_csv(file, index_col='Version')
48 | #    data = data.sort_index()
49 |     data = data.T / num_iter * MICROSECOND
50 | 
51 |     ax = data.plot(ax=ax, color=colors, title=title, logy=logy)
52 | 
53 |     if logy:
54 |         ax.set_yscale('log', base=2)
55 | 
56 |     markers_cycle = cycle(markers)
57 | 
58 |     for line in ax.get_lines():
59 |         line.set_marker(next(markers_cycle))
60 |         line.set_linewidth(1.5)
61 |         # If our versions
62 |         if line.get_label().lower().startswith('baseline'):
63 |             # line.set_linewidth(1.0)
64 |             # line.set(alpha=0.5)
65 |             line.set_linestyle('dashed')
66 | 
67 | #    axes.legend(axes.get_lines(), data.columns, loc='best')
68 |     if ok:
69 |         ax.set_xlabel('Number of GPUs', weight='bold', fontdict={'fontsize': 11.0})
70 | 
71 |     ok = True
72 | 
73 |     ax.get_legend().remove()
74 |     wrap_labels(ax, 10)
75 |     # plt.title(title, fontsize=15)
76 | 
77 | # handles, labels = axes.get_legend_handles_labels()
78 | 
79 | axes.flatten()[0].legend(loc='best', fancybox=True, prop={'weight': 'bold', 'size': 'large'})
80 | axes.flatten()[0].legend(loc='best', fancybox=True)
81 | 
82 | # legend = fig.legend(handles, labels, loc='upper center',
83 | #                     bbox_to_anchor=(0.5,  # horizontal
84 | #                                     1.1),  # vertical
85 | #                     ncol=6, fancybox=True)
86 | 
87 | #legend = fig.legend(handles, labels, loc='best')
88 | 
89 | fig.supylabel(r'$\mu$ seconds per iteration', weight='normal')
90 | 
91 | title = Path(files[0].name).stem
92 | 
93 | format = 'pdf'
94 | #plt.constrained_layoadia
95 | plt.savefig(MODULE_DIR / f'{title}.{format}', bbox_inches='tight', format=format, transparent=False)
96 | 
97 | plt.show()
98 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src/main.cu:
--------------------------------------------------------------------------------
 1 | #include <array>
 2 | #include <iostream>
 3 | 
 4 | #include "../include/baseline/multi-threaded-copy-overlap.cuh"
 5 | #include "../include/baseline/multi-threaded-copy.cuh"
 6 | #include "../include/baseline/multi-threaded-p2p.cuh"
 7 | #include "../include/baseline/single-threaded-copy.cuh"
 8 | 
 9 | #include "../include/PERKS/multi-stream-perks.cuh"
10 | 
11 | #include "../include/single-stream/multi-threaded-one-block-comm.cuh"
12 | 
13 | #include "../include/single-stream/multi-threaded-two-block-comm.cuh"
14 | 
15 | #include "../include/multi-stream/multi-gpu-peer-tiling.cuh"
16 | 
17 | #include "../include/no-compute/multi-threaded-copy-no-compute.cuh"
18 | #include "../include/no-compute/multi-threaded-copy-overlap-no-compute.cuh"
19 | #include "../include/no-compute/multi-threaded-p2p-no-compute.cuh"
20 | 
21 | #include "../include/no-compute/multi-threaded-one-block-comm-no-compute.cuh"
22 | 
23 | #include "../include/no-compute/multi-gpu-peer-tiling-no-compute.cuh"
24 | #include "../include/no-compute/multi-threaded-two-block-comm-no-compute.cuh"
25 | 
26 | #include "../include/no-compute/multi-gpu-peer-tiling-no-compute.cuh"
27 | 
28 | using std::make_pair;
29 | 
30 | int main(int argc, char *argv[]) {  // runs the version selected with -v; -s suppresses the version listing
31 |     const std::array versions{
32 |         make_pair("Baseline Copy", BaselineMultiThreadedCopy::init),
33 |         make_pair("Baseline Overlap", BaselineMultiThreadedCopyOverlap::init),
34 |         make_pair("Baseline P2P", BaselineMultiThreadedP2P::init),
35 | 
36 |         make_pair("Design 1", MultiGPUPeerTiling::init),
37 |         make_pair("Design 2", SSMultiThreadedTwoBlockComm::init),
38 |         make_pair("PERKS", MultiStreamPERKS::init),
39 | 
40 |         make_pair("Baseline Copy (No computation)", BaselineMultiThreadedCopyNoCompute::init),
41 |         make_pair("Baseline Overlap (No Computation)",
42 |                   BaselineMultiThreadedCopyOverlapNoCompute::init),
43 |         make_pair("Baseline P2P (No Computation)", BaselineMultiThreadedP2PNoCompute::init),
44 | 
45 |         make_pair("Design 1 (No Computation)", MultiGPUPeerTilingNoCompute::init),
46 |         make_pair("Design 2 (No Computation)", SSMultiThreadedTwoBlockCommNoCompute::init),
47 | 
48 |         //        make_pair("Baseline Single Threaded Copy", BaselineSingleThreadedCopy::init),
49 |         //        make_pair("Naive Single stream multi threaded (one thread block
50 |         //        communicates)",SSMultiThreadedOneBlockComm::init), make_pair("Single stream multi
51 |         //        threaded (one thread block communicates; no computation)",
52 |         //                  SSMultiThreadedOneBlockCommNoCompute::init),
53 |     };
54 | 
55 |     const int selection = get_argval<int>(argv, argv + argc, "-v", 0);
56 |     const bool silent = get_arg(argv, argv + argc, "-s");
57 | 
58 |     if (selection < 0 || static_cast<size_t>(selection) >= versions.size()) {  // indexing past the array is undefined behaviour
59 |         std::cerr << "Invalid version " << selection << " (valid: 0-" << versions.size() - 1 << ")\n";
60 |         return 1;
61 |     }
62 | 
63 |     auto &selected = versions[selection];
64 |     if (!silent) {
65 |         std::cout << "Versions (select with -v):\n";
66 |         for (size_t i = 0; i < versions.size(); ++i) {
67 |             std::cout << i << ":\t" << versions[i].first << "\n";
68 |         }
69 |         std::cout << std::endl << "Running " << selected.first << "\n" << std::endl;
70 |     }
71 | 
72 |     return selected.second(argc, argv);
73 | }
74 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/src_nvshmem/main.cu:
--------------------------------------------------------------------------------
 1 | #include <array>
 2 | #include <iostream>
 3 | 
 4 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh"
 5 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem.cuh"
 6 | 
 7 | #include "../include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh"
 8 | #include "../include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh"
 9 | 
10 | #include "../include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh"
11 | #include "../include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh"
12 | #include "../include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh"
13 | 
14 | #include "../include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh"
15 | #include "../include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh"
16 | 
17 | #include "../include_nvshmem/no-compute/design-1-multi-block-no-compute.cuh"
18 | #include "../include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh"
19 | #include "../include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh"
20 | 
21 | #include "../include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh"
22 | 
23 | #include "../include_nvshmem/PERKS/multi-stream-perks.cuh"
24 | 
25 | using std::make_pair;
26 | 
27 | int main(int argc, char *argv[]) {  // runs the version selected with -v; -s suppresses the version listing
28 |     const std::array versions{
29 |         make_pair("Baseline NVSHMEM", BaselineMultiThreadedNvshmemOpt::init),
30 | 
31 |         make_pair("Design 1 (NVSHMEM)", MultiGPUPeerTilingNvshmem::init),
32 |         make_pair("Design 2 (NVSHMEM)", SSMultiThreadedTwoBlockCommNvshmem::init),
33 |         make_pair("Design 1 Partitioned (NVSHMEM)", MultiGPUMultiBlockPeerTilingNvshmem::init),
34 |         make_pair("PERKS NVSHMEM", MultiStreamPERKSNVSHMEM::init),
35 | 
36 |         make_pair("Baseline NVSHMEM (No Computation)",
37 |                   BaselineMultiThreadedNvshmemOptNoCompute::init),
38 | 
39 |         make_pair("Design 1 NVSHMEM (No Computation)", MultiGPUPeerTilingNvshmemNoCompute::init),
40 |         make_pair("Design 2 NVSHMEM (No Computation)",
41 |                   SSMultiThreadedTwoBlockCommNvshmemNoCompute::init),
42 |         make_pair("Design 1 Partitioned (No Computation)", Design1MultiBlockNoComputation::init),
43 | 
44 |         //        make_pair("Design 2 Partitioned (NVSHMEM)",
45 |         //        SSMultiThreadedMultiBlockCommNvshmem::init), make_pair("NVSHMEM Baseline Multi
46 |         //        Threaded", BaselineMultiThreadedNvshmem::init), make_pair("NVSHMEM Single stream
47 |         //        multi threaded (one thread block communicates)",
48 |         //                  SSMultiThreadedOneBlockCommNvshmem::init),
49 |         //        make_pair("NVSHMEM Baseline Multi Threaded (No Computation)",
50 |         //        BaselineMultiThreadedNvshmemNoCompute::init), make_pair(
51 |         //            "NVSHMEM Single stream multi threaded (one thread block communicates; no
52 |         //            computation)", SSMultiThreadedOneBlockCommNvshmemNoCompute::init),
53 | 
54 |     };
55 | 
56 |     const int selection = get_argval<int>(argv, argv + argc, "-v", 0);
57 |     const bool silent = get_arg(argv, argv + argc, "-s");
58 | 
59 |     if (selection < 0 || static_cast<size_t>(selection) >= versions.size()) {  // indexing past the array is undefined behaviour
60 |         std::cerr << "Invalid version " << selection << " (valid: 0-" << versions.size() - 1 << ")\n";
61 |         return 1;
62 |     }
63 |     auto &selected = versions[selection];
64 |     if (!silent) {
65 |         std::cout << "Versions (select with -v):" << std::endl;
66 |         for (size_t i = 0; i < versions.size(); ++i) {
67 |             std::cout << i << ":\t" << versions[i].first << "\n";
68 |         }
69 |         std::cout << std::endl << "Running " << selected.first << "\n" << std::endl;
70 |     }
71 | 
72 |     return selected.second(argc, argv);
73 | }
74 | 


--------------------------------------------------------------------------------
/CG/src/single-gpu/discrete-standard.cu:
--------------------------------------------------------------------------------
 1 | /* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
 2 |  *
 3 |  * Redistribution and use in source and binary forms, with or without
 4 |  * modification, are permitted provided that the following conditions
 5 |  * are met:
 6 |  *  * Redistributions of source code must retain the above copyright
 7 |  *    notice, this list of conditions and the following disclaimer.
 8 |  *  * Redistributions in binary form must reproduce the above copyright
 9 |  *    notice, this list of conditions and the following disclaimer in the
10 |  *    documentation and/or other materials provided with the distribution.
11 |  *  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |  *    contributors may be used to endorse or promote products derived
13 |  *    from this software without specific prior written permission.
14 |  *
15 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 |  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 |  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 |  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 |  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 |  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 |  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 |  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 |  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 |  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 |  */
27 | 
28 | /*
29 |  * Entry point for the single-GPU "discrete standard" conjugate gradient version:
30 |  * PE 0 calls SingleGPUDiscreteStandard::run_single_gpu and prints its execution time.
31 |  *
32 |  */
33 | 
34 | // includes, system
35 | #include <stdio.h>
36 | #include <stdlib.h>
37 | #include <string.h>
38 | #include <filesystem>
39 | #include <iostream>
40 | #include <map>
41 | #include <set>
42 | #include <utility>
43 | 
44 | #include <omp.h>
45 | 
46 | #include <mpi.h>
47 | #include <nvshmem.h>
48 | #include <nvshmemx.h>
49 | 
50 | #include "../../include/common.h"
51 | 
52 | #include <cooperative_groups.h>
53 | #include <cooperative_groups/reduce.h>
54 | 
55 | namespace cg = cooperative_groups;
56 | 
57 | // Runs the single-GPU solver on PE 0 and prints its execution time; always returns 0. Intended for one GPU only.
58 | int SingleGPUDiscreteStandard::init(int *device_csrRowIndices, int *device_csrColIndices,
59 |                                     real *device_csrVal, const int num_rows, const int nnz,
60 |                                     bool matrix_is_zero_indexed, const int iter_max,
61 |                                     [[maybe_unused]] real *x_final_result,
62 |                                     [[maybe_unused]] const double single_gpu_runtime,
63 |                                     [[maybe_unused]] bool compare_to_single_gpu,
64 |                                     [[maybe_unused]] bool compare_to_cpu, real *x_ref_single_gpu,
65 |                                     [[maybe_unused]] real *x_ref_cpu) {
66 |     // Only PE 0 does any work, in case this version is launched with more than one PE.
67 |     const int mype = nvshmem_my_pe();
68 | 
69 |     if (mype == 0) {
70 |         const bool run_as_separate_version = true;
71 |         // Named differently from the unused single_gpu_runtime parameter so the local does not shadow it.
72 |         const double runtime = SingleGPUDiscreteStandard::run_single_gpu(
73 |             iter_max, device_csrRowIndices, device_csrColIndices, device_csrVal, x_ref_single_gpu,
74 |             num_rows, nnz, matrix_is_zero_indexed, run_as_separate_version);
75 | 
76 |         printf("Execution time: %8.4f s\n", runtime);
77 |     }
78 | 
79 |     return 0;
80 | }


--------------------------------------------------------------------------------
/Stencil/jacobi3D/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_executable(jacobi3D  # non-NVSHMEM executable: linked only against CUDA::cudart and OpenMP below
 2 |         src/main.cu
 3 |         src/common.cu
 4 |         src/baseline/multi-threaded-copy.cu
 5 |         src/baseline/multi-threaded-copy-overlap.cu
 6 |         src/baseline/multi-threaded-p2p.cu
 7 |         src/baseline/single-threaded-copy.cu
 8 |         src/multi-stream/multi-gpu-peer-tiling.cu
 9 |         src/single-stream/multi-threaded-one-block-comm.cu
10 |         src/single-stream/multi-threaded-two-block-comm.cu
11 |         src/PERKS/common/common.hpp  # header files listed as sources are not compiled on their own; CMake only tracks them with the target
12 |         src/PERKS/common/cub_utils.cuh
13 |         src/PERKS/common/cuda_common.cuh
14 |         src/PERKS/common/cuda_computation.cuh
15 |         src/PERKS/common/jacobi_cuda.cuh
16 |         src/PERKS/common/jacobi_reference.hpp
17 |         src/PERKS/common/types.hpp
18 |         src/PERKS/config.cuh
19 |         src/PERKS/genconfig.cuh
20 |         src/PERKS/j3d-general-kernels.cuh
21 |         src/PERKS/j3d-general-wrapper.cu
22 |         src/PERKS/multi-stream-perks.cu
23 |         src/PERKS/perksconfig.cuh
24 |         src/no-compute/multi-gpu-peer-tiling-no-compute.cu
25 |         src/no-compute/multi-threaded-copy-no-compute.cu
26 |         src/no-compute/multi-threaded-copy-overlap-no-compute.cu
27 |         src/no-compute/multi-threaded-one-block-comm-no-compute.cu
28 |         src/no-compute/multi-threaded-p2p-no-compute.cu
29 |         src/no-compute/multi-threaded-two-block-comm-no-compute.cu)
30 | 
31 | add_executable(jacobi3D_nvshmem  # NVSHMEM executable: additionally linked against nvshmem::nvshmem and MPI below
32 |         src_nvshmem/baseline/multi-threaded-nvshmem.cu
33 |         src_nvshmem/baseline/multi-threaded-nvshmem-opt.cu
34 |         src_nvshmem/common.cu
35 |         src_nvshmem/main.cu
36 |         src_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cu
37 |         src_nvshmem/multi-stream/multi-gpu-peer-tiling.cu
38 |         src_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cu
39 |         src_nvshmem/no-compute/multi-threaded-multi-block-comm-no-compute.cu
40 |         src_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cu
41 |         src_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cu
42 |         src_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cu
43 |         src_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cu
44 |         src_nvshmem/PERKS-nvshmem/common/common.hpp  # headers again listed alongside the .cu sources
45 |         src_nvshmem/PERKS-nvshmem/common/cub_utils.cuh
46 |         src_nvshmem/PERKS-nvshmem/common/cuda_common.cuh
47 |         src_nvshmem/PERKS-nvshmem/common/cuda_computation.cuh
48 |         src_nvshmem/PERKS-nvshmem/common/jacobi_cuda.cuh
49 |         src_nvshmem/PERKS-nvshmem/common/jacobi_reference.hpp
50 |         src_nvshmem/PERKS-nvshmem/common/types.hpp
51 |         src_nvshmem/PERKS-nvshmem/config.cuh
52 |         src_nvshmem/PERKS-nvshmem/genconfig.cuh
53 |         src_nvshmem/PERKS-nvshmem/j3d-general-kernels.cuh
54 |         src_nvshmem/PERKS-nvshmem/j3d-general-wrapper.cu
55 |         src_nvshmem/PERKS-nvshmem/multi-stream-perks-block.cu
56 |         src_nvshmem/PERKS-nvshmem/multi-stream-perks.cu
57 |         src_nvshmem/PERKS-nvshmem/perksconfig.cuh
58 |         src_nvshmem/single-stream/multi-threaded-multi-block-comm.cu
59 |         src_nvshmem/single-stream/multi-threaded-one-block-comm.cu
60 |         src_nvshmem/single-stream/multi-threaded-two-block-comm.cu)
61 | 
62 | target_include_directories(jacobi3D PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include")
63 | 
64 | target_include_directories(jacobi3D_nvshmem PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/include_nvshmem")
65 | 
66 | find_package(OpenMP REQUIRED)
67 | find_package(NVSHMEM REQUIRED)  # REQUIRED: configuration fails without NVSHMEM, which also blocks the plain jacobi3D target
68 | find_package(MPI REQUIRED)
69 | 
70 | target_link_libraries(jacobi3D
71 |         CUDA::cudart  # presumably provided by a find_package(CUDAToolkit) in a parent CMakeLists - not shown here
72 |         OpenMP::OpenMP_CXX)
73 | 
74 | target_link_libraries(jacobi3D_nvshmem
75 |         CUDA::cudart
76 |         OpenMP::OpenMP_CXX
77 |         nvshmem::nvshmem
78 |         MPI::MPI_CXX)
79 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/src_nvshmem/main.cu:
--------------------------------------------------------------------------------
 1 | #include <array>
 2 | #include <iostream>
 3 | 
 4 | #include "../include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem-block.h"
 5 | #include "../include_nvshmem/PERKS-nvshmem/multi-stream-perks-nvshmem.h"
 6 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem-opt.cuh"
 7 | #include "../include_nvshmem/baseline/multi-threaded-nvshmem.cuh"
 8 | #include "../include_nvshmem/multi-stream/multi-gpu-multi-block-tiling.cuh"
 9 | #include "../include_nvshmem/multi-stream/multi-gpu-peer-tiling.cuh"
10 | #include "../include_nvshmem/no-compute/multi-gpu-peer-tiling-no-compute.cuh"
11 | #include "../include_nvshmem/no-compute/multi-threaded-multi-block-comm-no-compute.cuh"
12 | #include "../include_nvshmem/no-compute/multi-threaded-nvshmem-no-compute.cuh"
13 | #include "../include_nvshmem/no-compute/multi-threaded-nvshmem-opt-no-compute.cuh"
14 | #include "../include_nvshmem/no-compute/multi-threaded-one-block-comm-no-compute.cuh"
15 | #include "../include_nvshmem/no-compute/multi-threaded-two-block-comm-no-compute.cuh"
16 | #include "../include_nvshmem/single-stream/multi-threaded-multi-block-comm.cuh"
17 | #include "../include_nvshmem/single-stream/multi-threaded-one-block-comm.cuh"
18 | #include "../include_nvshmem/single-stream/multi-threaded-two-block-comm.cuh"
19 | 
20 | using std::make_pair;
21 | // Entry point: looks up the implementation chosen with -v and forwards argc/argv to its init function.
22 | int main(int argc, char *argv[]) {
23 |     const std::array versions{
24 |         make_pair("Baseline NVSHMEM", BaselineMultiThreadedNvshmemOpt::init),
25 | 
26 |         make_pair("Design 1 (NVSHMEM)", MultiGPUPeerTilingNvshmem::init),
27 |         make_pair("Design 2 (NVSHMEM)", SSMultiThreadedTwoBlockCommNvshmem::init),
28 | 
29 |         make_pair("Design 1 Partitioned (NVSHMEM)", MultiGPUMultiBlockPeerTilingNvshmem::init),
30 |         make_pair("Design 2 Partitioned (NVSHMEM)", SSMultiThreadedMultiBlockCommNvshmem::init),
31 | 
32 |         make_pair("Baseline NVSHMEM (No Computation)",
33 |                   BaselineMultiThreadedNvshmemOptNoCompute::init),
34 |         make_pair("Design 1 NVSHMEM (No Compute)", MultiGPUPeerTilingNvshmemNoCompute::init),
35 | 
36 |         make_pair("PERKS NVSHMEM", MultiStreamPERKSNvshmem::init),
37 |         make_pair("PERKS NVSHMEM Partitioned", MultiStreamPERKSNvshmemBlock::init),
38 | 
39 |         //        make_pair("NVSHMEM Baseline Multi Threaded", BaselineMultiThreadedNvshmem::init),
40 |         //        make_pair("NVSHMEM Single stream multi threaded (one thread block communicates)",
41 |         //                  SSMultiThreadedOneBlockCommNvshmem::init),
42 |         //        make_pair("NVSHMEM Baseline Multi Threaded (No Computation)",
43 |         //        BaselineMultiThreadedNvshmemNoCompute::init), make_pair(
44 |         //            "NVSHMEM Single stream multi threaded (one thread block communicates; no
45 |         //            computation)", SSMultiThreadedOneBlockCommNvshmemNoCompute::init),
46 |         //        make_pair(
47 |         //                    "NVSHMEM Single stream multi threaded (two thread blocks communicate;
48 |         //                    no computation)", SSMultiThreadedTwoBlockCommNvshmemNoCompute::init),
49 |         //            make_pair("Design 2 NVSHMEM (No Compute)",
50 |         //            SSMultiThreadedMultiBlockCommNvshmemNoCompute::init),
51 |     };
52 |     // -v selects a row of the table above (0 is passed to get_argval, presumably as the default); silent comes from -s.
53 |     const int selection = get_argval<int>(argv, argv + argc, "-v", 0);
54 |     const bool silent = get_arg(argv, argv + argc, "-s");
55 |     // Validate before indexing: std::array::operator[] does no bounds check, so a bad -v used to be undefined behaviour.
56 |     if (selection < 0 || static_cast<size_t>(selection) >= versions.size()) {
57 |         std::cerr << "Invalid version " << selection << "; expected 0-" << versions.size() - 1 << std::endl;
58 |         return 1;
59 |     }
60 |     const auto &selected = versions[selection];
61 | 
62 |     if (!silent) {
63 |         std::cout << "Versions (select with -v):\n";
64 |         for (size_t i = 0; i < versions.size(); ++i) {
65 |             std::cout << i << ":\t" << versions[i].first << "\n";
66 |         }
67 |         std::cout << std::endl;
68 |         std::cout << "Running " << selected.first << "\n" << std::endl;
69 |     }
70 |     return selected.second(argc, argv);
71 | }
72 | 


--------------------------------------------------------------------------------
/Scripts/full_bench.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | #SBATCH -J stencil-bench-weak
  4 | #SBATCH -N 1
  5 | #SBATCH -n 8
  6 | #SBATCH -c 16
  7 | #SBATCH -A proj16
  8 | #SBATCH -p palamut-cuda
  9 | #SBATCH --gres=gpu:8
 10 | #SBATCH --time=24:00:00
 11 | #SBATCH -o stencil_bench_%j.log
 12 | 
 13 | import os
 14 | import sys
 15 | 
 16 | sys.path.append(os.getcwd())
 17 | 
 18 | from pathlib import Path
 19 | from datetime import datetime
 20 | from itertools import cycle
 21 | 
 22 | import bench
 23 | 
 24 | BIN = './jacobi'  # default executable (see default_args); all paths are relative to the working directory
 25 | BIN_3D = './jacobi3d'
 26 | 
 27 | BIN_NVSHMEM = './jacobi_nvshmem'
 28 | BIN_3D_NVSHMEM = './jacobi3d_nvshmem'
 29 | # Version indices handed to bench.run as 'versions'; the trailing comments name each index.
 30 | VERSIONS = [
 31 |     0,  # Baseline Copy
 32 |     1,  # Baseline Overlap
 33 |     2,  # Baseline P2P
 34 |     3,  # Design 1
 35 |     # 4,  # Design 2
 36 |     5  # PERKS
 37 | ]
 38 | VERSIONS_NO_COMPUTE = [
 39 |     6,  # Baseline Copy
 40 |     7,  # Baseline Overlap
 41 |     8,  # Baseline P2P
 42 |     9,  # Design 1
 43 |     10  # Design 2
 44 | ]
 45 | # NOTE(review): the three lists other than VERSIONS are referenced only by commented-out calls in run().
 46 | VERSIONS_NVSHMEM = [
 47 |     0,  # Baseline
 48 |     1,  # Design 1
 49 |     # 2,  # Design 2
 50 |     3,  # Design 1 Partitioned
 51 |     # 3  # PERKS
 52 |     7,  # PERKS, possibly
 53 | ]
 54 | # NOTE(review): index 7 is labelled "PERKS" below and "PERKS, possibly" above -- confirm against the binary's version list.
 55 | VERSIONS_NVSHMEM_NO_COMPUTE = [
 56 |     4,  # Baseline
 57 |     5,  # Design 1
 58 |     6,  # Design 2
 59 |     7  # PERKS
 60 | ]
 61 | # Forwarded to bench.run as 'num_repeat' via default_args.
 62 | NUM_REPEAT = 1
 63 | # Output directory named after the current timestamp; mkdir() runs at import time and raises if the path already exists.
 64 | BASE_DIR = Path(str(datetime.now()))
 65 | BASE_DIR.mkdir()
 66 | 
 67 | 
 68 | def get_dim_str(dim):
 69 |     return 'x'.join([str(x) for x in dim])
 70 | 
 71 | 
 72 | # Multiplies the last index by 2
 73 | def dim_func_last(dims):
 74 |     last_index = len(dims) - 1
 75 | 
 76 |     while True:
 77 |         yield dims.copy()
 78 |         dims[last_index] *= 2
 79 | 
 80 | # Keyword arguments shared by the bench.run calls; the __main__ block merges each experiment dict over them.
 81 | default_args = {'bin': BIN, 'num_repeat': NUM_REPEAT}
 82 | default_args_strong = {
 83 |     **default_args,
 84 |     'gpu_step': lambda x: x + 1,  # Add 1 more GPU
 85 |     'dim_func': lambda x: cycle([x])  # repeats the same dimensions forever, i.e. the problem size stays fixed
 86 | }
 87 | # 2D experiments; this is the only list the __main__ block currently iterates over.
 88 | weak_scaling = [
 89 |     {'starting_dim': (256, 256), 'num_iter': 1_000_000},
 90 |     {'starting_dim': (1024, 1024), 'num_iter': 1_000_000},
 91 |     {'starting_dim': (2048, 1024), 'num_iter': 1_000_000},
 92 |     {'starting_dim': (8192, 4096), 'num_iter': 10_000},
 93 | ]
 94 | # NOTE(review): the loops over the three lists below are commented out in the __main__ block.
 95 | strong_scaling = [
 96 |     {'starting_dim': (4096, 4096), 'num_iter': 10000},
 97 | ]
 98 | # The second entry swaps in dim_func_last, which doubles only the last dimension between yields.
 99 | weak_scaling_3D = [
100 |     {'bin': BIN_3D, 'starting_dim': (256, 256, 256), 'num_iter': 10000},
101 |     {'bin': BIN_3D, 'starting_dim': (256, 256, 256), 'num_iter': 10000, 'dim_func': dim_func_last},
102 | ]
103 | 
104 | strong_scaling_3D = [
105 |     {'bin': BIN_3D, 'starting_dim': (512, 512, 512), 'num_iter': 10000},
106 | ]
107 | 
108 | 
109 | def run_experiment(name: str, args):
110 |     dim_str = get_dim_str(args['starting_dim'])
111 |     args['out_file'] = BASE_DIR / f'{name}_{dim_str}.csv'
112 |     bench.run(**args)
113 | 
114 | 
115 | def run(args, version=''):
116 |     run_experiment(version, {**args, 'versions': VERSIONS, 'bin': BIN})
117 |     # run_experiment(version, {**args, 'versions': VERSIONS_NVSHMEM, 'bin': BIN_NVSHMEM, 'mpi': True})
118 | 
119 |     # run_experiment(f'{version}_No_Compute', {**args, 'versions': VERSIONS_NO_COMPUTE, 'bin': BIN})
120 |     # run_experiment(f'{version}_No_Compute',
121 |     #                {**args, 'versions': VERSIONS_NVSHMEM_NO_COMPUTE, 'bin': BIN_NVSHMEM, 'mpi': True})
122 | 
123 | 
124 | if __name__ == '__main__':
125 |     # Running with the same name merges them
126 |     for args in weak_scaling:
127 |         run({**default_args, **args}, version='2D_Weak_Scaling')
128 | 
129 |     # for args in weak_scaling_3D:
130 |     #     run({**default_args, **args}, version='3D_Weak_Scaling')
131 | 
132 |     # for args in strong_scaling:
133 |     #     run({**default_args_strong, **args}, version='2D_Strong_Scaling')
134 | 
135 |     # for args in strong_scaling_3D:
136 |     #     run({**default_args_strong, **args}, version='3D_Strong_Scaling')
137 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/strong_scale_bench_truba.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #SBATCH -J stencil-bench-strong
  4 | #SBATCH -N 1
  5 | #SBATCH -n 8
  6 | #SBATCH -c 16
  7 | #SBATCH -A proj16
  8 | #SBATCH -p palamut-cuda
  9 | #SBATCH --gres=gpu:8
 10 | #SBATCH --time=24:00:00
 11 | #SBATCH -o stencil_bench_strong_output_%j.log
 12 | 
 13 | . ./scripts/modules_truba.sh > /dev/null
 14 | 
 15 | MAX_NUM_GPUS=8
 16 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
 17 | 
 18 | declare -A version_name_to_idx_map
 19 | 
 20 | version_name_to_idx_map["Baseline Copy"]=0
 21 | version_name_to_idx_map["Baseline Copy Overlap"]=1
 22 | version_name_to_idx_map["Baseline P2P"]=2
 23 | #version_name_to_idx_map["Baseline Single Copy"]=3
 24 | 
 25 | version_name_to_idx_map["Single Stream 1TB"]=4
 26 | version_name_to_idx_map["Single Stream 2TB"]=5
 27 | version_name_to_idx_map["Double Stream"]=6
 28 | 
 29 | version_name_to_idx_map["Baseline Copy (No compute)"]=7
 30 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
 31 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9
 32 | 
 33 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
 34 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
 35 | version_name_to_idx_map["Double Stream (No Compute)"]=12
 36 | 
 37 | declare -A version_name_to_idx_map_nvshmem
 38 | 
 39 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
 40 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1
 41 | 
 42 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
 43 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
 44 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4
 45 | 
 46 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5
 47 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6
 48 | 
 49 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7
 50 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8
 51 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9
 52 | 
 53 | 
 54 | BIN="./jacobi -s 1"
 55 | NV_BIN="./jacobi_nvshmem -s 1"
 56 | 
 57 | 
 58 | NUM_ITER=${NUM_ITER:-10000}
 59 | NUM_RUNS=${NUM_RUNS:-5}
 60 | 
 61 | # Parse "--NAME VALUE" pairs: each pair assigns VALUE to the shell variable NAME.
 62 | while [ $# -gt 0 ]; do
 63 |     if [[ $1 == --* ]]; then
 64 |         param="${1#--}"
 65 |         # Only words that START with "--" are options (the old *--* test also matched values containing "--").
 66 |         declare "$param=$2"
 67 |     fi
 68 |     shift
 69 | done
 70 | 
 71 | 
 72 | # Strong scaling: the grid stays at NX x NY while the GPU count grows from 1 to MAX_NUM_GPUS.
 73 | for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do
 74 |     NX=${STARTING_NX}
 75 |     NY=${NX}
 76 | 
 77 |     # Pass 1: plain CUDA binary; the GPU count is limited through CUDA_VISIBLE_DEVICES.
 78 |     for version_name in "${!version_name_to_idx_map[@]}"; do
 79 |         printf 'Running %s\n\n' "${version_name}"
 80 | 
 81 |         version_idx=${version_name_to_idx_map[$version_name]}
 82 | 
 83 |         for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS++ )); do
 84 |             export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
 85 | 
 86 |             echo "Num GPUS: ${NUM_GPUS}"
 87 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
 88 | 
 89 |             for (( run=1; run <= ${NUM_RUNS}; run++ )); do
 90 |                 # ${BIN} stays unquoted so that its embedded "-s 1" splits into separate arguments.
 91 |                 output=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
 92 |                 echo "${output} on run ${run}"
 93 |             done
 94 | 
 95 |             echo
 96 |         done
 97 | 
 98 |         echo "-------------------------------------"
 99 |     done
100 | 
101 |     # Pass 2: NVSHMEM binary launched through mpirun; the script reports the process count NP as the GPU count.
102 |     # CUDA_VISIBLE_DEVICES keeps whatever value the last iteration of pass 1 exported.
103 |     for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
104 |         printf 'Running %s\n\n' "${version_name}"
105 | 
106 |         version_idx=${version_name_to_idx_map_nvshmem[$version_name]}
107 | 
108 |         for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP++ )); do
109 |             echo "Num GPUS: ${NP}"
110 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
111 | 
112 |             for (( run=1; run <= ${NUM_RUNS}; run++ )); do
113 |                 output=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
114 |                 echo "${output} on run ${run}"
115 |             done
116 | 
117 |             echo
118 |         done
119 | 
120 |         echo "-------------------------------------"
121 |     done
122 | 
123 |     echo "#####################################"
124 | done


--------------------------------------------------------------------------------
/CG/results/cg_operation_breakdown_8A100.txt:
--------------------------------------------------------------------------------
 1 | Results per version; rows are matrices
 2 | 
 3 | Results for version Profiling Discrete Standard NVSHMEM =>
 4 | ,Dot 1 (+Reset),Dot 2 (+Reset),Global Reduction 1 (+Barrier),Global Reduction 2 (+Barrier),Memcpy Dot to Host 1,Memcpy Dot to Host 2,NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,SpMV
 5 | (generated)_tridiagonal,0.2094,0.1695,0.2585,0.2599,0.0680,0.0673,0.1108,0.1102,0.1827,0.2416,0.1814,0.5409
 6 | ecology2,0.0746,0.0736,0.2211,0.2304,0.0657,0.0653,0.0952,0.0971,0.0593,0.0588,0.0605,0.1004
 7 | hood,0.0705,0.0700,0.2160,0.2223,0.0652,0.0642,0.1086,0.0973,0.0553,0.0549,0.0559,0.4870
 8 | bmwcra_1,0.0700,0.0696,0.2146,0.2280,0.0650,0.0640,0.2559,0.0973,0.0549,0.0546,0.0557,0.9622
 9 | consph,0.0698,0.0694,0.2164,0.2261,0.0653,0.0648,0.2090,0.0971,0.0545,0.0544,0.0554,0.3745
10 | thermomech_dM,0.0701,0.0699,0.2182,0.2286,0.0649,0.0645,0.1132,0.0987,0.0553,0.0550,0.0561,0.2324
11 | tmt_sym,0.0735,0.0731,0.2180,0.2266,0.0662,0.0657,0.0945,0.0967,0.0580,0.0577,0.0590,0.1137
12 | crankseg_1,0.0702,0.0687,0.2249,0.2288,0.0669,0.0650,7.6064,0.0960,0.0543,0.0540,0.0554,1.6466
13 | crankseg_2,0.0717,0.0699,0.2342,0.2338,0.0684,0.0658,10.8982,0.0978,0.0547,0.0544,0.0556,1.8452
14 | Queen_4147,0.1025,0.0889,0.2284,0.2247,0.0685,0.0650,0.2232,0.0959,0.0722,0.0720,0.0681,15.1167
15 | Bump_2911,0.0957,0.0850,0.2273,0.2234,0.0686,0.0654,0.3731,0.0966,0.0672,0.0676,0.0653,4.7050
16 | G3_circuit,0.0763,0.0751,0.2168,0.2256,0.0654,0.0648,0.1236,0.0975,0.0600,0.0596,0.0608,0.2145
17 | StocF-1465,0.0835,0.0780,0.2241,0.2272,0.0659,0.0638,0.1718,0.0955,0.0616,0.0609,0.0605,0.5684
18 | Flan_1565,0.0857,0.0781,0.2274,0.2240,0.0676,0.0644,0.1596,0.0962,0.0621,0.0622,0.0606,4.5419
19 | audikw_1,0.0834,0.0769,0.2250,0.2208,0.0675,0.0643,0.0936,0.0959,0.0599,0.0595,0.0586,4.8515
20 | Serena,0.0836,0.0779,0.2228,0.2217,0.0669,0.0640,0.2073,0.0955,0.0613,0.0615,0.0600,2.6818
21 | Geo_1438,0.0842,0.0781,0.2225,0.2195,0.0674,0.0648,0.1707,0.0960,0.0620,0.0623,0.0605,2.2589
22 | Hook_1498,0.0831,0.0775,0.2219,0.2201,0.0665,0.0640,0.1343,0.0964,0.0617,0.0623,0.0606,1.8148
23 | ldoor,0.0808,0.0755,0.2255,0.2252,0.0667,0.0644,0.2791,0.0954,0.0593,0.0595,0.0585,0.8216
24 | 
25 | 
26 | Results for version Profiling Discrete Pipelined NVSHMEM (No Overlap) =>
27 | ,Global Reductions (+Barrier),Memcpy Dots To Host,Merged Dots (+Reset),NVSHMEM Barrier 1 (After SpMV),NVSHMEM Barrier 2 (End of Iteration),Saxpy 1,Saxpy 2,Saxpy 3,Saxpy 4,Saxpy 5,Saxpy 6,SpMV
28 | (generated)_tridiagonal,0.1818,0.0681,0.2480,0.1142,0.1210,0.1980,0.2435,0.2461,0.2417,0.2439,0.2443,0.5201
29 | ecology2,0.1751,0.0658,0.0788,0.1014,0.1104,0.0590,0.0588,0.0589,0.0586,0.0584,0.0584,0.0987
30 | hood,0.1740,0.0659,0.0724,0.1124,0.1051,0.0558,0.0558,0.0558,0.0553,0.0550,0.0552,0.4796
31 | bmwcra_1,0.1703,0.0669,0.0718,0.2503,0.1036,0.0552,0.0550,0.0549,0.0547,0.0544,0.0544,0.9414
32 | consph,0.1735,0.0667,0.0722,0.1561,0.1060,0.0556,0.0553,0.0554,0.0551,0.0549,0.0548,0.4441
33 | thermomech_dM,0.1741,0.0665,0.0726,0.0938,0.1028,0.0562,0.0561,0.0560,0.0557,0.0555,0.0556,0.2408
34 | tmt_sym,0.1746,0.0667,0.0764,0.0988,0.1040,0.0584,0.0582,0.0583,0.0577,0.0576,0.0578,0.1144
35 | crankseg_1,0.1733,0.0690,0.0765,7.5716,0.1040,0.0555,0.0550,0.0552,0.0548,0.0544,0.0546,1.7455
36 | crankseg_2,0.1758,0.0682,0.0777,10.8521,0.1061,0.0553,0.0550,0.0549,0.0546,0.0544,0.0545,1.8073
37 | Queen_4147,0.1791,0.0697,0.1071,0.2459,0.1102,0.0801,0.0792,0.0817,0.0730,0.0685,0.0717,15.1174
38 | Bump_2911,0.1799,0.0692,0.0949,0.1052,0.1058,0.0735,0.0728,0.0742,0.0676,0.0646,0.0646,4.9992
39 | G3_circuit,0.1746,0.0663,0.0823,0.0891,0.1030,0.0607,0.0605,0.0610,0.0604,0.0601,0.0601,0.2353
40 | StocF-1465,0.1827,0.0681,0.0828,0.0944,0.1040,0.0626,0.0625,0.0665,0.0621,0.0600,0.0596,0.6804
41 | Flan_1565,0.1787,0.0685,0.0846,0.3012,0.1051,0.0644,0.0653,0.0662,0.0623,0.0597,0.0598,4.4129
42 | audikw_1,0.1788,0.0699,0.0811,2.9359,0.1025,0.0606,0.0601,0.0620,0.0598,0.0578,0.0579,1.9612
43 | Serena,0.1805,0.0688,0.0828,0.3760,0.1068,0.0633,0.0625,0.0649,0.0609,0.0586,0.0588,2.5207
44 | Geo_1438,0.1797,0.0693,0.0825,0.1363,0.1035,0.0641,0.0633,0.0657,0.0613,0.0592,0.0590,2.3228
45 | Hook_1498,0.1791,0.0675,0.0827,0.1411,0.1048,0.0639,0.0634,0.0663,0.0615,0.0592,0.0593,1.8353
46 | ldoor,0.1811,0.0679,0.0798,0.1083,0.1043,0.0602,0.0608,0.0621,0.0597,0.0578,0.0578,0.9961
47 | 
48 | 
49 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/strong_scale_bench.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #SBATCH --job-name=stencil-bench
  4 | #SBATCH --ntasks=8
  5 | #SBATCH --gres=gpu:8
  6 | #SBATCH --partition hgx2q
  7 | #SBATCH --time=03:00:00
  8 | #SBATCH --output=sbatch_output_%j.log
  9 | 
 10 | . ./scripts/modules.sh > /dev/null
 11 | 
 12 | MAX_NUM_GPUS=8
 13 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
 14 | 
 15 | declare -A version_name_to_idx_map
 16 | 
 17 | version_name_to_idx_map["Baseline Copy"]=0
 18 | version_name_to_idx_map["Baseline Copy Overlap"]=1
 19 | version_name_to_idx_map["Baseline P2P"]=2
 20 | #version_name_to_idx_map["Baseline Single Copy"]=3
 21 | 
 22 | version_name_to_idx_map["Single Stream 1TB"]=4
 23 | version_name_to_idx_map["Single Stream 2TB"]=5
 24 | version_name_to_idx_map["Double Stream"]=6
 25 | 
 26 | version_name_to_idx_map["Baseline Copy (No compute)"]=7
 27 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
 28 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9
 29 | 
 30 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
 31 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
 32 | version_name_to_idx_map["Double Stream (No Compute)"]=12
 33 | 
 34 | declare -A version_name_to_idx_map_nvshmem
 35 | 
 36 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
 37 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1
 38 | 
 39 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
 40 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
 41 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4
 42 | 
 43 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5
 44 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6
 45 | 
 46 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7
 47 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8
 48 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9
 49 | 
 50 | BIN="./jacobi -s 1"
 51 | NV_BIN="./jacobi_nvshmem -s 1"
 52 | 
 53 | MAX_NX=${MAX_NX:-16384}
 54 | MAX_NY=${MAX_NY:-16384}
 55 | 
 56 | STARTING_NX=${STARTING_NX:-4096}
 57 | STARTING_NY=${STARTING_NY:-4096}
 58 | 
 59 | NUM_ITER=${NUM_ITER:-100000}
 60 | NUM_RUNS=${NUM_RUNS:-5}
 61 | 
 62 | # Parse "--NAME VALUE" pairs: each pair assigns VALUE to the shell variable NAME.
 63 | while [ $# -gt 0 ]; do
 64 |     if [[ $1 == --* ]]; then
 65 |         param="${1#--}"
 66 |         # Only words that START with "--" are options (the old *--* test also matched values containing "--").
 67 |         declare "$param=$2"
 68 |     fi
 69 |     shift
 70 | done
 71 | 
 72 | # Strong scaling: the grid stays at NX x NY while the GPU count grows from 1 to MAX_NUM_GPUS.
 73 | for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do
 74 |     NX=${STARTING_NX}
 75 |     NY=${NX}
 76 | 
 77 |     # Pass 1: plain CUDA binary; the GPU count is limited through CUDA_VISIBLE_DEVICES.
 78 |     for version_name in "${!version_name_to_idx_map[@]}"; do
 79 |         printf 'Running %s\n\n' "${version_name}"
 80 | 
 81 |         version_idx=${version_name_to_idx_map[$version_name]}
 82 | 
 83 |         for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS++ )); do
 84 |             export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
 85 | 
 86 |             echo "Num GPUS: ${NUM_GPUS}"
 87 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
 88 | 
 89 |             for (( run=1; run <= ${NUM_RUNS}; run++ )); do
 90 |                 # ${BIN} stays unquoted so that its embedded "-s 1" splits into separate arguments.
 91 |                 output=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
 92 |                 echo "${output} on run ${run}"
 93 |             done
 94 | 
 95 |             echo
 96 |         done
 97 | 
 98 |         echo "-------------------------------------"
 99 |     done
100 | 
101 |     # Pass 2: NVSHMEM binary launched through mpirun; the script reports the process count NP as the GPU count.
102 |     for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
103 |         printf 'Running %s\n\n' "${version_name}"
104 | 
105 |         version_idx=${version_name_to_idx_map_nvshmem[$version_name]}
106 | 
107 |         for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP++ )); do
108 |             echo "Num GPUS: ${NP}"
109 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
110 | 
111 |             for (( run=1; run <= ${NUM_RUNS}; run++ )); do
112 |                 output=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
113 |                 echo "${output} on run ${run}"
114 |             done
115 | 
116 |             echo
117 |         done
118 | 
119 |         echo "-------------------------------------"
120 |     done
121 | 
122 |     echo "#####################################"
123 | done
124 | 


--------------------------------------------------------------------------------
/Stencil/Makefile2:
--------------------------------------------------------------------------------
  1 | SELF_DIR := $(dir $(lastword $(MAKEFILE_LIST)))
  2 | include $(SELF_DIR)/../common.mk
  3 | # rwildcard, LINK, LINK_NVSHMEM, COMPILE and COMPILE_NVSHMEM are used below but not defined in this file; presumably common.mk provides them (TODO confirm).
  4 | BUILD_ROOT ?= bin
  5 | OBJ_ROOT ?= $(BUILD_ROOT)/obj
  6 | 
  7 | VERSION_2D = $(SELF_DIR)/jacobi2D
  8 | VERSION_3D = $(SELF_DIR)/jacobi3D
  9 | 
 10 | 2D_PERKS_WRAPPER = $(VERSION_2D)/PERKS/*.cu
 11 | # NOTE(review): SELF_DIR (from $(dir ...)) and the *_DIR values below all end in '/', so the patsubst patterns contain '//'; they only match if rwildcard returns paths with the same doubled slash -- verify.
 12 | # =========================================================================
 13 | # 2D
 14 | SRC_DIR_2D = $(VERSION_2D)/src/
 15 | OBJ_DIR_2D := $(OBJ_ROOT)/$(VERSION_2D)/
 16 | DEP_DIR_2D := $(OBJ_DIR_2D)/.deps/
 17 | 
 18 | SRCS_2D = $(call rwildcard,$(SRC_DIR_2D),*.cu)
 19 | OBJS_2D := $(patsubst $(SRC_DIR_2D)/%.cu, $(OBJ_DIR_2D)/%.o, $(SRCS_2D))
 20 | DEPS_2D := $(patsubst $(SRC_DIR_2D)/%.cu, $(DEP_DIR_2D)/%.d, $(SRCS_2D))
 21 | # =========================================================================
 22 | 
 23 | # =========================================================================
 24 | # 3D
 25 | SRC_DIR_3D = $(VERSION_3D)/src/
 26 | OBJ_DIR_3D := $(OBJ_ROOT)/$(VERSION_3D)/
 27 | DEP_DIR_3D := $(OBJ_DIR_3D)/.deps/
 28 | 
 29 | SRCS_3D = $(call rwildcard,$(SRC_DIR_3D),*.cu)
 30 | OBJS_3D := $(patsubst $(SRC_DIR_3D)/%.cu, $(OBJ_DIR_3D)/%.o, $(SRCS_3D))
 31 | DEPS_3D := $(patsubst $(SRC_DIR_3D)/%.cu, $(DEP_DIR_3D)/%.d, $(SRCS_3D))
 32 | # =========================================================================
 33 | 
 34 | # =========================================================================
 35 | # 2D NVSHMEM
 36 | SRC_DIR_2D_NVSHMEM = $(VERSION_2D)/src_nvshmem/
 37 | OBJ_DIR_2D_NVSHMEM := $(OBJ_ROOT)/$(VERSION_2D)_nvshmem/
 38 | DEP_DIR_2D_NVSHMEM := $(OBJ_DIR_2D_NVSHMEM)/.deps/
 39 | 
 40 | SRCS_2D_NVSHMEM = $(call rwildcard,$(SRC_DIR_2D_NVSHMEM),*.cu)
 41 | OBJS_2D_NVSHMEM := $(patsubst $(SRC_DIR_2D_NVSHMEM)/%.cu, $(OBJ_DIR_2D_NVSHMEM)/%.o, $(SRCS_2D_NVSHMEM))
 42 | DEPS_2D_NVSHMEM := $(patsubst $(SRC_DIR_2D_NVSHMEM)/%.cu, $(DEP_DIR_2D_NVSHMEM)/%.d, $(SRCS_2D_NVSHMEM))
 43 | # =========================================================================
 44 | 
 45 | # =========================================================================
 46 | # 3D NVSHMEM
 47 | SRC_DIR_3D_NVSHMEM = $(VERSION_3D)/src_nvshmem/
 48 | OBJ_DIR_3D_NVSHMEM := $(OBJ_ROOT)/$(VERSION_3D)_nvshmem/
 49 | DEP_DIR_3D_NVSHMEM := $(OBJ_DIR_3D_NVSHMEM)/.deps/
 50 | 
 51 | SRCS_3D_NVSHMEM = $(call rwildcard,$(SRC_DIR_3D_NVSHMEM),*.cu)
 52 | OBJS_3D_NVSHMEM := $(patsubst $(SRC_DIR_3D_NVSHMEM)/%.cu, $(OBJ_DIR_3D_NVSHMEM)/%.o, $(SRCS_3D_NVSHMEM))
 53 | DEPS_3D_NVSHMEM := $(patsubst $(SRC_DIR_3D_NVSHMEM)/%.cu, $(DEP_DIR_3D_NVSHMEM)/%.d, $(SRCS_3D_NVSHMEM))
 54 | # =========================================================================
 55 | .PHONY: stencil jacobi jacobi_nvshmem  # aggregate targets: no recipe here, so they never name a file this Makefile builds
 56 | stencil: jacobi jacobi_nvshmem
 57 | jacobi: jacobi2D jacobi3D
 58 | jacobi_nvshmem: jacobi2D_nvshmem jacobi3D_nvshmem
 59 | # =========================================================================
 60 | jacobi2D: $(OBJS_2D) $(2D_PERKS_WRAPPER)
 61 | 	$(LINK)
 62 | # NOTE(review): jacobi2D also lists $(2D_PERKS_WRAPPER), a *.cu glob, as a prerequisite; jacobi3D has no such extra input.
 63 | jacobi3D: $(OBJS_3D)
 64 | 	$(LINK)
 65 | # Static pattern rules: each object depends on its .cu source and its .d file, with the dep directory as an order-only prerequisite.
 66 | $(OBJS_2D) : $(OBJ_DIR_2D)/%.o : $(SRC_DIR_2D)/%.cu $(DEP_DIR_2D)/%.d | $(DEP_DIR_2D)
 67 | 	$(call COMPILE, $(DEP_DIR_2D))
 68 | 
 69 | $(OBJS_3D) : $(OBJ_DIR_3D)/%.o : $(SRC_DIR_3D)/%.cu $(DEP_DIR_3D)/%.d | $(DEP_DIR_3D)
 70 | 	$(call COMPILE, $(DEP_DIR_3D))
 71 | # =========================================================================
 72 | # NVSHMEM counterparts of the rules above, using LINK_NVSHMEM / COMPILE_NVSHMEM instead.
 73 | # =========================================================================
 74 | jacobi2D_nvshmem: $(OBJS_2D_NVSHMEM) $(2D_PERKS_WRAPPER)
 75 | 	$(LINK_NVSHMEM)
 76 | 
 77 | jacobi3D_nvshmem: $(OBJS_3D_NVSHMEM)
 78 | 	$(LINK_NVSHMEM)
 79 | 
 80 | $(OBJS_2D_NVSHMEM) : $(OBJ_DIR_2D_NVSHMEM)/%.o : $(SRC_DIR_2D_NVSHMEM)/%.cu $(DEP_DIR_2D_NVSHMEM)/%.d | $(DEP_DIR_2D_NVSHMEM)
 81 | 	$(call COMPILE_NVSHMEM, $(DEP_DIR_2D_NVSHMEM))
 82 | 
 83 | $(OBJS_3D_NVSHMEM) : $(OBJ_DIR_3D_NVSHMEM)/%.o : $(SRC_DIR_3D_NVSHMEM)/%.cu $(DEP_DIR_3D_NVSHMEM)/%.d | $(DEP_DIR_3D_NVSHMEM)
 84 | 	$(call COMPILE_NVSHMEM, $(DEP_DIR_3D_NVSHMEM))
 85 | # =========================================================================
 86 | # Create each dependency directory on demand; the object rules list these directories as order-only prerequisites.
 87 | $(DEP_DIR_2D):
 88 | 	@mkdir -p $(DEP_DIR_2D)
 89 | 
 90 | $(DEP_DIR_3D):
 91 | 	@mkdir -p $(DEP_DIR_3D)
 92 | 
 93 | $(DEP_DIR_2D_NVSHMEM):
 94 | 	@mkdir -p $(DEP_DIR_2D_NVSHMEM)
 95 | 
 96 | $(DEP_DIR_3D_NVSHMEM):
 97 | 	@mkdir -p $(DEP_DIR_3D_NVSHMEM)
 98 | # Empty rules for the .d files: the object rules list them as prerequisites, so a not-yet-generated .d must not be an error.
 99 | $(DEPS_2D):
100 | 
101 | $(DEPS_3D):
102 | 
103 | $(DEPS_2D_NVSHMEM):
104 | 
105 | $(DEPS_3D_NVSHMEM):
106 | # Pull in whichever .d files already exist; $(wildcard ...) drops the ones that do not.
107 | include $(wildcard $(DEPS_2D))
108 | include $(wildcard $(DEPS_3D))
109 | include $(wildcard $(DEPS_2D_NVSHMEM))
110 | include $(wildcard $(DEPS_3D_NVSHMEM))
111 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/weak_scale_comm_bench_truba.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #SBATCH -J stencil-bench-weak
  4 | #SBATCH -N 1
  5 | #SBATCH -n 8
  6 | #SBATCH -c 16
  7 | #SBATCH -A proj16
  8 | #SBATCH -p palamut-cuda
  9 | #SBATCH --gres=gpu:8
 10 | #SBATCH --time=24:00:00
 11 | #SBATCH -o stencil_bench_weak_output_%j.log
 12 | 
 13 | . ./scripts/modules_truba.sh > /dev/null
 14 | # CUDA_VISIBLE_DEVICES_SETTING[N] exposes N GPUs; entry 0 is unused because the GPU loop starts at 1.
 15 | MAX_NUM_GPUS=8
 16 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
 17 | 
 18 | # Label -> version index passed to ./jacobi via -v.
 19 | # (Declared once: the array used to be declared twice in a row, and the second declare was redundant.)
 20 | declare -A version_name_to_idx_map
 21 | 
 22 | version_name_to_idx_map["Baseline Copy"]=0
 23 | version_name_to_idx_map["Baseline Copy Overlap"]=1
 24 | version_name_to_idx_map["Baseline P2P"]=2
 25 | #version_name_to_idx_map["Baseline Single Copy"]=3
 26 | 
 27 | version_name_to_idx_map["Single Stream 1TB"]=4
 28 | version_name_to_idx_map["Single Stream 2TB"]=5
 29 | version_name_to_idx_map["Double Stream"]=6
 30 | 
 31 | version_name_to_idx_map["Baseline Copy (No compute)"]=7
 32 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
 33 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9
 34 | 
 35 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
 36 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
 37 | version_name_to_idx_map["Double Stream (No Compute)"]=12
 38 | # Label -> version index passed to ./jacobi_nvshmem via -v.
 39 | declare -A version_name_to_idx_map_nvshmem
 40 | 
 41 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
 42 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1
 43 | 
 44 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
 45 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
 46 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4
 47 | 
 48 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5
 49 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6
 50 | 
 51 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7
 52 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8
 53 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9
 54 | # Expanded unquoted at the call sites so that the embedded "-s 1" splits into separate arguments.
 55 | BIN="./jacobi -s 1"
 56 | NV_BIN="./jacobi_nvshmem -s 1"
 57 | # Defaults; a value from the environment or a later --NUM_ITER / --NUM_RUNS argument replaces them.
 58 | NUM_ITER=${NUM_ITER:-10000}
 59 | NUM_RUNS=${NUM_RUNS:-5}
 60 | 
 61 | # Parse "--NAME VALUE" pairs: each pair assigns VALUE to the shell variable NAME.
 62 | while [ $# -gt 0 ]; do
 63 |     if [[ $1 == --* ]]; then
 64 |         param="${1#--}"
 65 |         # Only words that START with "--" are options (the old *--* test also matched values containing "--").
 66 |         declare "$param=$2"
 67 |     fi
 68 |     shift
 69 | done
 70 | 
 71 | 
 72 | # Weak scaling: NX doubles every time the GPU count doubles; NY keeps its starting value.
 73 | for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do
 74 | 
 75 |     # Pass 1: plain CUDA binary; the GPU count is limited through CUDA_VISIBLE_DEVICES.
 76 |     for version_name in "${!version_name_to_idx_map[@]}"; do
 77 |         printf 'Running %s\n\n' "${version_name}"
 78 |         NX=${STARTING_NX}
 79 |         NY=${NX}
 80 |         version_idx=${version_name_to_idx_map[$version_name]}
 81 | 
 82 |         for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS*=2 )); do
 83 |             export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
 84 | 
 85 |             echo "Num GPUS: ${NUM_GPUS}"
 86 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
 87 | 
 88 |             for (( run=1; run <= ${NUM_RUNS}; run++ )); do
 89 |                 # ${BIN} stays unquoted so that its embedded "-s 1" splits into separate arguments.
 90 |                 output=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
 91 |                 echo "${output} on run ${run}"
 92 |             done
 93 | 
 94 |             echo
 95 | 
 96 |             # Grow the grid for the next (doubled) GPU count; NY is left unchanged.
 97 |             NX=$((NX * 2))
 98 |         done
 99 | 
100 |         echo "-------------------------------------"
101 |     done
102 | 
103 |     # Pass 2: NVSHMEM binary launched through mpirun; the script reports the process count NP as the GPU count.
104 |     for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
105 |         printf 'Running %s\n\n' "${version_name}"
106 |         NX=${STARTING_NX}
107 |         NY=${NX}
108 |         version_idx=${version_name_to_idx_map_nvshmem[$version_name]}
109 | 
110 |         for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP*=2 )); do
111 |             echo "Num GPUS: ${NP}"
112 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
113 | 
114 |             for (( run=1; run <= ${NUM_RUNS}; run++ )); do
115 |                 output=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
116 |                 echo "${output} on run ${run}"
117 |             done
118 | 
119 |             echo
120 | 
121 |             NX=$((NX * 2))
122 |         done
123 | 
124 |         echo "-------------------------------------"
125 |     done
126 | 
127 |     echo "#####################################"
128 | done
129 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/weak_scale_comp_bench_truba.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #SBATCH -J stencil-bench-weak
  4 | #SBATCH -N 1
  5 | #SBATCH -n 8
  6 | #SBATCH -c 16
  7 | #SBATCH -A proj16
  8 | #SBATCH -p palamut-cuda
  9 | #SBATCH --gres=gpu:8
 10 | #SBATCH --time=24:00:00
 11 | #SBATCH -o stencil_bench_weak_output_%j.log
 12 | 
 13 | # Weak-scaling benchmark driver for the 2D Jacobi stencil binaries.
 14 | #
 15 | # For every version listed in the two maps below, the GPU count is doubled
 16 | # from 1 up to MAX_NUM_GPUS and NY is doubled together with it while NX stays
 17 | # fixed. Each configuration is executed NUM_RUNS times and the output of the
 18 | # binary is echoed to the log.
 19 | #
 20 | # Variables can be overridden on the command line as "--NAME VALUE" pairs,
 21 | # e.g. "--NUM_ITER 1000 --NUM_RUNS 3".
 22 | 
 23 | . ./scripts/modules_truba.sh > /dev/null
 24 | 
 25 | MAX_NUM_GPUS=8
 26 | # Indexed by GPU count: entry N is exported as CUDA_VISIBLE_DEVICES for an
 27 | # N-GPU run. Entry 0 is never read because the loops start at one GPU.
 28 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
 29 | 
 30 | # Display name -> index passed to ./jacobi through -v.
 31 | declare -A version_name_to_idx_map
 32 | 
 33 | version_name_to_idx_map["Baseline Copy"]=0
 34 | version_name_to_idx_map["Baseline Copy Overlap"]=1
 35 | version_name_to_idx_map["Baseline P2P"]=2
 36 | #version_name_to_idx_map["Baseline Single Copy"]=3
 37 | 
 38 | version_name_to_idx_map["Single Stream 1TB"]=4
 39 | version_name_to_idx_map["Single Stream 2TB"]=5
 40 | version_name_to_idx_map["Double Stream"]=6
 41 | 
 42 | version_name_to_idx_map["Baseline Copy (No compute)"]=7
 43 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
 44 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9
 45 | 
 46 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
 47 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
 48 | version_name_to_idx_map["Double Stream (No Compute)"]=12
 49 | 
 50 | # Display name -> index passed to ./jacobi_nvshmem through -v.
 51 | declare -A version_name_to_idx_map_nvshmem
 52 | 
 53 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
 54 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1
 55 | 
 56 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
 57 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
 58 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4
 59 | 
 60 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5
 61 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6
 62 | 
 63 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7
 64 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8
 65 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9
 66 | 
 67 | # Command prefixes. They are expanded unquoted further down on purpose so
 68 | # that the embedded options are split into separate words.
 69 | BIN="./jacobi -s 1"
 70 | NV_BIN="./jacobi_nvshmem -s 1"
 71 | 
 72 | # Defaults; a value already present in the environment takes precedence.
 73 | NUM_ITER=${NUM_ITER:-10000}
 74 | NUM_RUNS=${NUM_RUNS:-5}
 75 | 
 76 | # Command-line overrides: "--NAME VALUE" assigns VALUE to the variable NAME.
 77 | # Only arguments that start with "--" are treated as names, and the
 78 | # assignment is passed to declare as one quoted word.
 79 | while [ $# -gt 0 ]; do
 80 |     if [[ $1 == --?* ]]; then
 81 |         declare "${1#--}=${2-}"
 82 |     fi
 83 |     shift
 84 | done
 85 | 
 86 | # The outer loop currently visits a single base size (16384).
 87 | for (( STARTING_NX=16384; STARTING_NX<=16384; STARTING_NX*=2 )); do
 88 | 
 89 |     # Versions started directly as one process.
 90 |     for version_name in "${!version_name_to_idx_map[@]}"; do
 91 |         echo "Running ${version_name}"; echo ""
 92 |         NX=${STARTING_NX}
 93 |         NY=${NX}
 94 | 
 95 |         version_idx=${version_name_to_idx_map[$version_name]}
 96 | 
 97 |         for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS*=2 )); do
 98 |             export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
 99 | 
100 |             echo "Num GPUS: ${NUM_GPUS}"
101 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
102 | 
103 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
104 |                 execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
105 |                 echo "${execution_time} on run ${i}"
106 |             done
107 | 
108 |             printf "\n"
109 | 
110 |             # Weak scaling: the grid grows together with the GPU count.
111 |             NY=$((2*NY))
112 | 
113 |         done
114 | 
115 |         echo "-------------------------------------"
116 |     done
117 | 
118 |     # NVSHMEM versions, started through mpirun with NP processes.
119 |     # CUDA_VISIBLE_DEVICES keeps the last value exported by the loop above.
120 |     for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
121 |         echo "Running ${version_name}"; echo ""
122 |         NX=${STARTING_NX}
123 |         NY=${NX}
124 | 
125 |         version_idx=${version_name_to_idx_map_nvshmem[$version_name]}
126 | 
127 |         for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP*=2 )); do
128 | 
129 |             echo "Num GPUS: ${NP}"
130 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
131 | 
132 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
133 |                 execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
134 |                 echo "${execution_time} on run ${i}"
135 |             done
136 | 
137 |             printf "\n"
138 | 
139 |             NY=$((2*NY))
140 | 
141 |         done
142 | 
143 |         echo "-------------------------------------"
144 |     done
145 | 
146 |     echo "#####################################"
147 | done
148 | 


--------------------------------------------------------------------------------
/Stencil/jacobi2D/scripts/weak_scale_bench.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #SBATCH --job-name=stencil-bench
  4 | #SBATCH --ntasks=8
  5 | #SBATCH --gres=gpu:8
  6 | #SBATCH --partition hgx2q
  7 | #SBATCH --time=03:00:00
  8 | #SBATCH --output=sbatch_output_%j.log
  9 | 
 10 | # Weak-scaling benchmark driver for the 2D Jacobi stencil binaries.
 11 | #
 12 | # For every version listed in the two maps below, the GPU count is doubled
 13 | # from 1 up to MAX_NUM_GPUS and NY is doubled together with it while NX stays
 14 | # fixed. Each configuration is executed NUM_RUNS times and the output of the
 15 | # binary is echoed to the log.
 16 | #
 17 | # Variables can be overridden on the command line as "--NAME VALUE" pairs,
 18 | # e.g. "--NUM_ITER 1000 --STARTING_NX 4096".
 19 | 
 20 | . ./scripts/modules.sh > /dev/null
 21 | 
 22 | MAX_NUM_GPUS=8
 23 | # Indexed by GPU count: entry N is exported as CUDA_VISIBLE_DEVICES for an
 24 | # N-GPU run. Entry 0 is never read because the loops start at one GPU.
 25 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
 26 | 
 27 | # Display name -> index passed to ./jacobi through -v.
 28 | declare -A version_name_to_idx_map
 29 | 
 30 | version_name_to_idx_map["Baseline Copy"]=0
 31 | version_name_to_idx_map["Baseline Copy Overlap"]=1
 32 | version_name_to_idx_map["Baseline P2P"]=2
 33 | #version_name_to_idx_map["Baseline Single Copy"]=3
 34 | 
 35 | version_name_to_idx_map["Single Stream 1TB"]=4
 36 | version_name_to_idx_map["Single Stream 2TB"]=5
 37 | version_name_to_idx_map["Double Stream"]=6
 38 | 
 39 | version_name_to_idx_map["Baseline Copy (No compute)"]=7
 40 | version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
 41 | version_name_to_idx_map["Baseline P2P (No Compute)"]=9
 42 | 
 43 | version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
 44 | version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
 45 | version_name_to_idx_map["Double Stream (No Compute)"]=12
 46 | 
 47 | # Display name -> index passed to ./jacobi_nvshmem through -v.
 48 | declare -A version_name_to_idx_map_nvshmem
 49 | 
 50 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
 51 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1
 52 | 
 53 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
 54 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
 55 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4
 56 | 
 57 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=5
 58 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=6
 59 | 
 60 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=7
 61 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=8
 62 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=9
 63 | 
 64 | # Command prefixes. They are expanded unquoted further down on purpose so
 65 | # that the embedded options are split into separate words.
 66 | BIN="./jacobi -s 1"
 67 | NV_BIN="./jacobi_nvshmem -s 1"
 68 | 
 69 | # Base grid sizes visited by the outer loop: STARTING_NX, doubled while it
 70 | # does not exceed MAX_NX. The defaults give the single 16384 run that used
 71 | # to be hard-coded in the loop header, which discarded any override.
 72 | MAX_NX=${MAX_NX:-16384}
 73 | STARTING_NX=${STARTING_NX:-16384}
 74 | 
 75 | # NOTE(review): MAX_NY and STARTING_NY are not referenced below (NY always
 76 | # starts at NX); they are kept so that existing overrides remain accepted.
 77 | MAX_NY=${MAX_NY:-16384}
 78 | STARTING_NY=${STARTING_NY:-4096}
 79 | 
 80 | NUM_ITER=${NUM_ITER:-1000000}
 81 | NUM_RUNS=${NUM_RUNS:-5}
 82 | 
 83 | # Command-line overrides: "--NAME VALUE" assigns VALUE to the variable NAME.
 84 | # Only arguments that start with "--" are treated as names, and the
 85 | # assignment is passed to declare as one quoted word.
 86 | while [ $# -gt 0 ]; do
 87 |     if [[ $1 == --?* ]]; then
 88 |         declare "${1#--}=${2-}"
 89 |     fi
 90 |     shift
 91 | done
 92 | 
 93 | # STARTING_NX is doubled by the outer loop, so it has to be positive for the
 94 | # loop to terminate.
 95 | if (( STARTING_NX < 1 )); then
 96 |     echo "STARTING_NX must be a positive integer (got '${STARTING_NX}')" >&2
 97 |     exit 1
 98 | fi
 99 | 
100 | for (( ; STARTING_NX <= MAX_NX; STARTING_NX*=2 )); do
101 | 
102 |     # Versions started directly as one process.
103 |     for version_name in "${!version_name_to_idx_map[@]}"; do
104 |         echo "Running ${version_name}"; echo ""
105 |         NX=${STARTING_NX}
106 |         NY=${NX}
107 | 
108 |         version_idx=${version_name_to_idx_map[$version_name]}
109 | 
110 |         for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS*=2 )); do
111 |             export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
112 | 
113 |             echo "Num GPUS: ${NUM_GPUS}"
114 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
115 | 
116 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
117 |                 execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
118 |                 echo "${execution_time} on run ${i}"
119 |             done
120 | 
121 |             printf "\n"
122 | 
123 |             # Weak scaling: the grid grows together with the GPU count.
124 |             NY=$((2*NY))
125 | 
126 |         done
127 | 
128 |         echo "-------------------------------------"
129 |     done
130 | 
131 |     # NVSHMEM versions, started through mpirun with NP processes.
132 |     # CUDA_VISIBLE_DEVICES keeps the last value exported by the loop above.
133 |     for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
134 |         echo "Running ${version_name}"; echo ""
135 |         NX=${STARTING_NX}
136 |         NY=${NX}
137 | 
138 |         version_idx=${version_name_to_idx_map_nvshmem[$version_name]}
139 | 
140 |         for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP*=2 )); do
141 | 
142 |             echo "Num GPUS: ${NP}"
143 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}"
144 | 
145 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
146 |                 execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -niter ${NUM_ITER})
147 |                 echo "${execution_time} on run ${i}"
148 |             done
149 | 
150 |             printf "\n"
151 | 
152 |             NY=$((2*NY))
153 |         done
154 | 
155 |         echo "-------------------------------------"
156 |     done
157 | 
158 |     echo "#####################################"
159 | done
160 | 


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/multi-node.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #SBATCH -J multi-node-test
  4 | #SBATCH --nodes 2
  5 | #SBATCH --gres=gpu:2
  6 | #SBATCH --ntasks=4
  7 | #SBATCH --ntasks-per-node=2 
  8 | 
  9 | #SBATCH --cpus-per-task 16
 10 | #SBATCH -A proj16
 11 | #SBATCH -p palamut-cuda
 12 | 
 13 | #SBATCH --time=1:00:00
 14 | #SBATCH -o %x_%j.log
 15 | 
 16 | # Multi-node run of the 3D Jacobi stencil binaries on a fixed grid.
 17 | #
 18 | # For every version listed in the maps below, the process/GPU count goes
 19 | # from 1 to MAX_NUM_GPUS in steps of one while the grid size stays the same.
 20 | # Each configuration is executed NUM_RUNS times and the output of the binary
 21 | # is echoed to the log.
 22 | #
 23 | # Variables can be overridden on the command line as "--NAME VALUE" pairs,
 24 | # e.g. "--NUM_ITER 1000 --NUM_RUNS 3".
 25 | 
 26 | . ./scripts/modules_truba.sh > /dev/null
 27 | 
 28 | # Largest process count to benchmark. Inside a Slurm job this follows the
 29 | # allocation (SLURM_NTASKS, 4 with the directives above) so that mpirun is
 30 | # not asked for more ranks than were requested; outside Slurm it stays at 8.
 31 | MAX_NUM_GPUS=${SLURM_NTASKS:-8}
 32 | # Indexed by GPU count: entry N is exported as CUDA_VISIBLE_DEVICES for an
 33 | # N-GPU run. Entry 0 is never read because the loops start at one GPU.
 34 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
 35 | 
 36 | # Display name -> index passed to ./jacobi through -v. Every entry is
 37 | # commented out, so the first benchmark loop below does not run anything.
 38 | declare -A version_name_to_idx_map
 39 | 
 40 | #version_name_to_idx_map["Baseline Copy"]=0
 41 | #version_name_to_idx_map["Baseline Copy Overlap"]=1
 42 | #version_name_to_idx_map["Baseline P2P"]=2
 43 | #version_name_to_idx_map["Baseline Single Copy"]=3
 44 | 
 45 | #version_name_to_idx_map["Single Stream 1TB"]=4
 46 | #version_name_to_idx_map["Single Stream 2TB"]=5
 47 | #version_name_to_idx_map["Double Stream"]=6
 48 | 
 49 | #version_name_to_idx_map["Baseline Copy (No compute)"]=7
 50 | #version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=8
 51 | #version_name_to_idx_map["Baseline P2P (No Compute)"]=9
 52 | 
 53 | #version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
 54 | #version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
 55 | #version_name_to_idx_map["Double Stream (No Compute)"]=12
 56 | 
 57 | # Display name -> index passed to ./jacobi_nvshmem through -v.
 58 | declare -A version_name_to_idx_map_nvshmem
 59 | 
 60 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
 61 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1
 62 | 
 63 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
 64 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
 65 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4
 66 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream All TB Partitoned"]=5
 67 | 
 68 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=6
 69 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=7
 70 | 
 71 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=8
 72 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=9
 73 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=10
 74 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream All TB Partitoned (No Compute)"]=11
 75 | 
 76 | 
 77 | # Command prefixes. They are expanded unquoted further down on purpose so
 78 | # that the embedded options are split into separate words.
 79 | BIN="./jacobi -s 1"
 80 | NV_BIN="./jacobi_nvshmem -s 1"
 81 | 
 82 | 
 83 | # Defaults; a value already present in the environment takes precedence.
 84 | NUM_ITER=${NUM_ITER:-10000}
 85 | NUM_RUNS=${NUM_RUNS:-5}
 86 | 
 87 | # Command-line overrides: "--NAME VALUE" assigns VALUE to the variable NAME.
 88 | # Only arguments that start with "--" are treated as names, and the
 89 | # assignment is passed to declare as one quoted word.
 90 | while [ $# -gt 0 ]; do
 91 |     if [[ $1 == --?* ]]; then
 92 |         declare "${1#--}=${2-}"
 93 |     fi
 94 |     shift
 95 | done
 96 | 
 97 | 
 98 | # The outer loop currently visits a single grid size (512 in each dimension).
 99 | for (( STARTING_NX=512; STARTING_NX<=512; STARTING_NX*=2 )); do
100 | 
101 |     NX=${STARTING_NX}
102 |     NY=${NX}
103 |     NZ=${NX}
104 | 
105 |     # Versions started directly as one process.
106 |     for version_name in "${!version_name_to_idx_map[@]}"; do
107 |         echo "Running ${version_name}"; echo ""
108 | 
109 |         version_idx=${version_name_to_idx_map[$version_name]}
110 | 
111 |         for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do
112 |             export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
113 | 
114 |             echo "Num GPUS: ${NUM_GPUS}"
115 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}"
116 | 
117 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
118 |                 execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz  ${NZ} -niter ${NUM_ITER})
119 |                 echo "${execution_time} on run ${i}"
120 |             done
121 | 
122 |             printf "\n"
123 | 
124 |         done
125 | 
126 |         echo "-------------------------------------"
127 |     done
128 | 
129 | 
130 |     # NVSHMEM versions, started through mpirun with NP processes.
131 |     for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
132 |         echo "Running ${version_name}"; echo ""
133 | 
134 |         version_idx=${version_name_to_idx_map_nvshmem[$version_name]}
135 | 
136 |         for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP+=1 )); do
137 | 
138 |             echo "Num GPUS: ${NP}"
139 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}"
140 | 
141 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
142 |                 execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz  ${NZ} -niter ${NUM_ITER})
143 |                 echo "${execution_time} on run ${i}"
144 |             done
145 | 
146 |             printf "\n"
147 | 
148 |         done
149 | 
150 |         echo "-------------------------------------"
151 |     done
152 | 
153 |     echo "#####################################"
154 | done


--------------------------------------------------------------------------------
/Stencil/jacobi3D/scripts/strong_scale_bench.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | #SBATCH --job-name=stencil-bench
  4 | #SBATCH --ntasks=8
  5 | #SBATCH --gres=gpu:8
  6 | #SBATCH --partition hgx2q
  7 | #SBATCH --time=06:00:00
  8 | #SBATCH --output=sbatch_output_%j.log
  9 | 
 10 | # Strong-scaling benchmark driver for the 3D Jacobi stencil binaries.
 11 | #
 12 | # For every version listed in the two maps below, the process/GPU count goes
 13 | # from 1 to MAX_NUM_GPUS in steps of one while the grid size stays the same.
 14 | # Each configuration is executed NUM_RUNS times and the output of the binary
 15 | # is echoed to the log.
 16 | #
 17 | # Variables can be overridden on the command line as "--NAME VALUE" pairs,
 18 | # e.g. "--NUM_ITER 1000 --NUM_RUNS 3".
 19 | 
 20 | . ./scripts/modules.sh > /dev/null
 21 | 
 22 | MAX_NUM_GPUS=8
 23 | # Indexed by GPU count: entry N is exported as CUDA_VISIBLE_DEVICES for an
 24 | # N-GPU run. Entry 0 is never read because the loops start at one GPU.
 25 | CUDA_VISIBLE_DEVICES_SETTING=("0" "0" "0,1" "0,1,2" "0,1,2,3" "0,1,2,3,4" "0,1,2,3,4,5" "0,1,2,3,4,5,6" "0,1,2,3,4,5,6,7" )
 26 | 
 27 | # Display name -> index passed to ./jacobi through -v.
 28 | # NOTE(review): two of the commented-out entries below ("Baseline P2P (No
 29 | # Compute)" and "Single Stream 1TB (No Compute)") share index 10; confirm the
 30 | # indices against the binary before re-enabling them.
 31 | declare -A version_name_to_idx_map
 32 | 
 33 | #version_name_to_idx_map["Baseline Copy"]=0
 34 | version_name_to_idx_map["Baseline Copy Overlap"]=1
 35 | version_name_to_idx_map["Baseline P2P"]=2
 36 | #version_name_to_idx_map["Baseline Single Copy"]=3
 37 | 
 38 | #version_name_to_idx_map["Single Stream 1TB"]=4
 39 | #version_name_to_idx_map["Single Stream 2TB"]=5
 40 | #version_name_to_idx_map["Double Stream"]=6
 41 | #version_name_to_idx_map["PERKS"]=7
 42 | 
 43 | #version_name_to_idx_map["Baseline Copy (No compute)"]=8
 44 | #version_name_to_idx_map["Baseline Copy Overlap (No Compute)"]=9
 45 | #version_name_to_idx_map["Baseline P2P (No Compute)"]=10
 46 | 
 47 | #version_name_to_idx_map["Single Stream 1TB (No Compute)"]=10
 48 | #version_name_to_idx_map["Single Stream 2TB (No Compute)"]=11
 49 | #version_name_to_idx_map["Double Stream (No Compute)"]=12
 50 | 
 51 | # Display name -> index passed to ./jacobi_nvshmem through -v.
 52 | declare -A version_name_to_idx_map_nvshmem
 53 | 
 54 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline"]=0
 55 | version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized"]=1
 56 | 
 57 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB"]=2
 58 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB"]=3
 59 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream"]=4
 60 | version_name_to_idx_map_nvshmem["NVSHMEM Single Stream Partitoned"]=5
 61 | version_name_to_idx_map_nvshmem["NVSHMEM Double Stream Partitoned"]=6
 62 | 
 63 | version_name_to_idx_map_nvshmem["NVSHMEM PERKS"]=13
 64 | 
 65 | #version_name_to_idx_map_nvshmem["NVSHMEM Baseline (No Compute)"]=7
 66 | #version_name_to_idx_map_nvshmem["NVSHMEM Baseline Optimized (No Compute)"]=8
 67 | 
 68 | #version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 1TB (No Compute)"]=9
 69 | #version_name_to_idx_map_nvshmem["NVSHMEM Single Stream 2TB (No Compute)"]=10
 70 | #version_name_to_idx_map_nvshmem["NVSHMEM Double Stream (No Compute)"]=11
 71 | #version_name_to_idx_map_nvshmem["NVSHMEM Single Stream Partitoned (No Compute)"]=12
 72 | 
 73 | # Command prefixes. They are expanded unquoted further down on purpose so
 74 | # that the embedded options are split into separate words.
 75 | BIN="./jacobi -s 1"
 76 | NV_BIN="./jacobi_nvshmem -s 1"
 77 | 
 78 | # Defaults; a value already present in the environment takes precedence.
 79 | NUM_ITER=${NUM_ITER:-100000}
 80 | NUM_RUNS=${NUM_RUNS:-5}
 81 | 
 82 | # Command-line overrides: "--NAME VALUE" assigns VALUE to the variable NAME.
 83 | # Only arguments that start with "--" are treated as names, and the
 84 | # assignment is passed to declare as one quoted word.
 85 | while [ $# -gt 0 ]; do
 86 |     if [[ $1 == --?* ]]; then
 87 |         declare "${1#--}=${2-}"
 88 |     fi
 89 |     shift
 90 | done
 91 | 
 92 | 
 93 | # The outer loop currently visits a single grid size (512 in each dimension).
 94 | for (( STARTING_NX=512; STARTING_NX<=512; STARTING_NX*=2 )); do
 95 | 
 96 |     NX=${STARTING_NX}
 97 |     NY=${NX}
 98 |     NZ=${NX}
 99 | 
100 |     # Versions started directly as one process.
101 |     for version_name in "${!version_name_to_idx_map[@]}"; do
102 |         echo "Running ${version_name}"; echo ""
103 |         version_idx=${version_name_to_idx_map[$version_name]}
104 | 
105 |         for (( NUM_GPUS=1; NUM_GPUS <= ${MAX_NUM_GPUS}; NUM_GPUS+=1 )); do
106 |             export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES_SETTING[${NUM_GPUS}]}
107 | 
108 |             echo "Num GPUS: ${NUM_GPUS}"
109 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}"
110 | 
111 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
112 |                 execution_time=$(${BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz  ${NZ} -niter ${NUM_ITER})
113 |                 echo "${execution_time} on run ${i}"
114 |             done
115 | 
116 |             printf "\n"
117 | 
118 |         done
119 | 
120 |         echo "-------------------------------------"
121 |     done
122 | 
123 | 
124 |     # NVSHMEM versions, started through mpirun with NP processes.
125 |     # CUDA_VISIBLE_DEVICES keeps the last value exported by the loop above.
126 |     for version_name in "${!version_name_to_idx_map_nvshmem[@]}"; do
127 |         echo "Running ${version_name}"; echo ""
128 |         version_idx=${version_name_to_idx_map_nvshmem[$version_name]}
129 | 
130 |         for (( NP=1; NP <= ${MAX_NUM_GPUS}; NP+=1 )); do
131 | 
132 |             echo "Num GPUS: ${NP}"
133 |             echo "${NUM_ITER} iterations on grid ${NX}x${NY}x${NZ}"
134 | 
135 |             for (( i=1; i <= ${NUM_RUNS}; i++ )); do
136 |                 execution_time=$(mpirun -np ${NP} ${NV_BIN} -v ${version_idx} -nx ${NX} -ny ${NY} -nz  ${NZ} -niter ${NUM_ITER})
137 |                 echo "${execution_time} on run ${i}"
138 |             done
139 | 
140 |             printf "\n"
141 | 
142 |         done
143 | 
144 |         echo "-------------------------------------"
145 |     done
146 | 
147 |     echo "#####################################"
148 | 
149 | done


--------------------------------------------------------------------------------
/CG/scripts/calculate_speedup.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import numpy as np
  3 | import pandas as pd
  4 | from os.path import dirname, realpath
  5 | import argparse
  6 | 
  7 | MATRIX_NAMES = [
  8 |     # 'tridiagonal',
  9 |     'ecology2',
 10 |     #   'shallow_water2', Too little non-zeros
 11 |     #   'Trefethen_2000', Too little non-zeros
 12 |     'hood',
 13 |     'bmwcra_1',
 14 |     'consph',
 15 |     'thermomech_dM',
 16 |     'tmt_sym',
 17 |     'crankseg_1',
 18 |     'crankseg_2',
 19 |     'Queen_4147',
 20 |     'Bump_2911',
 21 |     'G3_circuit',
 22 |     'StocF-1465',
 23 |     'Flan_1565',
 24 |     'audikw_1',
 25 |     'Serena',
 26 |     'Geo_1438',
 27 |     'Hook_1498',
 28 |     #   'bone010', Multi-part matrix, don't handle those for now
 29 |     'ldoor'
 30 | ]
 31 | 
 32 | VERSIONS_TO_KEEP = [
 33 |     'CPU-Controlled Standard CG (Baseline)',
 34 |     'CPU-Controlled Pipelined CG (Baseline)',
 35 |     'CPU-Free Standard CG (Ours)',
 36 |     'CPU-Free Pipelined CG (Ours)'
 37 | ]
 38 | 
 39 | dir_path = dirname(realpath(__file__))
 40 | 
 41 | # First file should be the full CSV file
 42 | # Second should be the SingleGPU runtimes
 43 | parser = argparse.ArgumentParser()
 44 | parser.add_argument('files', type=argparse.FileType('r'), nargs='+')
 45 | files = parser.parse_args().files
 46 | 
 47 | full_runtimes_csv = files[0]
 48 | single_gpu_runtimes_csv = files[1]
 49 | 
 50 | if __name__ == "__main__":
 51 |     dir_path = dirname(realpath(__file__))
 52 | 
 53 |     SAVE_RESULT_TO_DIR_PATH = dir_path + '/../results'
 54 | 
 55 |     # Skip first line
 56 |     full_runtimes_csv.readline()
 57 | 
 58 |     data = pd.read_csv(full_runtimes_csv, index_col='Matrix')
 59 |     data = data.sort_index()
 60 | 
 61 |     single_gpu_baseline_standard_runtimes = pd.read_csv(
 62 |         single_gpu_runtimes_csv, index_col='Matrix')['Runtime']
 63 |     single_gpu_baseline_standard_runtimes = single_gpu_baseline_standard_runtimes.sort_index()
 64 | 
 65 |     matrix_names = np.unique(
 66 |         [matrix_name for matrix_name, _ in data.iterrows()])
 67 | 
 68 |     gpu_num_column_labels = [column_label
 69 |                              for column_label in data.columns if 'GPU' in column_label]
 70 | 
 71 |     for matrix_name in matrix_names:
 72 |         if matrix_name not in MATRIX_NAMES:
 73 |             data.drop(matrix_name, inplace=True)
 74 |             single_gpu_baseline_standard_runtimes.drop(
 75 |                 matrix_name, inplace=True)
 76 | 
 77 |     for gpu_num_column_label in gpu_num_column_labels:
 78 |         per_gpu_num_data = data[['Version', gpu_num_column_label]]
 79 |         per_gpu_num_data = per_gpu_num_data.pivot_table(
 80 |             gpu_num_column_label, 'Matrix', 'Version')
 81 | 
 82 |         per_gpu_num_data = pd.DataFrame(
 83 |             per_gpu_num_data, columns=VERSIONS_TO_KEEP)
 84 | 
 85 |         per_gpu_num_speedup = 1 / per_gpu_num_data.div(
 86 |             single_gpu_baseline_standard_runtimes, axis=0)
 87 |             
 88 |         pipelined_cg_speedup = per_gpu_num_speedup['CPU-Free Pipelined CG (Ours)'] / \
 89 |             per_gpu_num_speedup['CPU-Controlled Pipelined CG (Baseline)']
 90 |         standard_cg_speedup = per_gpu_num_speedup['CPU-Free Standard CG (Ours)'] / \
 91 |             per_gpu_num_speedup['CPU-Controlled Standard CG (Baseline)']
 92 | 
 93 |         pipelined_cg_geo_mean_spedup = np.exp(
 94 |             np.log(pipelined_cg_speedup).mean())
 95 |         standard_cg_geo_mean_spedup = np.exp(
 96 |             np.log(standard_cg_speedup).mean())
 97 | 
 98 |         pipelined_speedup_file_path = SAVE_RESULT_TO_DIR_PATH + \
 99 |             '/pipelined_speedup/pipelined_cg_speedup_' + gpu_num_column_label + '.txt'
100 |         standard_speedup_file_path = SAVE_RESULT_TO_DIR_PATH + \
101 |             '/standard_speedup/pipelined_cg_speedup_' + gpu_num_column_label + '.txt'
102 | 
103 |         with open(pipelined_speedup_file_path, 'w') as pipelined_speedup_file:
104 |             pipelined_cg_speedup.to_string(
105 |                 pipelined_speedup_file, header=False)
106 |             pipelined_speedup_file.write('\n')
107 |             pipelined_speedup_file.write(
108 |                 f'Persistent vs Discrete Pipelined CG geo mean speedup on {gpu_num_column_label}: {pipelined_cg_geo_mean_spedup}')
109 | 
110 |         with open(standard_speedup_file_path, 'w') as standard_speedup_file:
111 |             standard_cg_speedup.to_string(
112 |                 standard_speedup_file, header=False)
113 |             standard_speedup_file.write('\n')
114 |             standard_speedup_file.write(
115 |                 f'Persistent vs Discrete Standard CG geo mean speedup on {gpu_num_column_label}: {standard_cg_geo_mean_spedup}')
116 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
  1 | ---
  2 | Language:        Cpp
  3 | BasedOnStyle:  Google
  4 | AccessModifierOffset: -1
  5 | AlignAfterOpenBracket: Align
  6 | AlignConsecutiveAssignments: false
  7 | AlignConsecutiveDeclarations: false
  8 | AlignEscapedNewlines: Left
  9 | AlignOperands:   true
 10 | AlignTrailingComments: true
 11 | AllowAllParametersOfDeclarationOnNextLine: true
 12 | AllowShortBlocksOnASingleLine: false
 13 | AllowShortCaseLabelsOnASingleLine: false
 14 | AllowShortFunctionsOnASingleLine: All
 15 | AllowShortIfStatementsOnASingleLine: true
 16 | AllowShortLoopsOnASingleLine: true
 17 | AlwaysBreakAfterDefinitionReturnType: None
 18 | AlwaysBreakAfterReturnType: None
 19 | AlwaysBreakBeforeMultilineStrings: true
 20 | AlwaysBreakTemplateDeclarations: Yes
 21 | BinPackArguments: true
 22 | BinPackParameters: true
 23 | BraceWrapping:   
 24 |   AfterClass:      false
 25 |   AfterControlStatement: false
 26 |   AfterEnum:       false
 27 |   AfterFunction:   false
 28 |   AfterNamespace:  false
 29 |   AfterObjCDeclaration: false
 30 |   AfterStruct:     false
 31 |   AfterUnion:      false
 32 |   AfterExternBlock: false
 33 |   BeforeCatch:     false
 34 |   BeforeElse:      false
 35 |   IndentBraces:    false
 36 |   SplitEmptyFunction: true
 37 |   SplitEmptyRecord: true
 38 |   SplitEmptyNamespace: true
 39 | BreakBeforeBinaryOperators: None
 40 | BreakBeforeBraces: Attach
 41 | BreakBeforeInheritanceComma: false
 42 | BreakInheritanceList: BeforeColon
 43 | BreakBeforeTernaryOperators: true
 44 | BreakConstructorInitializersBeforeComma: false
 45 | BreakConstructorInitializers: BeforeColon
 46 | BreakAfterJavaFieldAnnotations: false
 47 | BreakStringLiterals: true
 48 | ColumnLimit:     100
 49 | CommentPragmas:  '^ IWYU pragma:'
 50 | CompactNamespaces: false
 51 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
 52 | ConstructorInitializerIndentWidth: 4
 53 | ContinuationIndentWidth: 4
 54 | Cpp11BracedListStyle: true
 55 | DerivePointerAlignment: true
 56 | DisableFormat:   false
 57 | ExperimentalAutoDetectBinPacking: false
 58 | FixNamespaceComments: true
 59 | ForEachMacros:   
 60 |   - foreach
 61 |   - Q_FOREACH
 62 |   - BOOST_FOREACH
 63 | IncludeBlocks:   Preserve
 64 | IncludeCategories: 
 65 |   - Regex:           '^<ext/.*\.h>'
 66 |     Priority:        2
 67 |   - Regex:           '^<.*\.h>'
 68 |     Priority:        1
 69 |   - Regex:           '^<.*'
 70 |     Priority:        2
 71 |   - Regex:           '.*'
 72 |     Priority:        3
 73 | IncludeIsMainRegex: '([-_](test|unittest))?$'
 74 | IndentCaseLabels: true
 75 | IndentPPDirectives: None
 76 | IndentWidth:     4
 77 | IndentWrappedFunctionNames: false
 78 | JavaScriptQuotes: Leave
 79 | JavaScriptWrapImports: true
 80 | KeepEmptyLinesAtTheStartOfBlocks: false
 81 | MacroBlockBegin: ''
 82 | MacroBlockEnd:   ''
 83 | MaxEmptyLinesToKeep: 1
 84 | NamespaceIndentation: None
 85 | ObjCBinPackProtocolList: Never
 86 | ObjCBlockIndentWidth: 4
 87 | ObjCSpaceAfterProperty: false
 88 | ObjCSpaceBeforeProtocolList: true
 89 | PenaltyBreakAssignment: 2
 90 | PenaltyBreakBeforeFirstCallParameter: 1
 91 | PenaltyBreakComment: 300
 92 | PenaltyBreakFirstLessLess: 120
 93 | PenaltyBreakString: 1000
 94 | PenaltyBreakTemplateDeclaration: 10
 95 | PenaltyExcessCharacter: 1000000
 96 | PenaltyReturnTypeOnItsOwnLine: 200
 97 | PointerAlignment: Left
 98 | RawStringFormats: 
 99 |   - Language:        Cpp
100 |     Delimiters:      
101 |       - cc
102 |       - CC
103 |       - cpp
104 |       - Cpp
105 |       - CPP
106 |       - 'c++'
107 |       - 'C++'
108 |     CanonicalDelimiter: ''
109 |     BasedOnStyle:    Google
110 |   - Language:        TextProto
111 |     Delimiters:      
112 |       - pb
113 |       - PB
114 |       - proto
115 |       - PROTO
116 |     EnclosingFunctions: 
117 |       - EqualsProto
118 |       - EquivToProto
119 |       - PARSE_PARTIAL_TEXT_PROTO
120 |       - PARSE_TEST_PROTO
121 |       - PARSE_TEXT_PROTO
122 |       - ParseTextOrDie
123 |       - ParseTextProtoOrDie
124 |     CanonicalDelimiter: ''
125 |     BasedOnStyle:    Google
126 | ReflowComments:  true
127 | SortIncludes:    true
128 | SortUsingDeclarations: true
129 | SpaceAfterCStyleCast: false
130 | SpaceAfterTemplateKeyword: true
131 | SpaceBeforeAssignmentOperators: true
132 | SpaceBeforeCpp11BracedList: false
133 | SpaceBeforeCtorInitializerColon: true
134 | SpaceBeforeInheritanceColon: true
135 | SpaceBeforeParens: ControlStatements
136 | SpaceBeforeRangeBasedForLoopColon: true
137 | SpaceInEmptyParentheses: false
138 | SpacesBeforeTrailingComments: 2
139 | SpacesInAngles:  false
140 | SpacesInContainerLiterals: true
141 | SpacesInCStyleCastParentheses: false
142 | SpacesInParentheses: false
143 | SpacesInSquareBrackets: false
144 | Standard:        Auto
145 | TabWidth:        8
146 | UseTab:          Never


--------------------------------------------------------------------------------