├── .clang-format ├── .gitignore ├── Benchmarks ├── tea_bm_1.in ├── tea_bm_1.out ├── tea_bm_2.in ├── tea_bm_2.out ├── tea_bm_3.in ├── tea_bm_3.out ├── tea_bm_4.in ├── tea_bm_4.out ├── tea_bm_5.in ├── tea_bm_5.out ├── tea_bm_5e_1.in ├── tea_bm_5e_1_2.in ├── tea_bm_5e_1_4.in ├── tea_bm_5e_2.in ├── tea_bm_5e_2_2.in ├── tea_bm_5e_2_4.in ├── tea_bm_5e_4.in ├── tea_bm_5e_4_2.in ├── tea_bm_5e_4_4.in ├── tea_bm_5e_8.in ├── tea_bm_5e_8_2.in ├── tea_bm_5e_8_4.in ├── tea_bm_6.in └── tea_bm_6.out ├── CMakeLists.txt ├── README.md ├── build.sh ├── cmake └── register_models.cmake ├── driver ├── application.h ├── cg_driver.cpp ├── cheby_driver.cpp ├── chunk.cpp ├── chunk.h ├── comms.cpp ├── comms.h ├── diffuse.cpp ├── drivers.h ├── eigenvalue_driver.cpp ├── field_summary_driver.cpp ├── halo_update_driver.cpp ├── initialise.cpp ├── jacobi_driver.cpp ├── kernel_initialise_driver.cpp ├── kernel_interface.h ├── main.cpp ├── mpi_shim.cpp ├── mpi_shim.h ├── parse_config.cpp ├── ppcg_driver.cpp ├── profiler.cpp ├── profiler.h ├── remote_halo_driver.cpp ├── set_chunk_data_driver.cpp ├── set_chunk_state_driver.cpp ├── settings.cpp ├── settings.h ├── shared.cpp ├── shared.h ├── solve_finished_driver.cpp └── store_energy_driver.cpp ├── src ├── cuda │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── cuknl_shared.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── hip │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── cuknl_shared.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── kokkos │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── kokkos_shared.hpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── omp │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── 
diffuse_overload.cpp │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── serial │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── std-indices │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── dpl_shim.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ ├── ranged.h │ ├── solver_methods.cpp │ └── std_shared.h ├── sycl-acc │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ ├── solver_methods.cpp │ └── sycl_shared.hpp └── sycl-usm │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ ├── solver_methods.cpp │ └── sycl_shared.hpp ├── tea.in ├── tea.problems └── test.sh /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AllowShortIfStatementsOnASingleLine: Always 3 | AllowShortCaseLabelsOnASingleLine: true 4 | AllowShortFunctionsOnASingleLine: All 5 | AlignEscapedNewlines: Left 6 | IndentCaseLabels: true 7 | ColumnLimit: 140 8 | CompactNamespaces: true 9 | FixNamespaceComments: true 10 | IndentPPDirectives: BeforeHash 11 | ... 
12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.dat 2 | *.o 3 | *.a 4 | *~ 5 | run 6 | submit 7 | submit.* 8 | *.mod 9 | .*.swp 10 | .DS_STORE 11 | tea.out 12 | *.ptx 13 | *.cub 14 | *.lst 15 | tealeaf 16 | tealeaf.* 17 | .ycm_extra_conf.py* 18 | *.optrpt 19 | src.* 20 | tags 21 | 22 | # CMake 23 | cmake-build-*/ 24 | Build/ 25 | build/ 26 | build_*/ 27 | .idea/ 28 | 29 | # IntelliJ 30 | out/ 31 | 32 | 33 | # KDE directory preferences 34 | .directory 35 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_1.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=10 8 | y_cells=10 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 1 19 | use_c_kernels 20 | *endtea 21 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle 
xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=250 8 | y_cells=250 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 2 19 | use_c_kernels 20 | *endtea 21 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_3.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=500 8 | y_cells=500 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 3 19 | use_c_kernels 20 | *endtea 21 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 4 19 | use_c_kernels 20 | *endtea 21 | 
-------------------------------------------------------------------------------- /Benchmarks/tea_bm_5.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_1.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_1_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 
geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_1_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 
geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=2000 8 | y_cells=2000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_2_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=2000 8 | y_cells=2000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_2_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=2000 8 | y_cells=2000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | 
profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_4_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_4_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 
energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_8.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_8_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 
ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_8_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_6.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 
1.0e-15 18 | test_problem 6 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | module load cmake -------------------------------------------------------------------------------- /driver/application.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk.h" 4 | 5 | #define TEALEAF_VERSION "2.000" 6 | 7 | void initialise_model_info(Settings &settings); 8 | void initialise_application(Chunk **chunks, Settings &settings, State * states); 9 | bool diffuse(Chunk *chunk, Settings &settings); 10 | void read_config(Settings &settings, State **states); 11 | 12 | #ifdef DIFFUSE_OVERLOAD 13 | bool diffuse_overload(Chunk *chunk, Settings &settings); 14 | #endif 15 | -------------------------------------------------------------------------------- /driver/cg_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | 6 | // Performs a full solve with the CG solver kernels 7 | void cg_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error) { 8 | int tt; 9 | double rro = 0.0; 10 | 11 | // Perform CG initialisation 12 | cg_init_driver(chunks, settings, rx, ry, &rro); 13 | 14 | // Iterate till convergence 15 | for (tt = 0; tt < settings.max_iters; ++tt) { 16 | cg_main_step_driver(chunks, settings, tt, &rro, error); 17 | 18 | halo_update_driver(chunks, settings, 1); 19 | 20 | if (sqrt(fabs(*error)) < settings.eps) break; 21 | } 22 | 23 | print_and_log(settings, " CG: \t\t\t%d iterations\n", tt); 24 | } 25 | 26 | // Invokes the CG initialisation kernels 27 | void cg_init_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *rro) { 
28 | *rro = 0.0; 29 | 30 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 31 | if (settings.kernel_language == Kernel_Language::C) { 32 | run_cg_init(&(chunks[cc]), settings, rx, ry, rro); 33 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 34 | } 35 | } 36 | 37 | // Need to update for the matvec 38 | reset_fields_to_exchange(settings); 39 | settings.fields_to_exchange[FIELD_U] = true; 40 | settings.fields_to_exchange[FIELD_P] = true; 41 | halo_update_driver(chunks, settings, 1); 42 | 43 | sum_over_ranks(settings, rro); 44 | 45 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 46 | if (settings.kernel_language == Kernel_Language::C) { 47 | run_copy_u(&(chunks[cc]), settings); 48 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 49 | } 50 | } 51 | } 52 | 53 | // Invokes the main CG solve kernels 54 | void cg_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *rro, double *error) { 55 | double pw = 0.0; 56 | 57 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 58 | if (settings.kernel_language == Kernel_Language::C) { 59 | run_cg_calc_w(&(chunks[cc]), settings, &pw); 60 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 61 | } 62 | } 63 | 64 | sum_over_ranks(settings, &pw); 65 | 66 | double alpha = *rro / pw; 67 | double rrn = 0.0; 68 | 69 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 70 | // TODO: Some redundancy across chunks?? 71 | chunks[cc].cg_alphas[tt] = alpha; 72 | 73 | if (settings.kernel_language == Kernel_Language::C) { 74 | run_cg_calc_ur(&(chunks[cc]), settings, alpha, &rrn); 75 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 76 | } 77 | } 78 | 79 | sum_over_ranks(settings, &rrn); 80 | 81 | double beta = rrn / *rro; 82 | 83 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 84 | // TODO: Some redundancy across chunks?? 
85 | chunks[cc].cg_betas[tt] = beta; 86 | 87 | if (settings.kernel_language == Kernel_Language::C) { 88 | run_cg_calc_p(&(chunks[cc]), settings, beta); 89 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 90 | } 91 | } 92 | 93 | *error = rrn; 94 | *rro = rrn; 95 | } 96 | -------------------------------------------------------------------------------- /driver/chunk.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "chunk.h" 6 | 7 | // static void dump_data(FILE *out, const char *name, double *data, int size) { 8 | // auto *host = static_cast(std::malloc(size * sizeof(double))); 9 | // std::memcpy(host, data, size * sizeof(double)); 10 | // 11 | // bool all_zero = true; 12 | // for (int i = 0; i < size; i++) { 13 | // if (host[i] != 0.0) { 14 | // all_zero = false; 15 | // break; 16 | // } 17 | // } 18 | // 19 | // std::fprintf(out, "[%s,+0]", name); 20 | // if (all_zero) { 21 | // std::fprintf(out, "(0.0 * %d)", size); 22 | // } else { 23 | // for (int i = 0; i < size; i++) { 24 | // std::fprintf(out, "%.5f,", host[i]); 25 | // if (i % 20 == 0) { 26 | // std::fprintf(out, "\n[%s,+%d]", name, i); 27 | // } 28 | // } 29 | // } 30 | // std::fprintf(out, "\n"); 31 | // free(host); 32 | // } 33 | // 34 | // void dump_chunk(const char *prefix, const char *suffix, Chunk *chunk, Settings &settings) { 35 | // char name[256] = {}; 36 | // sprintf(name, "%s_rank=%d+%s.txt", prefix, settings.rank, suffix); 37 | // FILE *out = fopen(name, "w"); 38 | // 39 | // std::fprintf(out, "x=%d\n", chunk->x); 40 | // std::fprintf(out, "y=%d\n", chunk->y); 41 | // std::fprintf(out, "dt_init=%f\n", chunk->dt_init); 42 | // 43 | // std::fprintf(out, "left=%d\n", chunk->left); 44 | // std::fprintf(out, "right=%d\n", chunk->right); 45 | // std::fprintf(out, "bottom=%d\n", chunk->bottom); 46 | // std::fprintf(out, "top=%d\n", chunk->top); 47 | // 48 | //// dump_data(out, "density", 
chunk->density, chunk->x * chunk->y); 49 | //// dump_data(out, "energy", chunk->energy, chunk->x * chunk->y); 50 | //// dump_data(out, "u", chunk->u, chunk->x * chunk->y); 51 | //// dump_data(out, "p", chunk->p, chunk->x * chunk->y); 52 | //// dump_data(out, "r", chunk->r, chunk->x * chunk->y); 53 | //// dump_data(out, "w", chunk->w, chunk->x * chunk->y); 54 | //// dump_data(out, "kx", chunk->kx, chunk->x * chunk->y); 55 | //// dump_data(out, "ky", chunk->ky, chunk->x * chunk->y); 56 | // 57 | // std::fclose(out); 58 | //} 59 | 60 | // Initialise the chunk 61 | void initialise_chunk(Chunk *chunk, Settings &settings, int x, int y) { 62 | // Initialise the key variables 63 | chunk->x = x + settings.halo_depth * 2; 64 | chunk->y = y + settings.halo_depth * 2; 65 | chunk->dt_init = settings.dt_init; 66 | 67 | // Allocate the neighbour list 68 | chunk->neighbours = static_cast(std::malloc(sizeof(int) * NUM_FACES)); 69 | 70 | // Allocate the MPI comm buffers 71 | // int lr_len = chunk->y * settings.halo_depth * NUM_FIELDS; 72 | // chunk->left_send = static_cast(std::malloc(sizeof(double) * lr_len)); 73 | // chunk->left_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 74 | // chunk->right_send = static_cast(std::malloc(sizeof(double) * lr_len)); 75 | // chunk->right_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 76 | 77 | // int tb_len = chunk->x * settings.halo_depth * NUM_FIELDS; 78 | // chunk->top_send = static_cast(std::malloc(sizeof(double) * tb_len)); 79 | // chunk->top_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 80 | // chunk->bottom_send = static_cast(std::malloc(sizeof(double) * tb_len)); 81 | // chunk->bottom_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 82 | 83 | // int lr_len = chunk->y * settings.halo_depth * NUM_FIELDS; 84 | // chunk->staging_left_send = static_cast(std::malloc(sizeof(double) * lr_len)); 85 | // chunk->staging_left_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 86 | // 
chunk->staging_right_send = static_cast(std::malloc(sizeof(double) * lr_len)); 87 | // chunk->staging_right_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 88 | 89 | // int tb_len = chunk->x * settings.halo_depth * NUM_FIELDS; 90 | // chunk->staging_top_send = static_cast(std::malloc(sizeof(double) * tb_len)); 91 | // chunk->staging_top_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 92 | // chunk->staging_bottom_send = static_cast(std::malloc(sizeof(double) * tb_len)); 93 | // chunk->staging_bottom_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 94 | 95 | // Initialise the ChunkExtension, which allows composition of extended 96 | // fields specific to individual implementations 97 | chunk->ext = static_cast(std::malloc(sizeof(ChunkExtension))); 98 | } 99 | 100 | // Finalise the chunk 101 | void finalise_chunk(Chunk *chunk) { 102 | free(chunk->neighbours); 103 | free(chunk->ext); 104 | // free(chunk->left_send); 105 | // free(chunk->left_recv); 106 | // free(chunk->right_send); 107 | // free(chunk->right_recv); 108 | // free(chunk->top_send); 109 | // free(chunk->top_recv); 110 | // free(chunk->bottom_send); 111 | // free(chunk->bottom_recv); 112 | } 113 | -------------------------------------------------------------------------------- /driver/chunk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk_extension.h" 4 | #include "settings.h" 5 | #include 6 | 7 | // The core Tealeaf interface class. 
8 | struct Chunk { 9 | // Solve-wide variables 10 | double dt_init; 11 | 12 | // Neighbouring ranks 13 | int *neighbours; 14 | 15 | // MPI comm buffers 16 | FieldBufferType left_send; 17 | FieldBufferType left_recv; 18 | FieldBufferType right_send; 19 | FieldBufferType right_recv; 20 | FieldBufferType top_send; 21 | FieldBufferType top_recv; 22 | FieldBufferType bottom_send; 23 | FieldBufferType bottom_recv; 24 | 25 | StagingBufferType staging_left_send; 26 | StagingBufferType staging_left_recv; 27 | StagingBufferType staging_right_send; 28 | StagingBufferType staging_right_recv; 29 | StagingBufferType staging_top_send; 30 | StagingBufferType staging_top_recv; 31 | StagingBufferType staging_bottom_send; 32 | StagingBufferType staging_bottom_recv; 33 | 34 | // Mesh chunks 35 | int left; 36 | int right; 37 | int bottom; 38 | int top; 39 | 40 | // Field dimensions 41 | int x; 42 | int y; 43 | 44 | // Field buffers 45 | FieldBufferType density0; 46 | FieldBufferType density; 47 | FieldBufferType energy0; 48 | FieldBufferType energy; 49 | 50 | FieldBufferType u; 51 | FieldBufferType u0; 52 | FieldBufferType p; 53 | FieldBufferType r; 54 | FieldBufferType mi; 55 | FieldBufferType w; 56 | FieldBufferType kx; 57 | FieldBufferType ky; 58 | FieldBufferType sd; 59 | 60 | FieldBufferType cell_x; 61 | FieldBufferType cell_y; 62 | FieldBufferType cell_dx; 63 | FieldBufferType cell_dy; 64 | 65 | FieldBufferType vertex_dx; 66 | FieldBufferType vertex_dy; 67 | FieldBufferType vertex_x; 68 | FieldBufferType vertex_y; 69 | 70 | FieldBufferType volume; 71 | FieldBufferType x_area; 72 | FieldBufferType y_area; 73 | 74 | // Cheby and PPCG 75 | double theta; 76 | double eigmin; 77 | double eigmax; 78 | 79 | double *cg_alphas; 80 | double *cg_betas; 81 | double *cheby_alphas; 82 | double *cheby_betas; 83 | 84 | ChunkExtension *ext; 85 | }; 86 | 87 | struct Settings; 88 | 89 | void dump_chunk(const char *prefix, const char *suffix, Chunk *chunk, Settings &settings); 90 | void 
initialise_chunk(Chunk *chunk, Settings &settings, int x, int y); 91 | void finalise_chunk(Chunk *chunk); 92 | -------------------------------------------------------------------------------- /driver/comms.cpp: -------------------------------------------------------------------------------- 1 | #include "comms.h" 2 | #include "settings.h" 3 | 4 | // Initialise MPI 5 | void initialise_comms(int argc, char **argv) { MPI_Init(&argc, &argv); } 6 | 7 | // Initialise the rank information 8 | void initialise_ranks(Settings &settings) { 9 | MPI_Comm_rank(MPI_COMM_WORLD, &settings.rank); 10 | MPI_Comm_size(MPI_COMM_WORLD, &settings.num_ranks); 11 | } 12 | 13 | // Teardown MPI 14 | void finalise_comms() { MPI_Finalize(); } 15 | 16 | // Sends a message out and receives a message in 17 | void send_recv_message(Settings &settings, double *send_buffer, double *recv_buffer, int buffer_len, int neighbour, int send_tag, 18 | int recv_tag, MPI_Request *send_request, MPI_Request *recv_request) { 19 | START_PROFILING(settings.kernel_profile); 20 | 21 | MPI_Isend(send_buffer, buffer_len, MPI_DOUBLE, neighbour, send_tag, MPI_COMM_WORLD, send_request); 22 | MPI_Irecv(recv_buffer, buffer_len, MPI_DOUBLE, neighbour, recv_tag, MPI_COMM_WORLD, recv_request); 23 | 24 | STOP_PROFILING(settings.kernel_profile, __func__); 25 | } 26 | 27 | // Waits for all requests to complete 28 | void wait_for_requests(Settings &settings, int num_requests, MPI_Request *requests) { 29 | START_PROFILING(settings.kernel_profile); 30 | MPI_Waitall(num_requests, requests, MPI_STATUSES_IGNORE); 31 | STOP_PROFILING(settings.kernel_profile, __func__); 32 | } 33 | 34 | // Reduce over all ranks to get sum 35 | void sum_over_ranks(Settings &settings, double *a) { 36 | START_PROFILING(settings.kernel_profile); 37 | double temp = *a; 38 | MPI_Allreduce(&temp, a, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 39 | STOP_PROFILING(settings.kernel_profile, __func__); 40 | } 41 | 42 | // Reduce across all ranks to get minimum value 43 
| void min_over_ranks(Settings &settings, double *a) { 44 | START_PROFILING(settings.kernel_profile); 45 | double temp = *a; 46 | MPI_Allreduce(&temp, a, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); 47 | STOP_PROFILING(settings.kernel_profile, __func__); 48 | } 49 | 50 | // Synchronise all ranks 51 | void barrier() { MPI_Barrier(MPI_COMM_WORLD); } 52 | 53 | // End the application 54 | void abort_comms() { MPI_Abort(MPI_COMM_WORLD, 1); } 55 | -------------------------------------------------------------------------------- /driver/comms.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef NO_MPI 4 | // XXX OpenMPI pulls in CXX headers which we don't link against, prevent that: 5 | #define OMPI_SKIP_MPICXX 6 | #include 7 | #if __has_include("mpi-ext.h") // C23, but everyone supports this already 8 | #include "mpi-ext.h" // for CUDA-aware MPI checks 9 | #endif 10 | #else 11 | #include "mpi_shim.h" 12 | #endif 13 | 14 | #include "chunk.h" 15 | #include "settings.h" 16 | 17 | void barrier(); 18 | void abort_comms(); 19 | void finalise_comms(); 20 | void initialise_comms(int argc, char **argv); 21 | void initialise_ranks(Settings &settings); 22 | void sum_over_ranks(Settings &settings, double *a); 23 | void min_over_ranks(Settings &settings, double *a); 24 | void wait_for_requests(Settings &settings, int num_requests, MPI_Request *requests); 25 | void send_recv_message(Settings &settings, double *send_buffer, double *recv_buffer, int buffer_len, int neighbour, int send_tag, 26 | int recv_tag, MPI_Request *send_request, MPI_Request *recv_request); -------------------------------------------------------------------------------- /driver/diffuse.cpp: -------------------------------------------------------------------------------- 1 | #include "application.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | 5 | double calc_dt(Chunk *chunks); 6 | void calc_min_timestep(Chunk *chunks, double *dt, int 
chunks_per_task); 7 | void solve(Chunk *chunks, Settings &settings, int tt, double *wallclock_prev); 8 | 9 | // The main timestep loop 10 | bool diffuse(Chunk *chunks, Settings &settings) { 11 | double wallclock_prev = 0.0; 12 | for (int tt = 0; tt < settings.end_step; ++tt) { 13 | solve(chunks, settings, tt, &wallclock_prev); 14 | } 15 | 16 | return field_summary_driver(chunks, settings, true); 17 | } 18 | 19 | // Performs a solve for a single timestep 20 | void solve(Chunk *chunks, Settings &settings, int tt, double *wallclock_prev) { 21 | print_and_log(settings, "\n Timestep %d\n", tt + 1); 22 | profiler_start_timer(settings.wallclock_profile); 23 | 24 | // Calculate minimum timestep information 25 | double dt = settings.dt_init; 26 | calc_min_timestep(chunks, &dt, settings.num_chunks_per_rank); 27 | 28 | // Pick the smallest timestep across all ranks 29 | min_over_ranks(settings, &dt); 30 | 31 | double rx = dt / (settings.dx * settings.dx); 32 | double ry = dt / (settings.dy * settings.dy); 33 | 34 | // Prepare halo regions for solve 35 | reset_fields_to_exchange(settings); 36 | settings.fields_to_exchange[FIELD_ENERGY1] = true; 37 | settings.fields_to_exchange[FIELD_DENSITY] = true; 38 | halo_update_driver(chunks, settings, 2); 39 | 40 | double error = 1e+10; 41 | 42 | // Perform the solve with one of the integrated solvers 43 | switch (settings.solver) { 44 | case Solver::JACOBI_SOLVER: jacobi_driver(chunks, settings, rx, ry, &error); break; 45 | case Solver::CG_SOLVER: cg_driver(chunks, settings, rx, ry, &error); break; 46 | case Solver::CHEBY_SOLVER: cheby_driver(chunks, settings, rx, ry, &error); break; 47 | case Solver::PPCG_SOLVER: ppcg_driver(chunks, settings, rx, ry, &error); break; 48 | } 49 | 50 | // Perform solve finalisation tasks 51 | solve_finished_driver(chunks, settings); 52 | 53 | if (tt % settings.summary_frequency == 0) { 54 | field_summary_driver(chunks, settings, false); 55 | } 56 | 57 | profiler_end_timer(settings.wallclock_profile, 
"Wallclock"); 58 | 59 | double wallclock = settings.wallclock_profile->profiler_entries[0].time; 60 | print_and_log(settings, " Wallclock: \t\t%.3lfs\n", wallclock); 61 | print_and_log(settings, " Avg. time per cell: \t%.6e\n", (wallclock - *wallclock_prev) / (settings.grid_x_cells * settings.grid_y_cells)); 62 | print_and_log(settings, " Error: \t\t%.6e\n", error); 63 | } 64 | 65 | // Calculate minimum timestep 66 | void calc_min_timestep(Chunk *chunks, double *dt, int chunks_per_task) { 67 | for (int cc = 0; cc < chunks_per_task; ++cc) { 68 | double dtlp = calc_dt(&(chunks[cc])); 69 | 70 | if (dtlp < *dt) { 71 | *dt = dtlp; 72 | } 73 | } 74 | } 75 | 76 | // Calculates a value for dt 77 | double calc_dt(Chunk *chunk) { 78 | // Currently defaults to config provided value 79 | return chunk->dt_init; 80 | } 81 | -------------------------------------------------------------------------------- /driver/drivers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk.h" 4 | 5 | // Initialisation drivers 6 | void set_chunk_data_driver(Chunk *chunk, Settings &settings); 7 | void set_chunk_state_driver(Chunk *chunk, Settings &settings, State *states); 8 | void kernel_initialise_driver(Chunk *chunks, Settings &settings); 9 | void kernel_finalise_driver(Chunk *chunks, Settings &settings); 10 | 11 | // Halo drivers 12 | void halo_update_driver(Chunk *chunks, Settings &settings, int depth); 13 | void remote_halo_driver(Chunk *chunks, Settings &settings, int depth); 14 | 15 | // Conjugate Gradient solver drivers 16 | void cg_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error); 17 | void cg_init_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *rro); 18 | void cg_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *rro, double *error); 19 | 20 | // Chebyshev solver drivers 21 | void cheby_driver(Chunk *chunks, Settings &settings, double rx, double 
ry, double *error); 22 | void cheby_init_driver(Chunk *chunks, Settings &settings, int num_cg_iters, double *bb); 23 | void cheby_coef_driver(Chunk *chunks, Settings &settings, int max_iters); 24 | void cheby_main_step_driver(Chunk *chunks, Settings &settings, int cheby_iters, bool is_calc_2norm, double *error); 25 | 26 | // PPCG solver drivers 27 | void ppcg_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error); 28 | void ppcg_init_driver(Chunk *chunks, Settings &settings, double *rro); 29 | void ppcg_main_step_driver(Chunk *chunks, Settings &settings, double *rro, double *error); 30 | 31 | // Jacobi solver drivers 32 | void jacobi_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error); 33 | void jacobi_init_driver(Chunk *chunks, Settings &settings, double rx, double ry); 34 | void jacobi_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *error); 35 | 36 | // Misc drivers 37 | bool field_summary_driver(Chunk *chunks, Settings &settings, bool solve_finished); 38 | void store_energy_driver(Chunk *chunk, Settings &settings); 39 | void solve_finished_driver(Chunk *chunks, Settings &settings); 40 | void eigenvalue_driver_initialise(Chunk *chunks, Settings &settings, int num_cg_iters); 41 | -------------------------------------------------------------------------------- /driver/eigenvalue_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | #include 6 | #include 7 | #include 8 | 9 | void tqli(double *d, double *e, int n); 10 | 11 | // Calculates the eigenvalues from cg_alphas and cg_betas 12 | void eigenvalue_driver_initialise(Chunk *chunks, Settings &settings, int num_cg_iters) { 13 | START_PROFILING(settings.kernel_profile); 14 | 15 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 16 | double diag[num_cg_iters]; 17 | double 
offdiag[num_cg_iters]; 18 | std::memset(diag, 0, sizeof(diag)); 19 | std::memset(offdiag, 0, sizeof(offdiag)); 20 | 21 | // Prepare matrix 22 | for (int ii = 0; ii < num_cg_iters; ++ii) { 23 | diag[ii] = 1.0 / chunks[cc].cg_alphas[ii]; 24 | 25 | if (ii > 0) { 26 | diag[ii] += chunks[cc].cg_betas[ii - 1] / chunks[cc].cg_alphas[ii - 1]; 27 | } 28 | if (ii < num_cg_iters - 1) { 29 | offdiag[ii + 1] = std::sqrt(chunks[cc].cg_betas[ii]) / chunks[cc].cg_alphas[ii]; 30 | } 31 | } 32 | 33 | // Calculate the eigenvalues (ignore eigenvectors) 34 | tqli(diag, offdiag, num_cg_iters); 35 | 36 | chunks[cc].eigmin = DBL_MAX; 37 | chunks[cc].eigmax = -DBL_MAX; // max-reduction sentinel: DBL_MIN is the smallest positive double, not the most negative value 38 | 39 | // Get minimum and maximum eigenvalues 40 | for (int ii = 0; ii < num_cg_iters; ++ii) { 41 | chunks[cc].eigmin = tealeaf_MIN(chunks[cc].eigmin, diag[ii]); 42 | chunks[cc].eigmax = tealeaf_MAX(chunks[cc].eigmax, diag[ii]); 43 | } 44 | 45 | if (chunks[cc].eigmin < 0.0 || chunks[cc].eigmax < 0.0) { 46 | die(__LINE__, __FILE__, "Calculated negative eigenvalues.\n"); 47 | } 48 | 49 | // TODO: Find out the reasoning behind this!? 50 | // Adds some buffer for precision maybe? 51 | chunks[cc].eigmin *= 0.95; 52 | chunks[cc].eigmax *= 1.05; 53 | 54 | print_and_log(settings, "Min. eigenvalue: \t%.12e\nMax.
eigenvalue: \t%.12e\n", chunks[cc].eigmin, chunks[cc].eigmax); 55 | } 56 | 57 | STOP_PROFILING(settings.kernel_profile, __func__); 58 | } 59 | 60 | // Adapted from 61 | // http://ftp.cs.stanford.edu/cs/robotics/scohen/nr/tqli.c 62 | void tqli(double *d, double *e, int n) { 63 | int m, l, iter, i; 64 | double s, r, p, g, f, dd, c, b; 65 | 66 | for (i = 0; i < n - 1; i++) 67 | e[i] = e[i + 1]; 68 | e[n - 1] = 0.0; 69 | for (l = 0; l < n; l++) { 70 | iter = 0; 71 | do { 72 | for (m = l; m < n - 1; m++) { 73 | dd = std::fabs(d[m]) + std::fabs(d[m + 1]); 74 | if (std::fabs(e[m]) + dd == dd) break; 75 | } 76 | 77 | if (m == l) break; 78 | 79 | if (iter++ == 30) { 80 | die(__LINE__, __FILE__, "Too many iterations in TQLI routine\n"); 81 | } 82 | g = (d[l + 1] - d[l]) / (2.0 * e[l]); 83 | r = std::sqrt((g * g) + 1.0); 84 | g = d[m] - d[l] + e[l] / (g + tealeaf_sign(r, g)); 85 | s = c = 1.0; 86 | p = 0.0; 87 | for (i = m - 1; i >= l; i--) { 88 | f = s * e[i]; 89 | b = c * e[i]; 90 | r = std::sqrt(f * f + g * g); 91 | e[i + 1] = r; 92 | if (r == 0.0) { 93 | d[i + 1] -= p; 94 | e[m] = 0.0; 95 | continue; 96 | } 97 | s = f / r; 98 | c = g / r; 99 | g = d[i + 1] - p; 100 | r = (d[i] - g) * s + 2.0 * c * b; 101 | p = s * r; 102 | d[i + 1] = g + p; 103 | g = c * r - b; 104 | } 105 | d[l] = d[l] - p; 106 | e[l] = g; 107 | e[m] = 0.0; 108 | } while (m != l); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /driver/field_summary_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "kernel_interface.h" 4 | 5 | void get_checking_value(Settings &settings, double *checking_value); 6 | 7 | // Invokes the set chunk data kernel 8 | bool field_summary_driver(Chunk *chunks, Settings &settings, bool is_solve_finished) { 9 | double vol = 0.0; 10 | double ie = 0.0; 11 | double temp = 0.0; 12 | double mass = 0.0; 13 | 14 | for (int cc = 0; cc < 
settings.num_chunks_per_rank; ++cc) { 15 | if (settings.kernel_language == Kernel_Language::C) { 16 | run_field_summary(&(chunks[cc]), settings, &vol, &mass, &ie, &temp); 17 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 18 | } 19 | } 20 | 21 | // Bring all of the results to the master 22 | sum_over_ranks(settings, &vol); 23 | sum_over_ranks(settings, &mass); 24 | sum_over_ranks(settings, &ie); 25 | sum_over_ranks(settings, &temp); 26 | 27 | if (settings.rank == MASTER && settings.check_result && is_solve_finished) { 28 | print_and_log(settings, "\n Checking results...\n"); 29 | 30 | double checking_value = 1.0; 31 | get_checking_value(settings, &checking_value); 32 | 33 | print_and_log(settings, " Expected %.15e\n", checking_value); 34 | print_and_log(settings, " Actual %.15e\n", temp); 35 | 36 | double qa_diff = fabs(100.0 * (temp / checking_value) - 100.0); 37 | if (qa_diff < 0.001 && !std::isnan(temp)) { 38 | print_and_log(settings, " This run PASSED (Difference is within %.8lf%%)\n", qa_diff); 39 | return true; 40 | } else { 41 | print_and_log(settings, " This run FAILED (Difference is %.8lf%%)\n", qa_diff); 42 | return false; 43 | } 44 | } 45 | // only master needs to return validation failure if we see one 46 | return true; 47 | } 48 | 49 | // Fetches the checking value from the test problems file 50 | void get_checking_value(Settings &settings, double *checking_value) { 51 | FILE *test_problem_file = std::fopen(settings.test_problem_filename, "r"); 52 | 53 | if (!test_problem_file) { 54 | print_and_log(settings, "\n WARNING: Could not open the test problem file: %s, expected value will be invalid.\n", 55 | settings.test_problem_filename); 56 | return; 57 | } 58 | 59 | size_t len = 0; 60 | char *line = nullptr; 61 | 62 | // Get the number of states present in the config file 63 | while (getline(&line, &len, test_problem_file) != EOF) { 64 | int x = -1; // initialised: sscanf may fail to match, leaving these otherwise indeterminate (UB when compared below) 65 | int y = -1; 66 | int num_steps = -1; 67 | 68 | std::sscanf(line, "%d %d %d %lf", &x, &y,
&num_steps, checking_value); 69 | 70 | // Found the problem in the file 71 | if (x == settings.grid_x_cells && y == settings.grid_y_cells && num_steps == settings.end_step) { 72 | std::free(line); // getline() allocates; release on the early-return path too 73 | std::fclose(test_problem_file); return; 74 | } 75 | } 76 | 77 | std::free(line); // release the getline() buffer before leaving 78 | *checking_value = 1.0; print_and_log(settings, "\n WARNING: Problem was not found in the test problems file, expected value will be invalid.\n"); 79 | std::fclose(test_problem_file); 80 | } 81 | -------------------------------------------------------------------------------- /driver/halo_update_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "drivers.h" 2 | #include "kernel_interface.h" 3 | #include "settings.h" 4 | 5 | // Invoke the halo update kernels 6 | void halo_update_driver(Chunk *chunks, Settings &settings, int depth) { 7 | // Check that we actually have exchanges to perform 8 | if (!is_fields_to_exchange(settings)) return; 9 | 10 | remote_halo_driver(chunks, settings, depth); 11 | 12 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 13 | if (settings.kernel_language == Kernel_Language::C) { 14 | run_local_halos(&(chunks[cc]), settings, depth); 15 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 16 | // Fortran store energy kernel 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /driver/initialise.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "application.h" 5 | #include "chunk.h" 6 | #include "drivers.h" 7 | #include "kernel_interface.h" 8 | #include "settings.h" 9 | 10 | // Decomposes the field into multiple chunks 11 | void decompose_field(Settings &settings, Chunk *chunks) { 12 | // Calculates the num chunks field is to be decomposed into 13 | settings.num_chunks = settings.num_ranks * settings.num_chunks_per_rank; 14 | 15 | int num_chunks = settings.num_chunks; 16 | 17 |
double best_metric = DBL_MAX; 18 | auto x_cells = static_cast(settings.grid_x_cells); 19 | auto y_cells = static_cast(settings.grid_y_cells); 20 | int x_chunks = 0; 21 | int y_chunks = 0; 22 | 23 | // Decompose by minimal area to perimeter 24 | for (int xx = 1; xx <= num_chunks; ++xx) { 25 | if (num_chunks % xx) continue; 26 | 27 | // Calculate number of chunks grouped by x split 28 | int yy = num_chunks / xx; 29 | 30 | if (num_chunks % yy) continue; 31 | 32 | double perimeter = ((x_cells / xx) * (x_cells / xx) + (y_cells / yy) * (y_cells / yy)) * 2; 33 | double area = (x_cells / xx) * (y_cells / yy); 34 | 35 | double current_metric = perimeter / area; 36 | 37 | // Save improved decompositions 38 | if (current_metric < best_metric) { 39 | x_chunks = xx; 40 | y_chunks = yy; 41 | best_metric = current_metric; 42 | } 43 | } 44 | 45 | // Check that the decomposition didn't fail 46 | if (!x_chunks || !y_chunks) { 47 | die(__LINE__, __FILE__, "Failed to decompose the field with given parameters.\n"); 48 | } 49 | 50 | int dx = settings.grid_x_cells / x_chunks; 51 | int dy = settings.grid_y_cells / y_chunks; 52 | 53 | int mod_x = settings.grid_x_cells % x_chunks; 54 | int mod_y = settings.grid_y_cells % y_chunks; 55 | int add_x_prev = 0; 56 | int add_y_prev = 0; 57 | 58 | // Compute the full decomposition on all ranks 59 | for (int yy = 0; yy < y_chunks; ++yy) { 60 | int add_y = (yy < mod_y); 61 | 62 | for (int xx = 0; xx < x_chunks; ++xx) { 63 | int add_x = (xx < mod_x); 64 | 65 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 66 | int chunk = xx + yy * x_chunks; 67 | int rank = cc + settings.rank * settings.num_chunks_per_rank; 68 | 69 | // Store the values for all chunks local to rank 70 | if (rank == chunk) { 71 | initialise_chunk(&(chunks[cc]), settings, dx + add_x, dy + add_y); 72 | 73 | // Set up the mesh ranges 74 | chunks[cc].left = xx * dx + add_x_prev; 75 | chunks[cc].right = chunks[cc].left + dx + add_x; 76 | chunks[cc].bottom = yy * dy + 
add_y_prev; 77 | chunks[cc].top = chunks[cc].bottom + dy + add_y; 78 | 79 | // Set up the chunk connectivity 80 | chunks[cc].neighbours[CHUNK_LEFT] = (xx == 0) ? EXTERNAL_FACE : chunk - 1; 81 | chunks[cc].neighbours[CHUNK_RIGHT] = (xx == x_chunks - 1) ? EXTERNAL_FACE : chunk + 1; 82 | chunks[cc].neighbours[CHUNK_BOTTOM] = (yy == 0) ? EXTERNAL_FACE : chunk - x_chunks; 83 | chunks[cc].neighbours[CHUNK_TOP] = (yy == y_chunks - 1) ? EXTERNAL_FACE : chunk + x_chunks; 84 | } 85 | } 86 | 87 | // If chunks rounded up, maintain relative location 88 | add_x_prev += add_x; 89 | } 90 | add_x_prev = 0; 91 | add_y_prev += add_y; 92 | } 93 | } 94 | 95 | void initialise_model_info(Settings &settings) { run_model_info(settings); } 96 | 97 | // Initialise settings from input file 98 | void initialise_application(Chunk **chunks, Settings &settings, State* states) { 99 | 100 | *chunks = (Chunk *)malloc(sizeof(Chunk) * settings.num_chunks_per_rank); 101 | 102 | decompose_field(settings, *chunks); 103 | kernel_initialise_driver(*chunks, settings); 104 | set_chunk_data_driver(*chunks, settings); 105 | set_chunk_state_driver(*chunks, settings, states); 106 | 107 | // Prime the initial halo data 108 | reset_fields_to_exchange(settings); 109 | settings.fields_to_exchange[FIELD_DENSITY] = true; // start.f90:111 110 | settings.fields_to_exchange[FIELD_ENERGY0] = true; // start.f90:112 111 | settings.fields_to_exchange[FIELD_ENERGY1] = true; // start.f90:113 112 | halo_update_driver(*chunks, settings, 2); 113 | 114 | store_energy_driver(*chunks, settings); 115 | } 116 | -------------------------------------------------------------------------------- /driver/jacobi_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | 6 | // Performs a full solve with the Jacobi solver kernels 7 | void jacobi_driver(Chunk *chunks, Settings &settings, double rx, 
double ry, double *error) { 8 | jacobi_init_driver(chunks, settings, rx, ry); 9 | 10 | // Iterate till convergence 11 | int tt; 12 | for (tt = 0; tt < settings.max_iters; ++tt) { 13 | jacobi_main_step_driver(chunks, settings, tt, error); 14 | 15 | halo_update_driver(chunks, settings, 1); 16 | 17 | if (fabs(*error) < settings.eps) break; 18 | } 19 | 20 | print_and_log(settings, "Jacobi: \t\t%d iterations\n", tt); 21 | } 22 | 23 | // Invokes the CG initialisation kernels 24 | void jacobi_init_driver(Chunk *chunks, Settings &settings, double rx, double ry) { 25 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 26 | if (settings.kernel_language == Kernel_Language::C) { 27 | run_jacobi_init(&(chunks[cc]), settings, rx, ry); 28 | 29 | run_copy_u(&(chunks[cc]), settings); 30 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 31 | } 32 | } 33 | 34 | // Need to update for the matvec 35 | reset_fields_to_exchange(settings); 36 | settings.fields_to_exchange[FIELD_U] = true; 37 | } 38 | 39 | // Invokes the main Jacobi solve kernels 40 | void jacobi_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *error) { 41 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 42 | if (settings.kernel_language == Kernel_Language::C) { 43 | run_jacobi_iterate(&(chunks[cc]), settings, error); 44 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 45 | } 46 | } 47 | 48 | if (tt % 50 == 0) { 49 | halo_update_driver(chunks, settings, 1); 50 | 51 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 52 | if (settings.kernel_language == Kernel_Language::C) { 53 | run_calculate_residual(&(chunks[cc]), settings); 54 | 55 | run_calculate_2norm(&(chunks[cc]), settings, chunks[cc].r, error); 56 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 57 | } 58 | } 59 | } 60 | 61 | sum_over_ranks(settings, error); 62 | } 63 | -------------------------------------------------------------------------------- 
/driver/kernel_initialise_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | 4 | // Invokes the kernel initialisation kernels 5 | void kernel_initialise_driver(Chunk *chunks, Settings &settings) { 6 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 7 | if (settings.kernel_language == Kernel_Language::C) { 8 | int lr_len = chunks[cc].y * settings.halo_depth * NUM_FIELDS; 9 | int tb_len = chunks[cc].x * settings.halo_depth * NUM_FIELDS; 10 | run_kernel_initialise(&(chunks[cc]), settings, lr_len, tb_len); 11 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 12 | } 13 | } 14 | } 15 | 16 | // Invokes the kernel finalisation drivers 17 | void kernel_finalise_driver(Chunk *chunks, Settings &settings) { 18 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 19 | if (settings.kernel_language == Kernel_Language::C) { 20 | run_kernel_finalise(&(chunks[cc]), settings); 21 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /driver/kernel_interface.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk.h" 4 | #include "comms.h" 5 | #include "settings.h" 6 | 7 | /* 8 | * This is the main interface file for C based implementations. 
9 | */ 10 | 11 | // Initialisation kernels 12 | void run_model_info(Settings &settings); 13 | void run_set_chunk_data(Chunk *chunk, Settings &settings); 14 | void run_set_chunk_state(Chunk *chunk, Settings &settings, State *states); 15 | void run_kernel_initialise(Chunk *chunk, Settings &settings, int comms_lr_len, int comms_tb_len); 16 | void run_kernel_finalise(Chunk *chunk, Settings &settings); 17 | 18 | // Solver-wide kernels 19 | void run_local_halos(Chunk *chunk, Settings &settings, int depth); 20 | 21 | // void run_pack_or_unpack(Chunk *chunk, Settings &settings, int depth, int face, bool pack, FieldBufferType field, 22 | // FieldBufferType destination); 23 | 24 | void run_pack_or_unpack(Chunk *chunk, Settings &settings, int depth, int face, bool pack, FieldBufferType field, 25 | FieldBufferType destination, int offset); 26 | // 27 | void run_send_recv_halo(Chunk *chunk, Settings &settings, // 28 | FieldBufferType src_send_buffer, FieldBufferType src_recv_buffer, // 29 | StagingBufferType dest_staging_send_buffer, StagingBufferType dest_staging_recv_buffer, // 30 | int buffer_len, int neighbour, // 31 | int send_tag, int recv_tag, // 32 | MPI_Request *send_request, MPI_Request *recv_request); 33 | void run_before_waitall_halo(Chunk *chunk, Settings &settings); 34 | void run_restore_recv_halo(Chunk *chunk, Settings &settings, // 35 | FieldBufferType dest_recv_buffer, StagingBufferType src_staging_recv_buffer, int buffer_len); 36 | 37 | void run_store_energy(Chunk *chunk, Settings &settings); 38 | void run_field_summary(Chunk *chunk, Settings &settings, double *vol, double *mass, double *ie, double *temp); 39 | 40 | // CG solver kernels 41 | void run_cg_init(Chunk *chunk, Settings &settings, double rx, double ry, double *rro); 42 | void run_cg_calc_w(Chunk *chunk, Settings &settings, double *pw); 43 | void run_cg_calc_ur(Chunk *chunk, Settings &settings, double alpha, double *rrn); 44 | void run_cg_calc_p(Chunk *chunk, Settings &settings, double beta); 45 | 46 
| // Chebyshev solver kernels 47 | void run_cheby_init(Chunk *chunk, Settings &settings); 48 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta); 49 | 50 | // Jacobi solver kernels 51 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry); 52 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error); 53 | 54 | // PPCG solver kernels 55 | void run_ppcg_init(Chunk *chunk, Settings &settings); 56 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta); 57 | 58 | // Shared solver kernels 59 | void run_copy_u(Chunk *chunk, Settings &settings); 60 | void run_calculate_residual(Chunk *chunk, Settings &settings); 61 | void run_calculate_2norm(Chunk *chunk, Settings &settings, FieldBufferType buffer, double *norm); 62 | void run_finalise(Chunk *chunk, Settings &settings); 63 | -------------------------------------------------------------------------------- /driver/mpi_shim.cpp: -------------------------------------------------------------------------------- 1 | #include "mpi_shim.h" 2 | #include 3 | 4 | #ifdef NO_MPI 5 | 6 | int MPI_Init(int *, char ***) { return MPI_SUCCESS; } 7 | int MPI_Comm_rank(MPI_Comm, int *rank) { 8 | *rank = 0; 9 | return MPI_SUCCESS; 10 | } 11 | int MPI_Comm_size(MPI_Comm, int *size) { 12 | *size = 1; 13 | return MPI_SUCCESS; 14 | } 15 | int MPI_Abort(MPI_Comm, int errorcode) { 16 | std::exit(errorcode); 17 | return MPI_SUCCESS; 18 | } 19 | int MPI_Finalize() { return MPI_SUCCESS; } 20 | 21 | int MPI_Barrier(MPI_Comm) { 22 | // XXX no-op, correct for 1 rank only 23 | return MPI_SUCCESS; 24 | } 25 | int MPI_Allgather(const void *, int, MPI_Datatype, void *, int, MPI_Datatype, MPI_Comm) { 26 | // XXX no-op, correct for 1 rank only 27 | return MPI_SUCCESS; 28 | } 29 | int MPI_Reduce(const void *, void *, int, MPI_Datatype, MPI_Op, int, MPI_Comm) { 30 | // XXX no-op, correct for 1 rank only 31 | return MPI_SUCCESS; 32 | } 33 | int 
MPI_Allreduce(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm) { 34 | // XXX no-op, correct for 1 rank only 35 | return MPI_SUCCESS; 36 | } 37 | int MPI_Waitall(int, MPI_Request[], MPI_Status[]) { 38 | // XXX no-op, correct for 1 rank only 39 | return MPI_SUCCESS; 40 | } 41 | 42 | int MPI_Isend(const void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Request *) { 43 | fprintf(stderr, "MPI disabled, stub: %s\n", __func__); 44 | std::abort(); 45 | return MPI_ERR_COMM; 46 | } 47 | int MPI_Irecv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Request *) { 48 | fprintf(stderr, "MPI disabled, stub: %s\n", __func__); 49 | std::abort(); 50 | return MPI_ERR_COMM; 51 | } 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /driver/mpi_shim.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #ifdef NO_MPI 5 | 6 | #define MPI_SUCCESS (0) 7 | #define MPI_ERR_COMM (1) 8 | #define MPI_ERR_COUNT (2) 9 | #define MPI_ERR_TYPE (3) 10 | #define MPI_ERR_BUFFER (4) 11 | 12 | #define MPI_INT (0) 13 | #define MPI_LONG (0) 14 | #define MPI_DOUBLE (0) 15 | #define MPI_SUM (0) 16 | #define MPI_MIN (0) 17 | #define MPI_MAX (0) 18 | #define MPI_STATUS_IGNORE (0) 19 | #define MPI_STATUSES_IGNORE (0) 20 | 21 | #define MPI_COMM_WORLD (0) 22 | 23 | using MPI_Comm = int; 24 | using MPI_Request = int; 25 | using MPI_Datatype = int; 26 | using MPI_Op = int; 27 | using MPI_Status = int; 28 | 29 | int MPI_Init(int *argc, char ***argv); 30 | int MPI_Comm_rank(MPI_Comm comm, int *rank); 31 | int MPI_Comm_size(MPI_Comm comm, int *size); 32 | int MPI_Abort(MPI_Comm comm, int errorcode); 33 | int MPI_Barrier(MPI_Comm comm); 34 | int MPI_Finalize(); 35 | 36 | int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); 37 | int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op 
op, MPI_Comm comm); 38 | int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); 39 | int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request); 40 | int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, 41 | MPI_Comm comm); 42 | int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]); 43 | 44 | #endif -------------------------------------------------------------------------------- /driver/profiler.cpp: -------------------------------------------------------------------------------- 1 | #include "profiler.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #define tealeaf_strmatch(a, b) (strcmp(a, b) == 0) 7 | 8 | struct Profile *profiler_initialise() { 9 | auto *profile = static_cast(std::malloc(sizeof(Profile))); 10 | std::memset(profile, 0, sizeof(Profile)); 11 | return profile; 12 | } 13 | 14 | void profiler_finalise(Profile **profile) { 15 | std::free(*profile); 16 | *profile = nullptr; 17 | } 18 | 19 | // Internally start the profiling timer 20 | void profiler_start_timer(Profile *profile) { 21 | #ifdef __APPLE__ 22 | profile->profiler_start = mach_absolute_time(); 23 | #else 24 | clock_gettime(CLOCK_MONOTONIC, &profile->profiler_start); 25 | #endif 26 | } 27 | 28 | // Internally end the profiling timer and store results 29 | void profiler_end_timer(Profile *profile, const char *entry_name) { 30 | #ifdef __APPLE__ 31 | profile->profiler_end = mach_absolute_time(); 32 | #else 33 | clock_gettime(CLOCK_MONOTONIC, &profile->profiler_end); 34 | #endif 35 | 36 | // Check if an entry exists 37 | int ii; 38 | for (ii = 0; ii < profile->profiler_entry_count; ++ii) { 39 | if (tealeaf_strmatch(profile->profiler_entries[ii].name, entry_name)) { 40 | break; 41 | } 42 | } 43 | 44 | // Don't overrun 45 | if (ii >= PROFILER_MAX_ENTRIES) { 46 | 
printf("Attempted to profile too many entries, maximum is %d\n", PROFILER_MAX_ENTRIES); 47 | exit(1); 48 | } 49 | 50 | // Create new entry 51 | if (ii == profile->profiler_entry_count) { 52 | profile->profiler_entry_count++; 53 | strncpy(profile->profiler_entries[ii].name, entry_name, PROFILER_MAX_NAME - 1); profile->profiler_entries[ii].name[PROFILER_MAX_NAME - 1] = '\0'; // bounded copy: entry_name longer than PROFILER_MAX_NAME must not overflow the fixed buffer 54 | profile->profiler_entries[ii].time = 0; 55 | profile->profiler_entries[ii].calls = 0; 56 | } 57 | 58 | // Update number of calls and time 59 | #ifdef __APPLE__ 60 | double elapsed = (profile->profiler_end - profile->profiler_start) * 1.0E-9; 61 | #else 62 | double elapsed = (profile->profiler_end.tv_sec - profile->profiler_start.tv_sec) + 63 | (profile->profiler_end.tv_nsec - profile->profiler_start.tv_nsec) * 1.0E-9; 64 | #endif 65 | 66 | profile->profiler_entries[ii].time += elapsed; 67 | profile->profiler_entries[ii].calls++; 68 | } 69 | 70 | // Print the profiling results to output 71 | void profiler_print_full_profile(Profile *profile) { 72 | printf("\n -------------------------------------------------------------\n"); 73 | printf("\n Profiling Results:\n\n"); 74 | printf(" %-30s%8s%20s\n", "Kernel Name", "Calls", "Runtime (s)"); 75 | 76 | double total_elapsed_time = 0.0; 77 | for (int ii = 0; ii < profile->profiler_entry_count; ++ii) { 78 | total_elapsed_time += profile->profiler_entries[ii].time; 79 | printf(" %-30s%8d%20.03F\n", profile->profiler_entries[ii].name, profile->profiler_entries[ii].calls, 80 | profile->profiler_entries[ii].time); 81 | } 82 | 83 | printf("\n Total elapsed time: %.03Fs, entries * are excluded.\n", total_elapsed_time); 84 | printf("\n -------------------------------------------------------------\n\n"); 85 | } 86 | 87 | // Prints profile without extra details 88 | void profiler_print_simple_profile(Profile *profile) { 89 | for (int ii = 0; ii < profile->profiler_entry_count; ++ii) { 90 | printf("\033[1m\033[30m%s\033[0m: %.3lfs (%d calls)\n", profile->profiler_entries[ii].name, profile->profiler_entries[ii].time, 91 |
profile->profiler_entries[ii].calls); 92 | } 93 | } 94 | 95 | // Gets an individual profile entry 96 | int profiler_get_profile_entry(Profile *profile, const char *entry_name) { 97 | for (int ii = 0; ii < profile->profiler_entry_count; ++ii) { 98 | if (tealeaf_strmatch(profile->profiler_entries[ii].name, entry_name)) { 99 | return ii; 100 | } 101 | } 102 | 103 | printf("Attempted to retrieve missing profile entry %s\n", entry_name); 104 | exit(1); 105 | } 106 | -------------------------------------------------------------------------------- /driver/profiler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __APPLE__ 4 | #include 5 | #include 6 | #else 7 | #include 8 | #endif 9 | 10 | /* 11 | * PROFILING TOOL 12 | * Not thread safe. 13 | */ 14 | 15 | #define PROFILER_MAX_NAME 128 16 | #define PROFILER_MAX_ENTRIES 2048 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | struct ProfileEntry { 23 | int calls; 24 | double time; 25 | char name[PROFILER_MAX_NAME]; 26 | }; 27 | 28 | struct Profile { 29 | #ifdef __APPLE__ 30 | uint64_t profiler_start; 31 | uint64_t profiler_end; 32 | #else 33 | struct timespec profiler_start; 34 | struct timespec profiler_end; 35 | #endif 36 | 37 | int profiler_entry_count; 38 | ProfileEntry profiler_entries[PROFILER_MAX_ENTRIES]; 39 | }; 40 | 41 | Profile *profiler_initialise(); 42 | void profiler_finalise(Profile **profile); 43 | 44 | void profiler_start_timer(Profile *profile); 45 | void profiler_end_timer(Profile *profile, const char *entry_name); 46 | void profiler_print_simple_profile(Profile *profile); 47 | void profiler_print_full_profile(Profile *profile); 48 | int profiler_get_profile_entry(Profile *profile, const char *entry_name); 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | 54 | // Allows compile-time optimised conditional profiling 55 | #ifdef ENABLE_PROFILING 56 | 57 | #define START_PROFILING(profile) profiler_start_timer(profile) 58 | 
59 | #define STOP_PROFILING(profile, name) profiler_end_timer(profile, name) 60 | 61 | #define PRINT_PROFILING_RESULTS(profile) profiler_print_full_profile(profile) 62 | 63 | #else 64 | 65 | #define START_PROFILING(profile) \ 66 | do { \ 67 | } while (false) 68 | #define STOP_PROFILING(profile, name) \ 69 | do { \ 70 | } while (false) 71 | #define PRINT_PROFILING_RESULTS(profile) \ 72 | do { \ 73 | } while (false) 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /driver/set_chunk_data_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | #include "settings.h" 4 | 5 | // Invokes the set chunk data kernel 6 | void set_chunk_data_driver(Chunk *chunks, Settings &settings) { 7 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 8 | if (settings.kernel_language == Kernel_Language::C) { 9 | run_set_chunk_data(&(chunks[cc]), settings); 10 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 11 | // Fortran store energy kernel 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /driver/set_chunk_state_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | 4 | // Invokes the set chunk state kernel 5 | void set_chunk_state_driver(Chunk *chunks, Settings &settings, State *states) { 6 | // Issue kernel to all local chunks 7 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 8 | if (settings.kernel_language == Kernel_Language::C) { 9 | run_set_chunk_state(&(chunks[cc]), settings, states); 10 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 11 | // Fortran store energy kernel 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /driver/settings.cpp: 
-------------------------------------------------------------------------------- 1 | #include "settings.h" 2 | #include 3 | 4 | #define MAX_CHAR_LEN 256 5 | 6 | void set_default_settings(Settings &settings) { 7 | settings.test_problem_filename = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 8 | strncpy(settings.test_problem_filename, DEF_TEST_PROBLEM_FILENAME, MAX_CHAR_LEN); 9 | 10 | settings.tea_in_filename = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 11 | strncpy(settings.tea_in_filename, DEF_TEA_IN_FILENAME, MAX_CHAR_LEN); 12 | 13 | settings.tea_out_filename = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 14 | strncpy(settings.tea_out_filename, DEF_TEA_OUT_FILENAME, MAX_CHAR_LEN); 15 | 16 | settings.tea_out_fp = nullptr; 17 | settings.grid_x_min = DEF_GRID_X_MIN; 18 | settings.grid_y_min = DEF_GRID_Y_MIN; 19 | settings.grid_x_max = DEF_GRID_X_MAX; 20 | settings.grid_y_max = DEF_GRID_Y_MAX; 21 | settings.grid_x_cells = DEF_GRID_X_CELLS; 22 | settings.grid_y_cells = DEF_GRID_Y_CELLS; 23 | settings.dt_init = DEF_DT_INIT; 24 | settings.max_iters = DEF_MAX_ITERS; 25 | settings.eps = DEF_EPS; 26 | settings.end_time = DEF_END_TIME; 27 | settings.end_step = DEF_END_STEP; 28 | settings.summary_frequency = DEF_SUMMARY_FREQUENCY; 29 | settings.solver = DEF_SOLVER; 30 | settings.staging_buffer_preference = DEF_STAGING_BUFFER; 31 | settings.model_name = ""; 32 | settings.model_kind = ModelKind::Host; 33 | settings.coefficient = DEF_COEFFICIENT; 34 | settings.error_switch = DEF_ERROR_SWITCH; 35 | settings.presteps = DEF_PRESTEPS; 36 | settings.eps_lim = DEF_EPS_LIM; 37 | settings.check_result = DEF_CHECK_RESULT; 38 | settings.ppcg_inner_steps = DEF_PPCG_INNER_STEPS; 39 | settings.preconditioner = DEF_PRECONDITIONER; 40 | settings.num_states = DEF_NUM_STATES; 41 | settings.num_chunks = DEF_NUM_CHUNKS; 42 | settings.num_chunks_per_rank = DEF_NUM_CHUNKS_PER_RANK; 43 | settings.num_ranks = DEF_NUM_RANKS; 44 | settings.halo_depth = DEF_HALO_DEPTH; 45 | settings.is_offload = 
DEF_IS_OFFLOAD; 46 | settings.kernel_profile = profiler_initialise(); 47 | settings.application_profile = profiler_initialise(); 48 | settings.wallclock_profile = profiler_initialise(); 49 | settings.fields_to_exchange = (bool *)malloc(sizeof(bool) * NUM_FIELDS); 50 | settings.solver_name = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 51 | settings.device_selector = nullptr; 52 | } 53 | 54 | // Resets all of the fields to be exchanged 55 | void reset_fields_to_exchange(Settings &settings) { 56 | for (int ii = 0; ii < NUM_FIELDS; ++ii) { 57 | settings.fields_to_exchange[ii] = false; 58 | } 59 | } 60 | 61 | // Checks if any of the fields are to be exchanged 62 | bool is_fields_to_exchange(Settings &settings) { 63 | for (int ii = 0; ii < NUM_FIELDS; ++ii) { 64 | if (settings.fields_to_exchange[ii]) { 65 | return true; 66 | } 67 | } 68 | 69 | return false; 70 | } 71 | -------------------------------------------------------------------------------- /driver/settings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "shared.h" 4 | #include 5 | #include 6 | 7 | #define NUM_FIELDS 6 8 | 9 | // Default settings 10 | #define DEF_TEA_IN_FILENAME "tea.in" 11 | #define DEF_TEA_OUT_FILENAME "tea.out" 12 | #define DEF_TEST_PROBLEM_FILENAME "tea.problems" 13 | #define DEF_GRID_X_MIN 0.0 14 | #define DEF_GRID_Y_MIN 0.0 15 | #define DEF_GRID_Z_MIN 0.0 16 | #define DEF_GRID_X_MAX 100.0 17 | #define DEF_GRID_Y_MAX 100.0 18 | #define DEF_GRID_Z_MAX 100.0 19 | #define DEF_GRID_X_CELLS 10 20 | #define DEF_GRID_Y_CELLS 10 21 | #define DEF_GRID_Z_CELLS 10 22 | #define DEF_DT_INIT 0.1 23 | #define DEF_MAX_ITERS 10000 24 | #define DEF_EPS 1.0E-15 25 | #define DEF_END_TIME 10.0 26 | #define DEF_END_STEP INT32_MAX 27 | #define DEF_SUMMARY_FREQUENCY 10 28 | #define DEF_KERNEL_LANGUAGE C 29 | #define DEF_COEFFICIENT CONDUCTIVITY 30 | #define DEF_ERROR_SWITCH 0 31 | #define DEF_PRESTEPS 30 32 | #define DEF_EPS_LIM 1E-5 33 | 
#define DEF_CHECK_RESULT 1 34 | #define DEF_PPCG_INNER_STEPS 10 35 | #define DEF_PRECONDITIONER 0 36 | #define DEF_SOLVER Solver::CG_SOLVER 37 | #define DEF_STAGING_BUFFER StagingBuffer::AUTO 38 | #define DEF_NUM_STATES 0 39 | #define DEF_NUM_CHUNKS 1 40 | #define DEF_NUM_CHUNKS_PER_RANK 1 41 | #define DEF_NUM_RANKS 1 42 | #define DEF_HALO_DEPTH 2 43 | #define DEF_RANK 0 44 | #define DEF_IS_OFFLOAD false 45 | 46 | // The type of solver to be run 47 | enum class Solver { JACOBI_SOLVER, CG_SOLVER, CHEBY_SOLVER, PPCG_SOLVER }; 48 | 49 | // The language of the kernels to be run 50 | enum class Kernel_Language { C, FORTRAN }; 51 | 52 | enum class StagingBuffer { ENABLE, DISABLE, AUTO }; 53 | 54 | enum class ModelKind { Host, Offload, Unified }; 55 | 56 | // The main settings structure 57 | struct Settings { 58 | // Set of system-wide profiles 59 | Profile *kernel_profile; 60 | Profile *application_profile; 61 | Profile *wallclock_profile; 62 | 63 | // Log files 64 | FILE *tea_out_fp; 65 | 66 | // Solve-wide constants 67 | int rank; 68 | int end_step; 69 | int presteps; 70 | int max_iters; 71 | int coefficient; 72 | int ppcg_inner_steps; 73 | int summary_frequency; 74 | int halo_depth; 75 | int num_states; 76 | int num_chunks; 77 | int num_chunks_per_rank; 78 | int num_ranks; 79 | bool *fields_to_exchange; 80 | 81 | bool is_offload; 82 | 83 | bool error_switch; 84 | bool check_result; 85 | bool preconditioner; 86 | 87 | double eps; 88 | double dt_init; 89 | double end_time; 90 | double eps_lim; 91 | 92 | // Input-Output files 93 | char *tea_in_filename; 94 | char *tea_out_filename; 95 | char *test_problem_filename; 96 | 97 | Solver solver; 98 | char *solver_name; 99 | 100 | Kernel_Language kernel_language; 101 | char *device_selector; 102 | std::string model_name; 103 | ModelKind model_kind; 104 | StagingBuffer staging_buffer_preference; 105 | bool staging_buffer; 106 | 107 | // Field dimensions 108 | int grid_x_cells; 109 | int grid_y_cells; 110 | 111 | double 
grid_x_min; 112 | double grid_y_min; 113 | double grid_x_max; 114 | double grid_y_max; 115 | 116 | double dx; 117 | double dy; 118 | }; 119 | 120 | // The accepted types of state geometry 121 | enum class Geometry { RECTANGULAR, CIRCULAR, POINT }; 122 | 123 | // State list 124 | struct State { 125 | bool defined; 126 | double density; 127 | double energy; 128 | double x_min; 129 | double y_min; 130 | double x_max; 131 | double y_max; 132 | double radius; 133 | Geometry geometry; 134 | }; 135 | 136 | void set_default_settings(Settings &settings); 137 | void reset_fields_to_exchange(Settings &settings); 138 | bool is_fields_to_exchange(Settings &settings); 139 | -------------------------------------------------------------------------------- /driver/shared.cpp: -------------------------------------------------------------------------------- 1 | #include "shared.h" 2 | #include "comms.h" 3 | 4 | // Initialises the log file pointer 5 | void initialise_log(Settings &settings) { 6 | // Only write to log in master rank 7 | if (settings.rank != MASTER) { 8 | return; 9 | } 10 | 11 | std::printf("# Opening %s as log file.\n", settings.tea_out_filename); 12 | std::fflush(stdout); 13 | settings.tea_out_fp = std::fopen(settings.tea_out_filename, "w"); 14 | 15 | if (!settings.tea_out_fp) { 16 | die(__LINE__, __FILE__, "Could not open log %s\n", settings.tea_out_filename); 17 | } 18 | } 19 | 20 | // Prints to stdout and then logs message in log file 21 | void print_and_log(Settings &settings, const char *format, ...) 
{ 22 | // Only master rank should print 23 | if (settings.rank != MASTER) { 24 | return; 25 | } 26 | 27 | va_list arglist; 28 | va_start(arglist, format); 29 | std::vprintf(format, arglist); 30 | va_end(arglist); 31 | std::fflush(stdout); 32 | 33 | if (!settings.tea_out_fp) { 34 | die(__LINE__, __FILE__, "Attempted to write to log before it was initialised\n"); 35 | } 36 | 37 | // Obtuse, but necessary 38 | va_list arglist2; 39 | va_start(arglist2, format); 40 | std::vfprintf(settings.tea_out_fp, format, arglist2); 41 | va_end(arglist2); 42 | std::fflush(settings.tea_out_fp); 43 | } 44 | 45 | // Logs message in log file 46 | void print_to_log(Settings &settings, const char *format, ...) { 47 | // Only master rank should log 48 | if (settings.rank != MASTER) { 49 | return; 50 | } 51 | 52 | if (!settings.tea_out_fp) { 53 | die(__LINE__, __FILE__, "Attempted to write to log before it was initialised\n"); 54 | } 55 | 56 | va_list arglist; 57 | va_start(arglist, format); 58 | std::vfprintf(settings.tea_out_fp, format, arglist); 59 | va_end(arglist); 60 | std::fflush(settings.tea_out_fp); 61 | } 62 | 63 | // Plots a two-dimensional dat file. 64 | void plot_2d(int x, int y, const double *buffer, const char *name) { 65 | // Open the plot file 66 | FILE *fp = std::fopen("plot2d.dat", "wb"); 67 | if (!fp) { 68 | std::printf("Could not open plot file.\n"); 69 | } 70 | 71 | double b_sum = 0.0; 72 | 73 | for (int jj = 0; jj < y; ++jj) { 74 | for (int kk = 0; kk < x; ++kk) { 75 | double val = buffer[kk + jj * x]; 76 | std::fprintf(fp, "%d %d %.12E\n", kk, jj, val); 77 | b_sum += val; 78 | } 79 | } 80 | 81 | std::printf("%s: %.12E\n", name, b_sum); 82 | std::fclose(fp); 83 | } 84 | 85 | // Aborts the application. 86 | void die(int lineNum, const char *file, const char *format, ...) 
{ 87 | // Print location of error 88 | std::printf("\x1b[31m"); 89 | std::printf("\nError at line %d in %s:", lineNum, file); 90 | std::printf("\x1b[0m \n"); 91 | 92 | va_list arglist; 93 | va_start(arglist, format); 94 | std::vprintf(format, arglist); 95 | va_end(arglist); 96 | std::fflush(stdout); 97 | 98 | abort_comms(); 99 | } 100 | 101 | // Write out data for visualisation in visit 102 | void write_to_visit(const int nx, const int ny, const int x_off, const int y_off, const double *data, const char *name, const int step, 103 | const double time) { 104 | char bovname[256]{}; 105 | char datname[256]{}; 106 | std::sprintf(bovname, "%s%d.bov", name, step); 107 | std::sprintf(datname, "%s%d.dat", name, step); 108 | 109 | FILE *bovfp = std::fopen(bovname, "w"); 110 | 111 | if (!bovfp) { 112 | std::printf("Could not open file %s\n", bovname); 113 | std::exit(1); 114 | } 115 | 116 | std::fprintf(bovfp, "TIME: %.4f\n", time); 117 | std::fprintf(bovfp, "DATA_FILE: %s\n", datname); 118 | std::fprintf(bovfp, "DATA_SIZE: %d %d 1\n", nx, ny); 119 | std::fprintf(bovfp, "DATA_FORMAT: DOUBLE\n"); 120 | std::fprintf(bovfp, "VARIABLE: density\n"); 121 | std::fprintf(bovfp, "DATA_ENDIAN: LITTLE\n"); 122 | std::fprintf(bovfp, "CENTERING: zone\n"); 123 | std::fprintf(bovfp, "BRICK_ORIGIN: 0. 0. 
0.\n"); 124 | 125 | std::fprintf(bovfp, "BRICK_SIZE: %d %d 1\n", nx, ny); 126 | std::fclose(bovfp); 127 | 128 | FILE *datfp = std::fopen(datname, "wb"); 129 | if (!datfp) { 130 | std::printf("Could not open file %s\n", datname); 131 | std::exit(1); 132 | } 133 | 134 | std::fwrite(data, sizeof(double), nx * ny, datfp); 135 | std::fclose(datfp); 136 | } 137 | -------------------------------------------------------------------------------- /driver/shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "profiler.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | struct Settings; 14 | 15 | // Shared function declarations 16 | void initialise_log(Settings &settings); 17 | void print_to_log(Settings &settings, const char *format, ...); 18 | void print_and_log(Settings &settings, const char *format, ...); 19 | void plot_2d(int x, int y, const double *buffer, const char *name); 20 | void die(int lineNum, const char *file, const char *format, ...); 21 | 22 | // Write out data for visualisation in visit 23 | void write_to_visit(int nx, int ny, int x_off, int y_off, const double *data, const char *name, int step, double time); 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | // Global constants 30 | #define MASTER 0 31 | 32 | #define NUM_FACES 4 33 | #define CHUNK_LEFT 0 34 | #define CHUNK_RIGHT 1 35 | #define CHUNK_BOTTOM 2 36 | #define CHUNK_TOP 3 37 | #define EXTERNAL_FACE -1 38 | 39 | #define FIELD_DENSITY 0 40 | #define FIELD_ENERGY0 1 41 | #define FIELD_ENERGY1 2 42 | #define FIELD_U 3 43 | #define FIELD_P 4 44 | #define FIELD_SD 5 45 | 46 | #define CONDUCTIVITY 1 47 | #define RECIP_CONDUCTIVITY 2 48 | 49 | #define CG_ITERS_FOR_EIGENVALUES 20 50 | #define ERROR_SWITCH_MAX 1.0 51 | 52 | #define tealeaf_MIN(a, b) ((a < b) ? a : b) 53 | #define tealeaf_MAX(a, b) ((a > b) ? 
a : b) 54 | #define tealeaf_strmatch(a, b) (strcmp(a, b) == 0) 55 | #define tealeaf_sign(a, b) ((b) < 0 ? -fabs(a) : fabs(a)) 56 | 57 | // Sparse Matrix Vector Product 58 | #define tealeaf_SMVP(a) \ 59 | (1.0 + (kx[index + 1] + kx[index]) + (ky[index + x] + ky[index])) * a[index] - \ 60 | (kx[index + 1] * a[index + 1] + kx[index] * a[index - 1]) - (ky[index + x] * a[index + x] + ky[index] * a[index - x]) 61 | 62 | #define GET_ARRAY_VALUE(len, buffer) \ 63 | temp = 0.0; \ 64 | for (int ii = 0; ii < len; ++ii) { \ 65 | temp += buffer[ii]; \ 66 | } \ 67 | printf("%s = %.12E\n", #buffer, temp); 68 | -------------------------------------------------------------------------------- /driver/solve_finished_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | 6 | // Calls all kernels that wrap up a solve regardless of solver 7 | void solve_finished_driver(Chunk *chunks, Settings &settings) { 8 | double exact_error = 0.0; 9 | 10 | if (settings.check_result) { 11 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 12 | if (settings.kernel_language == Kernel_Language::C) { 13 | run_calculate_residual(&(chunks[cc]), settings); 14 | 15 | run_calculate_2norm(&(chunks[cc]), settings, chunks[cc].r, &exact_error); 16 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 17 | } 18 | } 19 | 20 | sum_over_ranks(settings, &exact_error); 21 | } 22 | 23 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 24 | if (settings.kernel_language == Kernel_Language::C) { 25 | run_finalise(&(chunks[cc]), settings); 26 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 27 | } 28 | } 29 | 30 | settings.fields_to_exchange[FIELD_ENERGY1] = true; 31 | halo_update_driver(chunks, settings, 1); 32 | } 33 | -------------------------------------------------------------------------------- 
/driver/store_energy_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | 4 | // Invokes the store energy kernel 5 | void store_energy_driver(Chunk *chunks, Settings &settings) { 6 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 7 | if (settings.kernel_language == Kernel_Language::C) { 8 | run_store_energy(&(chunks[cc]), settings); 9 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 10 | // Fortran store energy kernel 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/cuda/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | #include "shared.h" 4 | 5 | __global__ void cheby_init(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, const double *kx, 6 | const double *ky, const double theta, double *p, double *r, double *w) { 7 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 8 | if (gid >= x_inner * y_inner) return; 9 | 10 | const int x = x_inner + 2 * halo_depth; 11 | const int col = gid % x_inner; 12 | const int row = gid / x_inner; 13 | const int off0 = halo_depth * (x + 1); 14 | const int index = off0 + col + row * x; 15 | 16 | const double smvp = tealeaf_SMVP(u); 17 | w[index] = smvp; 18 | r[index] = u0[index] - w[index]; 19 | p[index] = r[index] / theta; 20 | } 21 | 22 | __global__ void cheby_calc_u(const int x_inner, const int y_inner, const int halo_depth, const double *p, double *u) { 23 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 24 | if (gid >= x_inner * y_inner) return; 25 | 26 | const int x = x_inner + 2 * halo_depth; 27 | const int col = gid % x_inner; 28 | const int row = gid / x_inner; 29 | const int off0 = halo_depth * (x + 1); 30 | const int index = off0 + col + row * x; 31 | 32 | u[index] += 
p[index]; 33 | } 34 | 35 | __global__ void cheby_calc_p(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, 36 | const double *kx, const double *ky, const double alpha, const double beta, double *p, double *r, double *w) { 37 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 38 | if (gid >= x_inner * y_inner) return; 39 | 40 | const int x = x_inner + 2 * halo_depth; 41 | const int col = gid % x_inner; 42 | const int row = gid / x_inner; 43 | const int off0 = halo_depth * (x + 1); 44 | const int index = off0 + col + row * x; 45 | 46 | const double smvp = tealeaf_SMVP(u); 47 | w[index] = smvp; 48 | r[index] = u0[index] - w[index]; 49 | p[index] = alpha * p[index] + beta * r[index]; 50 | } 51 | 52 | // Chebyshev solver kernels 53 | void run_cheby_init(Chunk *chunk, Settings &settings) { 54 | KERNELS_START(2 * settings.halo_depth); 55 | cheby_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, chunk->theta, 56 | chunk->p, chunk->r, chunk->w); 57 | KERNELS_END(); 58 | } 59 | 60 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 61 | KERNELS_START(2 * settings.halo_depth); 62 | cheby_calc_p<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, alpha, beta, 63 | chunk->p, chunk->r, chunk->w); 64 | cheby_calc_u<<>>(x_inner, y_inner, settings.halo_depth, chunk->p, chunk->u); 65 | KERNELS_END(); 66 | } -------------------------------------------------------------------------------- /src/cuda/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | 6 | struct ChunkExtension { 7 | double *d_reduce_buffer; 8 | double *d_reduce_buffer2; 9 | double *d_reduce_buffer3; 10 | double *d_reduce_buffer4; 11 | }; 12 | 
-------------------------------------------------------------------------------- /src/cuda/cuknl_shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "shared.h" 4 | #ifndef BLOCK_SIZE 5 | #define BLOCK_SIZE 256 6 | #endif 7 | 8 | #ifdef CLOVER_MANAGED_ALLOC 9 | #define CLOVER_MEMCPY_KIND_D2H (cudaMemcpyDefault) 10 | #define CLOVER_MEMCPY_KIND_H2D (cudaMemcpyDefault) 11 | #else 12 | #define CLOVER_MEMCPY_KIND_D2H (cudaMemcpyDeviceToHost) 13 | #define CLOVER_MEMCPY_KIND_H2D (cudaMemcpyHostToDevice) 14 | #endif 15 | 16 | __device__ inline double SUM(double a, double b) { return a + b; } 17 | 18 | template class reduce { 19 | public: 20 | __device__ inline static void run(T *array, T *out, T (*func)(T, T)) { 21 | // only need to sync if not working within a warp 22 | if (offset > 16) { 23 | __syncthreads(); 24 | } 25 | 26 | // only continue if it's in the lower half 27 | if (threadIdx.x < offset) { 28 | array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]); 29 | reduce::run(array, out, func); 30 | } 31 | } 32 | }; 33 | 34 | template class reduce { 35 | public: 36 | __device__ inline static void run(T *array, T *out, T (*)(T, T)) { out[blockIdx.x] = array[0]; } 37 | }; 38 | 39 | inline void check_errors(int line_num, const char *file) { 40 | cudaDeviceSynchronize(); 41 | if (auto result = cudaGetLastError(); result != cudaSuccess) { 42 | die(line_num, file, "Error in %s - return code %d (%s)\n", file, result, cudaGetErrorName(result)); 43 | } 44 | } 45 | 46 | void sum_reduce_buffer(double *buffer, double *result, int len); 47 | 48 | #define KERNELS_START(pad) \ 49 | START_PROFILING(settings.kernel_profile); \ 50 | int x_inner = chunk->x - (pad); \ 51 | int y_inner = chunk->y - (pad); \ 52 | int num_blocks = ceil((double)(x_inner * y_inner) / double(BLOCK_SIZE)); \ 53 | do { \ 54 | } while (false) 55 | #ifdef CLOVER_SYNC_ALL_KERNELS 56 | #define KERNELS_END() \ 57 | 
check_errors(__LINE__, __FILE__); \ 58 | STOP_PROFILING(settings.kernel_profile, __func__); 59 | #else 60 | #define KERNELS_END() STOP_PROFILING(settings.kernel_profile, __func__) 61 | #endif 62 | -------------------------------------------------------------------------------- /src/cuda/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | #include "shared.h" 4 | 5 | // Core computation for Jacobi solver. 6 | __global__ void jacobi_iterate(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 7 | const double *u0, const double *r, double *u, double *error) { 8 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 9 | __shared__ double error_local[BLOCK_SIZE]; 10 | 11 | const int x = x_inner + 2 * halo_depth; 12 | const int col = gid % x_inner; 13 | const int row = gid / x_inner; 14 | const int off0 = halo_depth * (x + 1); 15 | const int index = off0 + col + row * x; 16 | 17 | if (gid < x_inner * y_inner) { 18 | u[index] = 19 | (u0[index] + kx[index + 1] * r[index + 1] + kx[index] * r[index - 1] + ky[index + x] * r[index + x] + ky[index] * r[index - x]) / 20 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 21 | 22 | error_local[threadIdx.x] = fabs(u[index] - r[index]); 23 | } else { 24 | error_local[threadIdx.x] = 0.0; 25 | } 26 | 27 | reduce::run(error_local, error, SUM); 28 | } 29 | 30 | __global__ void jacobi_init(const int x_inner, const int y_inner, const int halo_depth, const double *density, const double *energy, 31 | const double rx, const double ry, double *kx, double *ky, double *u0, double *u, const int coefficient) { 32 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 33 | if (gid >= x_inner * y_inner) return; 34 | 35 | const int x = x_inner + 2 * halo_depth; 36 | const int col = gid % x_inner; 37 | const int row = gid / x_inner; 38 | const int off0 = halo_depth * (x + 1); 39 | const int 
index = off0 + col + row * x; 40 | 41 | const double u_temp = energy[index] * density[index]; 42 | u0[index] = u_temp; 43 | u[index] = u_temp; 44 | 45 | if (row == 0 || col == 0) return; 46 | 47 | double density_center; 48 | double density_left; 49 | double density_down; 50 | 51 | if (coefficient == CONDUCTIVITY) { 52 | density_center = density[index]; 53 | density_left = density[index - 1]; 54 | density_down = density[index - x]; 55 | } else if (coefficient == RECIP_CONDUCTIVITY) { 56 | density_center = 1.0 / density[index]; 57 | density_left = 1.0 / density[index - 1]; 58 | density_down = 1.0 / density[index - x]; 59 | } 60 | 61 | kx[index] = rx * (density_left + density_center) / (2.0 * density_left * density_center); 62 | ky[index] = ry * (density_down + density_center) / (2.0 * density_down * density_center); 63 | } 64 | 65 | __global__ void jacobi_copy_u(const int x_inner, const int y_inner, const double *src, double *dest) { 66 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 67 | 68 | if (gid < x_inner * y_inner) { 69 | dest[gid] = src[gid]; 70 | } 71 | } 72 | 73 | // Jacobi solver kernels 74 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 75 | KERNELS_START(2 * settings.halo_depth); 76 | jacobi_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->density, chunk->energy, rx, ry, chunk->kx, 77 | chunk->ky, chunk->u0, chunk->u, settings.coefficient); 78 | KERNELS_END(); 79 | } 80 | 81 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 82 | KERNELS_START(2 * settings.halo_depth); 83 | jacobi_iterate<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->u0, chunk->r, chunk->u, 84 | chunk->ext->d_reduce_buffer); 85 | sum_reduce_buffer(chunk->ext->d_reduce_buffer, error, num_blocks); 86 | KERNELS_END(); 87 | } 88 | -------------------------------------------------------------------------------- /src/cuda/local_halos.cpp: 
-------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | #include "shared.h" 4 | 5 | __global__ void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 6 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 7 | if (gid >= x * depth) return; 8 | 9 | const int lines = gid / x; 10 | const int offset = x * halo_depth; 11 | const int from_index = offset + gid; 12 | const int to_index = from_index - (1 + lines * 2) * x; 13 | buffer[to_index] = buffer[from_index]; 14 | } 15 | 16 | __global__ void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 17 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 18 | if (gid >= x * depth) return; 19 | 20 | const int lines = gid / x; 21 | const int offset = x * (y - halo_depth); 22 | const int to_index = offset + gid; 23 | const int from_index = to_index - (1 + lines * 2) * x; 24 | buffer[to_index] = buffer[from_index]; 25 | } 26 | 27 | __global__ void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 28 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 29 | if (gid >= y * depth) return; 30 | 31 | const int flip = gid % depth; 32 | const int lines = gid / depth; 33 | const int offset = halo_depth + lines * (x - depth); 34 | const int from_index = offset + gid; 35 | const int to_index = from_index - (1 + flip * 2); 36 | 37 | buffer[to_index] = buffer[from_index]; 38 | } 39 | 40 | __global__ void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 41 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 42 | if (gid >= y * depth) return; 43 | 44 | const int flip = gid % depth; 45 | const int lines = gid / depth; 46 | const int offset = x - halo_depth + lines * (x - depth); 47 | const int to_index = offset + gid; 48 | const int from_index = to_index - (1 + flip * 
2); 49 | 50 | buffer[to_index] = buffer[from_index]; 51 | } 52 | 53 | // Updates faces in turn. 54 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer) { 55 | int num_blocks = std::ceil((x * depth) / (double)BLOCK_SIZE); 56 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 57 | update_top<<>>(x, y, halo_depth, depth, buffer); 58 | check_errors(__LINE__, __FILE__); 59 | } 60 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 61 | update_bottom<<>>(x, y, halo_depth, depth, buffer); 62 | check_errors(__LINE__, __FILE__); 63 | } 64 | 65 | num_blocks = std::ceil((y * depth) / (float)BLOCK_SIZE); 66 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 67 | update_right<<>>(x, y, halo_depth, depth, buffer); 68 | check_errors(__LINE__, __FILE__); 69 | } 70 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 71 | update_left<<>>(x, y, halo_depth, depth, buffer); 72 | check_errors(__LINE__, __FILE__); 73 | } 74 | } 75 | 76 | // The kernel for updating halos locally 77 | void local_halos(const int x, const int y, const int halo_depth, const int depth, const int *chunk_neighbours, 78 | const bool *fields_to_exchange, double *density, double *energy0, double *energy, double *u, double *p, double *sd) { 79 | if (fields_to_exchange[FIELD_DENSITY]) { 80 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 81 | } 82 | if (fields_to_exchange[FIELD_P]) { 83 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 84 | } 85 | if (fields_to_exchange[FIELD_ENERGY0]) { 86 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 87 | } 88 | if (fields_to_exchange[FIELD_ENERGY1]) { 89 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 90 | } 91 | if (fields_to_exchange[FIELD_U]) { 92 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 93 | } 94 | if (fields_to_exchange[FIELD_SD]) { 95 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 96 
| } 97 | } 98 | 99 | // Solver-wide kernels 100 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 101 | START_PROFILING(settings.kernel_profile); 102 | 103 | local_halos(chunk->x, chunk->y, settings.halo_depth, depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 104 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd); 105 | 106 | STOP_PROFILING(settings.kernel_profile, __func__); 107 | } 108 | -------------------------------------------------------------------------------- /src/cuda/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection, this is used for host compilation" 4 | "c++") 5 | 6 | register_flag_required(CMAKE_CUDA_COMPILER 7 | "Path to the CUDA nvcc compiler") 8 | 9 | # XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly 10 | register_flag_required(CUDA_ARCH 11 | "Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc") 12 | 13 | register_flag_optional(CUDA_EXTRA_FLAGS 14 | "Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" 15 | "") 16 | 17 | 18 | register_flag_optional(MANAGED_ALLOC "Use UVM (cudaMallocManaged) instead of the device-only allocation (cudaMalloc)" 19 | "OFF") 20 | 21 | register_flag_optional(SYNC_ALL_KERNELS 22 | "Fully synchronise all kernels after launch, this also enables synchronous error checking with line and file name" 23 | "OFF") 24 | 25 | 26 | macro(setup) 27 | 28 | # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes 29 | if (POLICY CMP0104) 30 | cmake_policy(SET CMP0104 OLD) 31 | endif () 32 | 33 | set(CMAKE_CXX_STANDARD 17) 34 | enable_language(CUDA) 35 | 36 | # add -forward-unknown-to-host-compiler for compatibility reasons 37 | # add -std=c++17 manually as older CMake seems to omit this (source gets treated as C 
otherwise) 38 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++17 -forward-unknown-to-host-compiler -arch=${CUDA_ARCH} -use_fast_math -restrict -keep ${CUDA_EXTRA_FLAGS}") 39 | 40 | # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG 41 | # appended later 42 | wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) 43 | 44 | if (MANAGED_ALLOC) 45 | register_definitions(CLOVER_MANAGED_ALLOC) 46 | endif () 47 | 48 | if (SYNC_ALL_KERNELS) 49 | register_definitions(CLOVER_SYNC_ALL_KERNELS) 50 | endif () 51 | 52 | message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") 53 | endmacro() 54 | 55 | 56 | macro(setup_target NAME) 57 | # Treat everything as CUDA source 58 | get_target_property(PROJECT_SRC "${NAME}" SOURCES) 59 | foreach (SRC ${PROJECT_SRC}) 60 | set_source_files_properties("${SRC}" PROPERTIES LANGUAGE CUDA) 61 | endforeach () 62 | endmacro() 63 | -------------------------------------------------------------------------------- /src/cuda/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | 4 | __global__ void ppcg_init(const int x_inner, const int y_inner, const int halo_depth, const double theta, const double *r, double *sd) { 5 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 6 | if (gid >= x_inner * y_inner) return; 7 | 8 | const int x = x_inner + 2 * halo_depth; 9 | const int col = gid % x_inner; 10 | const int row = gid / x_inner; 11 | const int off0 = halo_depth * (x + 1); 12 | const int index = off0 + col + row * x; 13 | 14 | sd[index] = r[index] / theta; 15 | } 16 | 17 | __global__ void ppcg_calc_ur(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 18 | const double *sd, double *u, double *r) { 19 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 20 | if (gid >= x_inner * y_inner) return; 21 | 22 | const int x = x_inner 
+ 2 * halo_depth; 23 | const int col = gid % x_inner; 24 | const int row = gid / x_inner; 25 | const int off0 = halo_depth * (x + 1); 26 | const int index = off0 + col + row * x; 27 | 28 | const double smvp = (1.0 + (kx[index + 1] + kx[index]) + (ky[index + x] + ky[index])) * sd[index] - 29 | (kx[index + 1] * sd[index + 1] + kx[index] * sd[index - 1]) - 30 | (ky[index + x] * sd[index + x] + ky[index] * sd[index - x]); 31 | 32 | r[index] -= smvp; 33 | u[index] += sd[index]; 34 | } 35 | 36 | __global__ void ppcg_calc_sd(const int x_inner, const int y_inner, const int halo_depth, const double alpha, const double beta, 37 | const double *r, double *sd) { 38 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 39 | if (gid >= x_inner * y_inner) return; 40 | 41 | const int x = x_inner + 2 * halo_depth; 42 | const int col = gid % x_inner; 43 | const int row = gid / x_inner; 44 | const int off0 = halo_depth * (x + 1); 45 | const int index = off0 + col + row * x; 46 | 47 | sd[index] = alpha * sd[index] + beta * r[index]; 48 | } 49 | 50 | // PPCG solver kernels 51 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 52 | KERNELS_START(2 * settings.halo_depth); 53 | ppcg_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 54 | KERNELS_END(); 55 | } 56 | 57 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 58 | KERNELS_START(2 * settings.halo_depth); 59 | ppcg_calc_ur<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->sd, chunk->u, chunk->r); 60 | ppcg_calc_sd<<>>(x_inner, y_inner, settings.halo_depth, alpha, beta, chunk->r, chunk->sd); 61 | KERNELS_END(); 62 | } -------------------------------------------------------------------------------- /src/hip/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | #include "shared.h" 6 | 7 | __global__ 
void cheby_init(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, const double *kx, 8 | const double *ky, const double theta, double *p, double *r, double *w) { 9 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 10 | if (gid >= x_inner * y_inner) return; 11 | 12 | const int x = x_inner + 2 * halo_depth; 13 | const int col = gid % x_inner; 14 | const int row = gid / x_inner; 15 | const int off0 = halo_depth * (x + 1); 16 | const int index = off0 + col + row * x; 17 | 18 | const double smvp = tealeaf_SMVP(u); 19 | w[index] = smvp; 20 | r[index] = u0[index] - w[index]; 21 | p[index] = r[index] / theta; 22 | } 23 | 24 | __global__ void cheby_calc_u(const int x_inner, const int y_inner, const int halo_depth, const double *p, double *u) { 25 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 26 | if (gid >= x_inner * y_inner) return; 27 | 28 | const int x = x_inner + 2 * halo_depth; 29 | const int col = gid % x_inner; 30 | const int row = gid / x_inner; 31 | const int off0 = halo_depth * (x + 1); 32 | const int index = off0 + col + row * x; 33 | 34 | u[index] += p[index]; 35 | } 36 | 37 | __global__ void cheby_calc_p(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, 38 | const double *kx, const double *ky, const double alpha, const double beta, double *p, double *r, double *w) { 39 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 40 | if (gid >= x_inner * y_inner) return; 41 | 42 | const int x = x_inner + 2 * halo_depth; 43 | const int col = gid % x_inner; 44 | const int row = gid / x_inner; 45 | const int off0 = halo_depth * (x + 1); 46 | const int index = off0 + col + row * x; 47 | 48 | const double smvp = tealeaf_SMVP(u); 49 | w[index] = smvp; 50 | r[index] = u0[index] - w[index]; 51 | p[index] = alpha * p[index] + beta * r[index]; 52 | } 53 | 54 | // Chebyshev solver kernels 55 | void run_cheby_init(Chunk *chunk, Settings &settings) { 56 | KERNELS_START(2 * 
settings.halo_depth); 57 | cheby_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, chunk->theta, 58 | chunk->p, chunk->r, chunk->w); 59 | KERNELS_END(); 60 | } 61 | 62 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 63 | KERNELS_START(2 * settings.halo_depth); 64 | cheby_calc_p<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, alpha, beta, 65 | chunk->p, chunk->r, chunk->w); 66 | cheby_calc_u<<>>(x_inner, y_inner, settings.halo_depth, chunk->p, chunk->u); 67 | KERNELS_END(); 68 | } -------------------------------------------------------------------------------- /src/hip/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | 6 | struct ChunkExtension { 7 | double *d_reduce_buffer; 8 | double *d_reduce_buffer2; 9 | double *d_reduce_buffer3; 10 | double *d_reduce_buffer4; 11 | }; 12 | -------------------------------------------------------------------------------- /src/hip/cuknl_shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hip/hip_runtime.h" 4 | 5 | #include "shared.h" 6 | #ifndef BLOCK_SIZE 7 | #define BLOCK_SIZE 1024 // XXX anything less than 512 would break reduction 8 | #endif 9 | 10 | #ifdef CLOVER_MANAGED_ALLOC 11 | #define CLOVER_MEMCPY_KIND_D2H (hipMemcpyDefault) 12 | #define CLOVER_MEMCPY_KIND_H2D (hipMemcpyDefault) 13 | #else 14 | #define CLOVER_MEMCPY_KIND_D2H (hipMemcpyDeviceToHost) 15 | #define CLOVER_MEMCPY_KIND_H2D (hipMemcpyHostToDevice) 16 | #endif 17 | 18 | __device__ inline double SUM(double a, double b) { return a + b; } 19 | 20 | template class reduce { 21 | public: 22 | __device__ inline static void run(T *array, T *out, T (*func)(T, T)) { 23 | __syncthreads(); // don't optimise for sub-warp, always sync 24 
| // only continue if it's in the lower half 25 | if (threadIdx.x < offset) { 26 | array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]); 27 | reduce::run(array, out, func); 28 | } 29 | } 30 | }; 31 | 32 | template class reduce { 33 | public: 34 | __device__ inline static void run(T *array, T *out, T (*)(T, T)) { out[blockIdx.x] = array[0]; } 35 | }; 36 | 37 | inline void check_errors(int line_num, const char *file) { 38 | hipDeviceSynchronize(); 39 | if (auto result = hipGetLastError(); result != hipSuccess) { 40 | die(line_num, file, "Error in %s - return code %d (%s)\n", file, result, hipGetErrorName(result)); 41 | } 42 | } 43 | 44 | void sum_reduce_buffer(double *buffer, double *result, int len); 45 | 46 | #define KERNELS_START(pad) \ 47 | START_PROFILING(settings.kernel_profile); \ 48 | int x_inner = chunk->x - (pad); \ 49 | int y_inner = chunk->y - (pad); \ 50 | int num_blocks = ceil((double)(x_inner * y_inner) / double(BLOCK_SIZE)); \ 51 | do { \ 52 | } while (false) 53 | #ifdef CLOVER_SYNC_ALL_KERNELS 54 | #define KERNELS_END() \ 55 | check_errors(__LINE__, __FILE__); \ 56 | STOP_PROFILING(settings.kernel_profile, __func__); 57 | #else 58 | #define KERNELS_END() STOP_PROFILING(settings.kernel_profile, __func__) 59 | #endif 60 | -------------------------------------------------------------------------------- /src/hip/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | #include "shared.h" 6 | 7 | // Core computation for Jacobi solver. 
8 | __global__ void jacobi_iterate(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 9 | const double *u0, const double *r, double *u, double *error) { 10 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 11 | __shared__ double error_local[BLOCK_SIZE]; 12 | 13 | const int x = x_inner + 2 * halo_depth; 14 | const int col = gid % x_inner; 15 | const int row = gid / x_inner; 16 | const int off0 = halo_depth * (x + 1); 17 | const int index = off0 + col + row * x; 18 | 19 | if (gid < x_inner * y_inner) { 20 | u[index] = 21 | (u0[index] + kx[index + 1] * r[index + 1] + kx[index] * r[index - 1] + ky[index + x] * r[index + x] + ky[index] * r[index - x]) / 22 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 23 | 24 | error_local[threadIdx.x] = fabs(u[index] - r[index]); 25 | } else { 26 | error_local[threadIdx.x] = 0.0; 27 | } 28 | 29 | reduce::run(error_local, error, SUM); 30 | } 31 | 32 | __global__ void jacobi_init(const int x_inner, const int y_inner, const int halo_depth, const double *density, const double *energy, 33 | const double rx, const double ry, double *kx, double *ky, double *u0, double *u, const int coefficient) { 34 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 35 | if (gid >= x_inner * y_inner) return; 36 | 37 | const int x = x_inner + 2 * halo_depth; 38 | const int col = gid % x_inner; 39 | const int row = gid / x_inner; 40 | const int off0 = halo_depth * (x + 1); 41 | const int index = off0 + col + row * x; 42 | 43 | const double u_temp = energy[index] * density[index]; 44 | u0[index] = u_temp; 45 | u[index] = u_temp; 46 | 47 | if (row == 0 || col == 0) return; 48 | 49 | double density_center; 50 | double density_left; 51 | double density_down; 52 | 53 | if (coefficient == CONDUCTIVITY) { 54 | density_center = density[index]; 55 | density_left = density[index - 1]; 56 | density_down = density[index - x]; 57 | } else if (coefficient == RECIP_CONDUCTIVITY) { 58 | 
density_center = 1.0 / density[index]; 59 | density_left = 1.0 / density[index - 1]; 60 | density_down = 1.0 / density[index - x]; 61 | } 62 | 63 | kx[index] = rx * (density_left + density_center) / (2.0 * density_left * density_center); 64 | ky[index] = ry * (density_down + density_center) / (2.0 * density_down * density_center); 65 | } 66 | 67 | __global__ void jacobi_copy_u(const int x_inner, const int y_inner, const double *src, double *dest) { 68 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 69 | 70 | if (gid < x_inner * y_inner) { 71 | dest[gid] = src[gid]; 72 | } 73 | } 74 | 75 | // Jacobi solver kernels 76 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 77 | KERNELS_START(2 * settings.halo_depth); 78 | jacobi_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->density, chunk->energy, rx, ry, chunk->kx, 79 | chunk->ky, chunk->u0, chunk->u, settings.coefficient); 80 | KERNELS_END(); 81 | } 82 | 83 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 84 | KERNELS_START(2 * settings.halo_depth); 85 | jacobi_iterate<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->u0, chunk->r, chunk->u, 86 | chunk->ext->d_reduce_buffer); 87 | sum_reduce_buffer(chunk->ext->d_reduce_buffer, error, num_blocks); 88 | KERNELS_END(); 89 | } 90 | -------------------------------------------------------------------------------- /src/hip/local_halos.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | #include "shared.h" 6 | 7 | __global__ void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 8 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 9 | if (gid >= x * depth) return; 10 | 11 | const int lines = gid / x; 12 | const int offset = x * halo_depth; 13 | const int from_index = offset + gid; 14 | const int to_index 
= from_index - (1 + lines * 2) * x; 15 | buffer[to_index] = buffer[from_index]; 16 | } 17 | 18 | __global__ void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 19 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 20 | if (gid >= x * depth) return; 21 | 22 | const int lines = gid / x; 23 | const int offset = x * (y - halo_depth); 24 | const int to_index = offset + gid; 25 | const int from_index = to_index - (1 + lines * 2) * x; 26 | buffer[to_index] = buffer[from_index]; 27 | } 28 | 29 | __global__ void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 30 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 31 | if (gid >= y * depth) return; 32 | 33 | const int flip = gid % depth; 34 | const int lines = gid / depth; 35 | const int offset = halo_depth + lines * (x - depth); 36 | const int from_index = offset + gid; 37 | const int to_index = from_index - (1 + flip * 2); 38 | 39 | buffer[to_index] = buffer[from_index]; 40 | } 41 | 42 | __global__ void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 43 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 44 | if (gid >= y * depth) return; 45 | 46 | const int flip = gid % depth; 47 | const int lines = gid / depth; 48 | const int offset = x - halo_depth + lines * (x - depth); 49 | const int to_index = offset + gid; 50 | const int from_index = to_index - (1 + flip * 2); 51 | 52 | buffer[to_index] = buffer[from_index]; 53 | } 54 | 55 | // Updates faces in turn. 
56 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer) { 57 | int num_blocks = std::ceil((x * depth) / (double)BLOCK_SIZE); 58 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 59 | update_top<<>>(x, y, halo_depth, depth, buffer); 60 | check_errors(__LINE__, __FILE__); 61 | } 62 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 63 | update_bottom<<>>(x, y, halo_depth, depth, buffer); 64 | check_errors(__LINE__, __FILE__); 65 | } 66 | 67 | num_blocks = std::ceil((y * depth) / (float)BLOCK_SIZE); 68 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 69 | update_right<<>>(x, y, halo_depth, depth, buffer); 70 | check_errors(__LINE__, __FILE__); 71 | } 72 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 73 | update_left<<>>(x, y, halo_depth, depth, buffer); 74 | check_errors(__LINE__, __FILE__); 75 | } 76 | } 77 | 78 | // The kernel for updating halos locally 79 | void local_halos(const int x, const int y, const int halo_depth, const int depth, const int *chunk_neighbours, 80 | const bool *fields_to_exchange, double *density, double *energy0, double *energy, double *u, double *p, double *sd) { 81 | if (fields_to_exchange[FIELD_DENSITY]) { 82 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 83 | } 84 | if (fields_to_exchange[FIELD_P]) { 85 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 86 | } 87 | if (fields_to_exchange[FIELD_ENERGY0]) { 88 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 89 | } 90 | if (fields_to_exchange[FIELD_ENERGY1]) { 91 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 92 | } 93 | if (fields_to_exchange[FIELD_U]) { 94 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 95 | } 96 | if (fields_to_exchange[FIELD_SD]) { 97 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 98 | } 99 | } 100 | 101 | // Solver-wide kernels 102 | void run_local_halos(Chunk *chunk, Settings 
&settings, int depth) { 103 | START_PROFILING(settings.kernel_profile); 104 | 105 | local_halos(chunk->x, chunk->y, settings.halo_depth, depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 106 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd); 107 | 108 | STOP_PROFILING(settings.kernel_profile, __func__); 109 | } 110 | -------------------------------------------------------------------------------- /src/hip/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_required(CMAKE_CXX_COMPILER 3 | "Absolute path to the AMD HIP C++ compiler") 4 | 5 | register_flag_optional(MANAGED_ALLOC "Use UVM (hipMallocManaged) instead of the device-only allocation (hipMalloc)" 6 | "OFF") 7 | 8 | register_flag_optional(SYNC_ALL_KERNELS 9 | "Fully synchronise all kernels after launch, this also enables synchronous error checking with line and file name" 10 | "OFF") 11 | 12 | 13 | macro(setup) 14 | 15 | set(CMAKE_CXX_STANDARD 17) 16 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 17 | 18 | if (MANAGED_ALLOC) 19 | register_definitions(CLOVER_MANAGED_ALLOC) 20 | endif () 21 | 22 | if (SYNC_ALL_KERNELS) 23 | register_definitions(CLOVER_SYNC_ALL_KERNELS) 24 | endif () 25 | 26 | 27 | endmacro() -------------------------------------------------------------------------------- /src/hip/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | 6 | __global__ void ppcg_init(const int x_inner, const int y_inner, const int halo_depth, const double theta, const double *r, double *sd) { 7 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 8 | if (gid >= x_inner * y_inner) return; 9 | 10 | const int x = x_inner + 2 * halo_depth; 11 | const int col = gid % x_inner; 12 | const int row = gid / x_inner; 13 | const int off0 = halo_depth * (x + 1); 14 | const int index = off0 + col + row * 
x; 15 | 16 | sd[index] = r[index] / theta; 17 | } 18 | 19 | __global__ void ppcg_calc_ur(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 20 | const double *sd, double *u, double *r) { 21 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 22 | if (gid >= x_inner * y_inner) return; 23 | 24 | const int x = x_inner + 2 * halo_depth; 25 | const int col = gid % x_inner; 26 | const int row = gid / x_inner; 27 | const int off0 = halo_depth * (x + 1); 28 | const int index = off0 + col + row * x; 29 | 30 | const double smvp = (1.0 + (kx[index + 1] + kx[index]) + (ky[index + x] + ky[index])) * sd[index] - 31 | (kx[index + 1] * sd[index + 1] + kx[index] * sd[index - 1]) - 32 | (ky[index + x] * sd[index + x] + ky[index] * sd[index - x]); 33 | 34 | r[index] -= smvp; 35 | u[index] += sd[index]; 36 | } 37 | 38 | __global__ void ppcg_calc_sd(const int x_inner, const int y_inner, const int halo_depth, const double alpha, const double beta, 39 | const double *r, double *sd) { 40 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 41 | if (gid >= x_inner * y_inner) return; 42 | 43 | const int x = x_inner + 2 * halo_depth; 44 | const int col = gid % x_inner; 45 | const int row = gid / x_inner; 46 | const int off0 = halo_depth * (x + 1); 47 | const int index = off0 + col + row * x; 48 | 49 | sd[index] = alpha * sd[index] + beta * r[index]; 50 | } 51 | 52 | // PPCG solver kernels 53 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 54 | KERNELS_START(2 * settings.halo_depth); 55 | ppcg_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 56 | KERNELS_END(); 57 | } 58 | 59 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 60 | KERNELS_START(2 * settings.halo_depth); 61 | ppcg_calc_ur<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->sd, chunk->u, chunk->r); 62 | ppcg_calc_sd<<>>(x_inner, y_inner, settings.halo_depth, alpha, 
beta, chunk->r, chunk->sd); 63 | KERNELS_END(); 64 | } -------------------------------------------------------------------------------- /src/kokkos/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Initialises the Chebyshev solver 6 | void cheby_init(const int x, const int y, const int halo_depth, const double theta, KView &p, KView &r, KView &u, KView &u0, KView &w, 7 | KView &kx, KView &ky) { 8 | Kokkos::parallel_for( 9 | x * y, KOKKOS_LAMBDA(const int index) { 10 | const int kk = index % x; 11 | const int jj = index / x; 12 | 13 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 14 | const double smvp = tealeaf_SMVP(u); 15 | w[index] = smvp; 16 | r[index] = u0[index] - w[index]; 17 | p[index] = r[index] / theta; 18 | } 19 | }); 20 | } 21 | 22 | // Calculates U 23 | void cheby_calc_u(const int x, const int y, const int halo_depth, KView &p, KView &u) { 24 | Kokkos::parallel_for( 25 | x * y, KOKKOS_LAMBDA(const int index) { 26 | const int kk = index % x; 27 | const int jj = index / x; 28 | 29 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 30 | u[index] += p[index]; 31 | } 32 | }); 33 | } 34 | 35 | // The main Cheby iteration step 36 | void cheby_iterate(const int x, const int y, const int halo_depth, const double alpha, const double beta, KView &p, KView &r, KView &u, 37 | KView &u0, KView &w, KView &kx, KView &ky) { 38 | Kokkos::parallel_for( 39 | x * y, KOKKOS_LAMBDA(const int index) { 40 | const int kk = index % x; 41 | const int jj = index / x; 42 | 43 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 44 | const double smvp = tealeaf_SMVP(u); 45 | w[index] = smvp; 46 | r[index] = u0[index] - w[index]; 47 | p[index] = alpha * p[index] + beta * r[index]; 48 | } 49 | }); 50 | } 51 | 52 | // Chebyshev solver 
kernels 53 | void run_cheby_init(Chunk *chunk, Settings &settings) { 54 | START_PROFILING(settings.kernel_profile); 55 | 56 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, *chunk->p, *chunk->r, *chunk->u, *chunk->u0, *chunk->w, *chunk->kx, 57 | *chunk->ky); 58 | 59 | STOP_PROFILING(settings.kernel_profile, __func__); 60 | } 61 | 62 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 63 | START_PROFILING(settings.kernel_profile); 64 | 65 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, *chunk->p, *chunk->r, *chunk->u, *chunk->u0, *chunk->w, *chunk->kx, 66 | *chunk->ky); 67 | 68 | cheby_calc_u(chunk->x, chunk->y, settings.halo_depth, *chunk->p, *chunk->u); 69 | 70 | STOP_PROFILING(settings.kernel_profile, __func__); 71 | } -------------------------------------------------------------------------------- /src/kokkos/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kokkos_shared.hpp" 4 | #include 5 | 6 | using FieldBufferType = Kokkos::View *; 7 | using StagingBufferType = Kokkos::View::HostMirror *; 8 | struct ChunkExtension {}; 9 | -------------------------------------------------------------------------------- /src/kokkos/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Initialises the Jacobi solver 6 | void jacobi_init(const int x, const int y, const int halo_depth, const int coefficient, const double rx, const double ry, KView &u, 7 | KView &u0, KView &density, KView &energy, KView &kx, KView &ky) { 8 | Kokkos::parallel_for( 9 | x * y, KOKKOS_LAMBDA(const int index) { 10 | const int kk = index % x; 11 | const int jj = index / x; 12 | 13 | if (kk > 0 && kk < x - 1 && jj > 0 && jj < y - 1) { 14 | u0(index) = energy(index) * density(index); 15 | u(index) = u0(index); 16 | } 
17 | 18 | if (jj >= halo_depth && jj < y - 1 && kk >= halo_depth && kk < x - 1) { 19 | double densityCentre = (coefficient == CONDUCTIVITY) ? density(index) : 1.0 / density(index); 20 | double densityLeft = (coefficient == CONDUCTIVITY) ? density(index - 1) : 1.0 / density(index - 1); 21 | double densityDown = (coefficient == CONDUCTIVITY) ? density(index - x) : 1.0 / density(index - x); 22 | 23 | kx(index) = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 24 | ky(index) = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 25 | } 26 | }); 27 | } 28 | 29 | // Main Jacobi solver method. 30 | void jacobi_iterate(const int x, const int y, const int halo_depth, KView &u, KView &u0, KView &r, KView &kx, KView &ky, double *error) { 31 | Kokkos::parallel_reduce( 32 | x * y, 33 | KOKKOS_LAMBDA(const int index, double &temp_error) { 34 | const int kk = index % x; 35 | const int jj = index / x; 36 | 37 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 38 | u(index) = (u0(index) + (kx(index + 1) * r(index + 1) + kx(index) * r(index - 1)) + 39 | (ky(index + x) * r(index + x) + ky(index) * r(index - x))) / 40 | (1.0 + (kx(index) + kx(index + 1)) + (ky(index) + ky(index + x))); 41 | 42 | temp_error += Kokkos::fabs(u(index) - r(index)); 43 | } 44 | }, 45 | *error); 46 | } 47 | 48 | // Copies u into r 49 | void jacobi_copy_u(const int x, const int y, KView &r, KView &u) { 50 | Kokkos::parallel_for( 51 | x * y, KOKKOS_LAMBDA(const int index) { r(index) = u(index); }); 52 | } 53 | 54 | // Jacobi solver kernels 55 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 56 | START_PROFILING(settings.kernel_profile); 57 | 58 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, *chunk->u, *chunk->u0, *chunk->density, *chunk->energy, 59 | *chunk->kx, *chunk->ky); 60 | 61 | STOP_PROFILING(settings.kernel_profile, __func__); 62 | } 63 | 64 | 
// Runs one Jacobi step: snapshot u into r, then relax u against it.
void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) {
  START_PROFILING(settings.kernel_profile);

  jacobi_copy_u(chunk->x, chunk->y, *chunk->r, *chunk->u);

  jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->u0, *chunk->r, *chunk->kx, *chunk->ky, error);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// ---------------------------------------------------------------------------
// src/kokkos/kokkos_shared.hpp
// ---------------------------------------------------------------------------
#pragma once
#include // NOTE(review): include target lost in this dump (angle brackets stripped) — presumably <Kokkos_Core.hpp>; confirm upstream.
// NOTE(review): template arguments also stripped; presumably Kokkos::View<double *>.
using KView = Kokkos::View;

// ---------------------------------------------------------------------------
// src/kokkos/local_halos.cpp
// ---------------------------------------------------------------------------
#include "chunk.h"
#include "kokkos_shared.hpp"
#include "shared.h"

// Updates the local left halo region(s).
// For each of the y rows, mirrors interior columns [halo_depth, halo_depth+depth)
// into halo columns [halo_depth-depth, halo_depth) — a reflection about the
// halo/interior boundary.
void update_left(const int x, const int y, const int depth, const int halo_depth, KView &buffer) {
  Kokkos::parallel_for(
      y * depth, KOKKOS_LAMBDA(const int index) {
        const int flip = index % depth;  // position within the depth-wide band
        const int lines = index / depth; // row
        const int offset = lines * (x - depth);
        // to_index == lines*x + (halo_depth - depth) + flip
        const int to_index = offset + halo_depth - depth + index;
        // from_index mirrors to column halo_depth + (depth - flip) - 1
        const int from_index = to_index + 2 * (depth - flip) - 1;
        buffer(to_index) = buffer(from_index);
      });
}

// Updates the local right halo region(s).
// For each row, mirrors interior columns [x-halo_depth-depth, x-halo_depth)
// into halo columns [x-halo_depth, x-halo_depth+depth).
void update_right(const int x, const int y, const int depth, const int halo_depth, KView &buffer) {
  Kokkos::parallel_for(
      y * depth, KOKKOS_LAMBDA(const int index) {
        const int flip = index % depth;
        const int lines = index / depth;
        const int offset = x - halo_depth + lines * (x - depth);
        // to_index == lines*x + (x - halo_depth) + flip
        const int to_index = offset + index;
        // mirrored interior column: (x - halo_depth) - flip - 1
        const int from_index = to_index - (1 + flip * 2);
        buffer(to_index) = buffer(from_index);
      });
}
the local top halo region(s) 32 | void update_top(const int x, const int y, const int depth, const int halo_depth, KView &buffer) { 33 | Kokkos::parallel_for( 34 | x * depth, KOKKOS_LAMBDA(const int index) { 35 | const int lines = index / x; 36 | const int offset = x * (y - halo_depth); 37 | 38 | const int to_index = offset + index; 39 | const int from_index = to_index - (1 + lines * 2) * x; 40 | buffer(to_index) = buffer(from_index); 41 | }); 42 | } 43 | 44 | // Updates the local bottom halo region(s) 45 | void update_bottom(const int x, const int y, const int depth, const int halo_depth, KView &buffer) { 46 | Kokkos::parallel_for( 47 | x * depth, KOKKOS_LAMBDA(const int index) { 48 | const int lines = index / x; 49 | const int offset = x * halo_depth; 50 | 51 | const int from_index = offset + index; 52 | const int to_index = from_index - (1 + lines * 2) * x; 53 | buffer(to_index) = buffer(from_index); 54 | }); 55 | } 56 | 57 | // Updates faces in turn. 58 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, KView &buffer) { 59 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 60 | update_left(x, y, depth, halo_depth, buffer); 61 | } 62 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 63 | update_right(x, y, depth, halo_depth, buffer); 64 | } 65 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 66 | update_top(x, y, depth, halo_depth, buffer); 67 | } 68 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 69 | update_bottom(x, y, depth, halo_depth, buffer); 70 | } 71 | } 72 | 73 | // The kernel for updating halos locally 74 | void local_halos(const int x, const int y, const int depth, const int halo_depth, const int *chunk_neighbours, 75 | const bool *fields_to_exchange, KView &density, KView &energy0, KView &energy, KView &u, KView &p, KView &sd) { 76 | if (fields_to_exchange[FIELD_DENSITY]) { 77 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 78 | } 79 | if 
(fields_to_exchange[FIELD_P]) { 80 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 81 | } 82 | if (fields_to_exchange[FIELD_ENERGY0]) { 83 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 84 | } 85 | if (fields_to_exchange[FIELD_ENERGY1]) { 86 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 87 | } 88 | if (fields_to_exchange[FIELD_U]) { 89 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 90 | } 91 | if (fields_to_exchange[FIELD_SD]) { 92 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 93 | } 94 | } 95 | 96 | // Solver-wide kernels 97 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 98 | START_PROFILING(settings.kernel_profile); 99 | local_halos(chunk->x, chunk->y, depth, settings.halo_depth, chunk->neighbours, settings.fields_to_exchange, *chunk->density, 100 | *chunk->energy0, *chunk->energy, *chunk->u, *chunk->p, *chunk->sd); 101 | STOP_PROFILING(settings.kernel_profile, __func__); 102 | } 103 | -------------------------------------------------------------------------------- /src/kokkos/model.cmake: -------------------------------------------------------------------------------- 1 | register_flag_optional(CMAKE_CXX_COMPILER 2 | "Any CXX compiler that is supported by CMake detection and RAJA. 3 | See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" 4 | "c++") 5 | 6 | register_flag_optional(KOKKOS_IN_TREE 7 | "Absolute path to the *source* distribution directory of Kokkos. 8 | Remember to append Kokkos specific flags as well, for example: 9 | -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... 10 | See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") 11 | 12 | register_flag_optional(KOKKOS_IN_PACKAGE 13 | "Absolute path to package R-Path containing Kokkos libs. 14 | Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." 
"") 15 | 16 | # compiler vendor and arch specific flags 17 | set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) 18 | 19 | macro(setup) 20 | 21 | set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 22 | cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md 23 | 24 | 25 | if (EXISTS "${KOKKOS_IN_TREE}") 26 | message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") 27 | add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) 28 | register_link_library(Kokkos::kokkos) 29 | elseif (EXISTS "${KOKKOS_IN_PACKAGE}") 30 | message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") 31 | set(Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") 32 | find_package(Kokkos REQUIRED) 33 | register_link_library(Kokkos::kokkos) 34 | else () 35 | message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!") 36 | endif () 37 | 38 | register_append_compiler_and_arch_specific_cxx_flags( 39 | KOKKOS_FLAGS_CPU 40 | ${CMAKE_CXX_COMPILER_ID} 41 | ${CMAKE_SYSTEM_PROCESSOR} 42 | ) 43 | 44 | endmacro() 45 | -------------------------------------------------------------------------------- /src/kokkos/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Initialises Sd 6 | void ppcg_init(const int x, const int y, const int halo_depth, const double theta, KView &sd, KView &r) { 7 | Kokkos::parallel_for( 8 | x * y, KOKKOS_LAMBDA(const int index) { 9 | const int kk = index % x; 10 | const int jj = index / x; 11 | 12 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 13 | sd[index] = r[index] / theta; 14 | } 15 | }); 16 | } 17 | 18 | // Calculates U and R 19 | void ppcg_calc_ur(const int x, const int y, const int halo_depth, KView &sd, KView &r, KView &u, KView &kx, KView &ky) { 20 | Kokkos::parallel_for( 21 | x * y, 
KOKKOS_LAMBDA(const int index) { 22 | const int kk = index % x; 23 | const int jj = index / x; 24 | 25 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 26 | const double smvp = tealeaf_SMVP(sd); 27 | r[index] -= smvp; 28 | u[index] += sd[index]; 29 | } 30 | }); 31 | } 32 | 33 | // Calculates Sd 34 | void ppcg_calc_sd(const int x, const int y, const int halo_depth, const double theta, const double alpha, const double beta, KView &sd, 35 | KView &r) { 36 | Kokkos::parallel_for( 37 | x * y, KOKKOS_LAMBDA(const int index) { 38 | const int kk = index % x; 39 | const int jj = index / x; 40 | 41 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 42 | sd[index] = alpha * sd[index] + beta * r[index]; 43 | } 44 | }); 45 | } 46 | 47 | // PPCG solver kernels 48 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 49 | START_PROFILING(settings.kernel_profile); 50 | 51 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, *chunk->sd, *chunk->r); 52 | 53 | STOP_PROFILING(settings.kernel_profile, __func__); 54 | } 55 | 56 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 57 | START_PROFILING(settings.kernel_profile); 58 | 59 | ppcg_calc_ur(chunk->x, chunk->y, settings.halo_depth, *chunk->sd, *chunk->r, *chunk->u, *chunk->kx, *chunk->ky); 60 | 61 | ppcg_calc_sd(chunk->x, chunk->y, settings.halo_depth, chunk->theta, alpha, beta, *chunk->sd, *chunk->r); 62 | 63 | STOP_PROFILING(settings.kernel_profile, __func__); 64 | } -------------------------------------------------------------------------------- /src/kokkos/solver_methods.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Copies energy0 into energy1. 
6 | void store_energy(const int x, const int y, KView &energy, KView &energy0) { 7 | Kokkos::parallel_for( 8 | x * y, KOKKOS_LAMBDA(const int index) { energy(index) = energy0(index); }); 9 | } 10 | 11 | // Copies the inner u into u0. 12 | void copy_u(const int x, const int y, const int halo_depth, KView &u, KView &u0) { 13 | Kokkos::parallel_for( 14 | x * y, KOKKOS_LAMBDA(const int index) { 15 | const int kk = index % x; 16 | const int jj = index / x; 17 | 18 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 19 | u0(index) = u(index); 20 | } 21 | }); 22 | } 23 | 24 | // Calculates the residual r. 25 | void calculate_residual(const int x, const int y, const int halo_depth, KView &u, KView &u0, KView &r, KView &kx, KView &ky) { 26 | Kokkos::parallel_for( 27 | x * y, KOKKOS_LAMBDA(const int index) { 28 | const int kk = index % x; 29 | const int jj = index / x; 30 | 31 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 32 | const double smvp = tealeaf_SMVP(u); 33 | r(index) = u0(index) - smvp; 34 | } 35 | }); 36 | } 37 | 38 | // Calculates the 2 norm of the provided buffer. 39 | void calculate_2norm(const int x, const int y, const int halo_depth, KView &buffer, double *norm) { 40 | Kokkos::parallel_reduce( 41 | x * y, 42 | KOKKOS_LAMBDA(const int index, double &norm_temp) { 43 | const int kk = index % x; 44 | const int jj = index / x; 45 | 46 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 47 | norm_temp += buffer(index) * buffer(index); 48 | } 49 | }, 50 | *norm); 51 | } 52 | 53 | // Finalises the energy field. 
54 | void finalise(const int x, const int y, const int halo_depth, KView &u, KView &density, KView &energy) { 55 | Kokkos::parallel_for( 56 | x * y, KOKKOS_LAMBDA(const int index) { 57 | const int kk = index % x; 58 | const int jj = index / x; 59 | 60 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 61 | energy(index) = u(index) / density(index); 62 | } 63 | }); 64 | } 65 | 66 | void run_store_energy(Chunk *chunk, Settings &settings) { 67 | START_PROFILING(settings.kernel_profile); 68 | store_energy(chunk->x, chunk->y, *chunk->energy, *chunk->energy0); 69 | STOP_PROFILING(settings.kernel_profile, __func__); 70 | } 71 | 72 | void run_field_summary(Chunk *chunk, Settings &settings, double *vol, double *mass, double *ie, double *temp) { 73 | START_PROFILING(settings.kernel_profile); 74 | int x = chunk->x; 75 | int y = chunk->y; 76 | int halo_depth = settings.halo_depth; 77 | auto &u = *chunk->u; 78 | auto &density = *chunk->density; 79 | auto &energy0 = *chunk->energy0; 80 | auto &volume = *chunk->volume; 81 | 82 | Kokkos::parallel_reduce( 83 | chunk->x * chunk->y, 84 | KOKKOS_LAMBDA(const int index, double &vol, double &mass, double &ie, double &temp) { 85 | const int kk = index % x; 86 | const int jj = index / x; 87 | 88 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 89 | const double cellVol = volume[index]; 90 | const double cellMass = cellVol * density[index]; 91 | vol += cellVol; 92 | mass += cellMass; 93 | ie += cellMass * energy0[index]; 94 | temp += cellMass * u[index]; 95 | } 96 | }, 97 | *vol, *mass, *ie, *temp); 98 | STOP_PROFILING(settings.kernel_profile, __func__); 99 | } 100 | 101 | // Shared solver kernels 102 | void run_copy_u(Chunk *chunk, Settings &settings) { 103 | START_PROFILING(settings.kernel_profile); 104 | 105 | copy_u(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->u0); 106 | 107 | STOP_PROFILING(settings.kernel_profile, __func__); 108 | } 109 
// Profiled driver-facing wrapper for calculate_residual.
void run_calculate_residual(Chunk *chunk, Settings &settings) {
  START_PROFILING(settings.kernel_profile);

  calculate_residual(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->u0, *chunk->r, *chunk->kx, *chunk->ky);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// Profiled wrapper for calculate_2norm on an arbitrary field buffer.
void run_calculate_2norm(Chunk *chunk, Settings &settings, FieldBufferType buffer, double *norm) {
  START_PROFILING(settings.kernel_profile);

  calculate_2norm(chunk->x, chunk->y, settings.halo_depth, *buffer, norm);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// Profiled wrapper for finalise (energy = u / density over the interior).
void run_finalise(Chunk *chunk, Settings &settings) {
  START_PROFILING(settings.kernel_profile);

  finalise(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->density, *chunk->energy);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// ---------------------------------------------------------------------------
// src/omp/cheby.cpp
// ---------------------------------------------------------------------------
#include "chunk.h"
#include "shared.h"

/*
 * CHEBYSHEV SOLVER KERNEL
 */

// Calculates the new value for u.
9 | void cheby_calc_u(const int x, const int y, const int halo_depth, double *u, const double *p) { 10 | #ifdef OMP_TARGET 11 | #pragma omp target teams distribute parallel for simd collapse(2) 12 | #else 13 | #pragma omp parallel for 14 | #endif 15 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 16 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 17 | const int index = kk + jj * x; 18 | u[index] += p[index]; 19 | } 20 | } 21 | } 22 | 23 | // Initialises the Chebyshev solver 24 | void cheby_init(const int x, const int y, const int halo_depth, const double theta, double *u, const double *u0, double *p, double *r, 25 | double *w, const double *kx, const double *ky) { 26 | #ifdef OMP_TARGET 27 | #pragma omp target teams distribute parallel for simd collapse(2) 28 | #else 29 | #pragma omp parallel for 30 | #endif 31 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 32 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 33 | const int index = kk + jj * x; 34 | const double smvp = tealeaf_SMVP(u); 35 | w[index] = smvp; 36 | r[index] = u0[index] - w[index]; 37 | p[index] = r[index] / theta; 38 | } 39 | } 40 | 41 | cheby_calc_u(x, y, halo_depth, u, p); 42 | } 43 | 44 | // The main chebyshev iteration 45 | void cheby_iterate(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, const double *u0, double *p, 46 | double *r, double *w, const double *kx, const double *ky) { 47 | #ifdef OMP_TARGET 48 | #pragma omp target teams distribute parallel for simd collapse(2) 49 | #else 50 | #pragma omp parallel for 51 | #endif 52 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 53 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 54 | const int index = kk + jj * x; 55 | const double smvp = tealeaf_SMVP(u); 56 | w[index] = smvp; 57 | r[index] = u0[index] - w[index]; 58 | p[index] = alpha * p[index] + beta * r[index]; 59 | } 60 | } 61 | 62 | cheby_calc_u(x, y, halo_depth, u, p); 63 | } 64 | 65 | // Chebyshev 
// Chebyshev solver kernels (OpenMP variant): profiled driver-facing wrappers.
void run_cheby_init(Chunk *chunk, Settings &settings) {
  START_PROFILING(settings.kernel_profile);
  cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx,
             chunk->ky);
  STOP_PROFILING(settings.kernel_profile, __func__);
}

void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) {
  START_PROFILING(settings.kernel_profile);
  cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx,
                chunk->ky);
  STOP_PROFILING(settings.kernel_profile, __func__);
}

// ---------------------------------------------------------------------------
// src/omp/chunk_extension.h
// ---------------------------------------------------------------------------
#pragma once

// OpenMP model stores fields and staging buffers as raw host pointers.
using FieldBufferType = double *;
using StagingBufferType = double *;
struct ChunkExtension {};

// ---------------------------------------------------------------------------
// src/omp/diffuse_overload.cpp
// ---------------------------------------------------------------------------
#ifdef OMP_TARGET

#include "application.h"
#include "drivers.h"

// Defined elsewhere in the application; runs a single timestep.
void solve(Chunk *chunks, Settings &settings, int tt, double *wallclock_prev);

// An implementation specific overload of the main timestep loop.
// Maps all field and communication buffers onto the target device once,
// keeps them resident across every timestep, and copies the field data
// back only after the loop finishes.
bool diffuse_overload(Chunk *chunks, Settings &settings) {
  int n = chunks->x * chunks->y;

  print_and_log(settings, "This implementation overloads the diffuse function.\n");

  // Currently have to place all structure enclose pointers
  // into local variables for OMP 4.0 to accept them in mapping clauses
  double *r = chunks->r;
  double *sd = chunks->sd;
  double *kx = chunks->kx;
  double *ky = chunks->ky;
  double *w = chunks->w;
  double *p = chunks->p;
  double *cheby_alphas = chunks->cheby_alphas;
  double *cheby_betas = chunks->cheby_betas;
  double *cg_alphas = chunks->cg_alphas;
  double *cg_betas = chunks->cg_betas;
  double *energy = chunks->energy;
  double *density = chunks->density;
  double *energy0 = chunks->energy0;
  double *density0 = chunks->density0;
  double *u = chunks->u;
  double *u0 = chunks->u0;

  double *left_send = chunks->left_send;
  double *left_recv = chunks->left_recv;
  double *right_send = chunks->right_send;
  double *right_recv = chunks->right_recv;
  double *top_send = chunks->top_send;
  double *top_recv = chunks->top_recv;
  double *bottom_send = chunks->bottom_send;
  double *bottom_recv = chunks->bottom_recv;

  // Tells the kernels to run their `if (is_offload)` target variants.
  settings.is_offload = true;

  // Halo staging buffer sizes: one strip per field per halo row/column.
  int lr_len = chunks->y * settings.halo_depth * NUM_FIELDS;
  int tb_len = chunks->x * settings.halo_depth * NUM_FIELDS;

  // NOTE: the comma between map clauses below is permitted — OpenMP clauses
  // may be separated by commas or whitespace.
  #pragma omp target enter data map(to : r[ : n], sd[ : n], kx[ : n], ky[ : n], w[ : n], p[ : n], cheby_alphas[ : settings.max_iters], \
                                        cheby_betas[ : settings.max_iters], cg_alphas[ : settings.max_iters], \
                                        cg_betas[ : settings.max_iters]) \
      map(to : density[ : n], energy[ : n], density0[ : n], energy0[ : n], u[ : n], u0[ : n]), \
      map(alloc : left_send[ : lr_len], left_recv[ : lr_len], right_send[ : lr_len], right_recv[ : lr_len], top_send[ : tb_len], \
                  top_recv[ : tb_len], bottom_send[ : tb_len], bottom_recv[ : tb_len])

  double wallclock_prev = 0.0;
  for (int tt = 0; tt < settings.end_step; ++tt) {
    solve(chunks, settings, tt, &wallclock_prev);
  }

  // Only the primary field data is copied back; work arrays are discarded.
  #pragma omp target exit data map(from : density[ : n], energy[ : n], density0[ : n], energy0[ : n], u[ : n], u0[ : n])

  settings.is_offload = false;

  return field_summary_driver(chunks, settings, true);
}

#endif
-------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include 4 | 5 | /* 6 | * JACOBI SOLVER KERNEL 7 | */ 8 | 9 | // Initialises the Jacobi solver 10 | void jacobi_init(const int x, const int y, const int halo_depth, const int coefficient, double rx, double ry, const double *density, 11 | const double *energy, double *u0, double *u, double *kx, double *ky) { 12 | if (coefficient < CONDUCTIVITY && coefficient < RECIP_CONDUCTIVITY) { 13 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 14 | } 15 | 16 | #ifdef OMP_TARGET 17 | #pragma omp target teams distribute parallel for simd collapse(2) 18 | #else 19 | #pragma omp parallel for 20 | #endif 21 | for (int jj = 1; jj < y - 1; ++jj) { 22 | for (int kk = 1; kk < x - 1; ++kk) { 23 | const int index = kk + jj * x; 24 | double temp = energy[index] * density[index]; 25 | u0[index] = temp; 26 | u[index] = temp; 27 | } 28 | } 29 | 30 | #ifdef OMP_TARGET 31 | #pragma omp target teams distribute parallel for simd collapse(2) 32 | #else 33 | #pragma omp parallel for 34 | #endif 35 | for (int jj = halo_depth; jj < y - 1; ++jj) { 36 | for (int kk = halo_depth; kk < x - 1; ++kk) { 37 | const int index = kk + jj * x; 38 | double densityCentre = (coefficient == CONDUCTIVITY) ? density[index] : 1.0 / density[index]; 39 | double densityLeft = (coefficient == CONDUCTIVITY) ? density[index - 1] : 1.0 / density[index - 1]; 40 | double densityDown = (coefficient == CONDUCTIVITY) ? 
density[index - x] : 1.0 / density[index - x]; 41 | 42 | kx[index] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 43 | ky[index] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 44 | } 45 | } 46 | } 47 | 48 | // The main Jacobi solve step 49 | void jacobi_iterate(const int x, const int y, const int halo_depth, double *error, const double *kx, const double *ky, const double *u0, 50 | double *u, double *r) { 51 | #ifdef OMP_TARGET 52 | #pragma omp target teams distribute parallel for simd collapse(2) 53 | #else 54 | #pragma omp parallel for 55 | #endif 56 | for (int jj = 0; jj < y; ++jj) { 57 | for (int kk = 0; kk < x; ++kk) { 58 | const int index = kk + jj * x; 59 | r[index] = u[index]; 60 | } 61 | } 62 | 63 | double err = 0.0; 64 | 65 | #ifdef OMP_TARGET 66 | #pragma omp target teams distribute parallel for simd reduction(+ : err) collapse(2) 67 | #else 68 | #pragma omp parallel for reduction(+ : err) 69 | #endif 70 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 71 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 72 | const int index = kk + jj * x; 73 | u[index] = (u0[index] + (kx[index + 1] * r[index + 1] + kx[index] * r[index - 1]) + 74 | (ky[index + x] * r[index + x] + ky[index] * r[index - x])) / 75 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 76 | 77 | err += fabs(u[index] - r[index]); 78 | } 79 | } 80 | 81 | *error = err; 82 | } 83 | 84 | // Jacobi solver kernels 85 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 86 | START_PROFILING(settings.kernel_profile); 87 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, chunk->density, chunk->energy, chunk->u0, chunk->u, 88 | chunk->kx, chunk->ky); 89 | STOP_PROFILING(settings.kernel_profile, __func__); 90 | } 91 | 92 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 93 | START_PROFILING(settings.kernel_profile); 94 | 
jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, error, chunk->kx, chunk->ky, chunk->u0, chunk->u, chunk->r); 95 | STOP_PROFILING(settings.kernel_profile, __func__); 96 | } -------------------------------------------------------------------------------- /src/omp/local_halos.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | // Update left halo. 5 | void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 6 | 7 | #ifdef OMP_TARGET 8 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 9 | #else 10 | #pragma omp parallel for 11 | #endif 12 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 13 | for (int kk = 0; kk < depth; ++kk) { 14 | int base = jj * x; 15 | buffer[base + (halo_depth - kk - 1)] = buffer[base + (halo_depth + kk)]; 16 | } 17 | } 18 | } 19 | 20 | // Update right halo. 21 | void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 22 | #ifdef OMP_TARGET 23 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 24 | #else 25 | #pragma omp parallel for 26 | #endif 27 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 28 | for (int kk = 0; kk < depth; ++kk) { 29 | int base = jj * x; 30 | buffer[base + (x - halo_depth + kk)] = buffer[base + (x - halo_depth - 1 - kk)]; 31 | } 32 | } 33 | } 34 | 35 | // Update top halo. 
36 | void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 37 | #ifdef OMP_TARGET 38 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 39 | #endif 40 | for (int jj = 0; jj < depth; ++jj) { 41 | #ifndef OMP_TARGET 42 | #pragma omp parallel for 43 | #endif 44 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 45 | int base = kk; 46 | buffer[base + (y - halo_depth + jj) * x] = buffer[base + (y - halo_depth - 1 - jj) * x]; 47 | } 48 | } 49 | } 50 | 51 | // Updates bottom halo. 52 | void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 53 | #ifdef OMP_TARGET 54 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 55 | #endif 56 | for (int jj = 0; jj < depth; ++jj) { 57 | #ifndef OMP_TARGET 58 | #pragma omp parallel for 59 | #endif 60 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 61 | int base = kk; 62 | buffer[base + (halo_depth - jj - 1) * x] = buffer[base + (halo_depth + jj) * x]; 63 | } 64 | } 65 | } 66 | 67 | // Updates faces in turn. 
68 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer, 69 | bool is_offload) { 70 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 71 | update_left(x, y, halo_depth, depth, buffer, is_offload); 72 | } 73 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 74 | update_right(x, y, halo_depth, depth, buffer, is_offload); 75 | } 76 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 77 | update_top(x, y, halo_depth, depth, buffer, is_offload); 78 | } 79 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 80 | update_bottom(x, y, halo_depth, depth, buffer, is_offload); 81 | } 82 | } 83 | 84 | // The kernel for updating halos locally 85 | void local_halos(const int x, const int y, const int depth, const int halo_depth, const int *chunk_neighbours, 86 | const bool *fields_to_exchange, double *density, double *energy0, double *energy, double *u, double *p, double *sd, 87 | bool is_offload) { 88 | if (fields_to_exchange[FIELD_DENSITY]) { 89 | update_face(x, y, halo_depth, chunk_neighbours, depth, density, is_offload); 90 | } 91 | if (fields_to_exchange[FIELD_P]) { 92 | update_face(x, y, halo_depth, chunk_neighbours, depth, p, is_offload); 93 | } 94 | if (fields_to_exchange[FIELD_ENERGY0]) { 95 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0, is_offload); 96 | } 97 | if (fields_to_exchange[FIELD_ENERGY1]) { 98 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy, is_offload); 99 | } 100 | if (fields_to_exchange[FIELD_U]) { 101 | update_face(x, y, halo_depth, chunk_neighbours, depth, u, is_offload); 102 | } 103 | if (fields_to_exchange[FIELD_SD]) { 104 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd, is_offload); 105 | } 106 | } 107 | 108 | // Solver-wide kernels 109 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 110 | START_PROFILING(settings.kernel_profile); 111 | local_halos(chunk->x, chunk->y, depth, 
settings.halo_depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 112 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd, settings.is_offload); 113 | STOP_PROFILING(settings.kernel_profile, __func__); 114 | } 115 | -------------------------------------------------------------------------------- /src/omp/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * PPCG SOLVER KERNEL 6 | */ 7 | 8 | // Initialises the PPCG solver 9 | void ppcg_init(const int x, const int y, const int halo_depth, double theta, const double *r, double *sd) { 10 | #ifdef OMP_TARGET 11 | #pragma omp target teams distribute parallel for simd collapse(2) 12 | #else 13 | #pragma omp parallel for 14 | #endif 15 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 16 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 17 | const int index = kk + jj * x; 18 | sd[index] = r[index] / theta; 19 | } 20 | } 21 | } 22 | 23 | // The PPCG inner iteration 24 | void ppcg_inner_iteration(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, double *r, const double *kx, 25 | const double *ky, double *sd) { 26 | #ifdef OMP_TARGET 27 | #pragma omp target teams distribute parallel for simd collapse(2) 28 | #else 29 | #pragma omp parallel for 30 | #endif 31 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 32 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 33 | const int index = kk + jj * x; 34 | const double smvp = tealeaf_SMVP(sd); 35 | r[index] -= smvp; 36 | u[index] += sd[index]; 37 | } 38 | } 39 | 40 | #ifdef OMP_TARGET 41 | #pragma omp target teams distribute parallel for simd collapse(2) 42 | #else 43 | #pragma omp parallel for 44 | #endif 45 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 46 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 47 | const int index = kk + jj * x; 48 | sd[index] = alpha * 
sd[index] + beta * r[index]; 49 | } 50 | } 51 | } 52 | 53 | // PPCG solver kernels 54 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 55 | START_PROFILING(settings.kernel_profile); 56 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 57 | STOP_PROFILING(settings.kernel_profile, __func__); 58 | } 59 | 60 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 61 | START_PROFILING(settings.kernel_profile); 62 | ppcg_inner_iteration(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->r, chunk->kx, chunk->ky, chunk->sd); 63 | STOP_PROFILING(settings.kernel_profile, __func__); 64 | } 65 | -------------------------------------------------------------------------------- /src/serial/cg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * CONJUGATE GRADIENT SOLVER KERNEL 6 | */ 7 | 8 | // Initialises the CG solver 9 | void cg_init(const int x, const int y, const int halo_depth, const int coefficient, double rx, double ry, double *rro, 10 | const double *density, const double *energy, double *u, double *p, double *r, double *w, double *kx, double *ky) { 11 | if (coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { 12 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 13 | } 14 | 15 | for (int jj = 0; jj < y; ++jj) { 16 | for (int kk = 0; kk < x; ++kk) { 17 | const int index = kk + jj * x; 18 | p[index] = 0.0; 19 | r[index] = 0.0; 20 | u[index] = energy[index] * density[index]; 21 | } 22 | } 23 | 24 | for (int jj = 1; jj < y - 1; ++jj) { 25 | for (int kk = 1; kk < x - 1; ++kk) { 26 | const int index = kk + jj * x; 27 | w[index] = (coefficient == CONDUCTIVITY) ? 
density[index] : 1.0 / density[index]; 28 | } 29 | } 30 | 31 | for (int jj = halo_depth; jj < y - 1; ++jj) { 32 | for (int kk = halo_depth; kk < x - 1; ++kk) { 33 | const int index = kk + jj * x; 34 | kx[index] = rx * (w[index - 1] + w[index]) / (2.0 * w[index - 1] * w[index]); 35 | ky[index] = ry * (w[index - x] + w[index]) / (2.0 * w[index - x] * w[index]); 36 | } 37 | } 38 | 39 | double rro_temp = 0.0; 40 | 41 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 42 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 43 | const int index = kk + jj * x; 44 | const double smvp = tealeaf_SMVP(u); 45 | w[index] = smvp; 46 | r[index] = u[index] - w[index]; 47 | p[index] = r[index]; 48 | rro_temp += r[index] * p[index]; 49 | } 50 | } 51 | 52 | // Sum locally 53 | *rro += rro_temp; 54 | } 55 | 56 | // Calculates w 57 | void cg_calc_w(const int x, const int y, const int halo_depth, double *pw, const double *p, double *w, const double *kx, const double *ky) { 58 | double pw_temp = 0.0; 59 | 60 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 61 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 62 | const int index = kk + jj * x; 63 | const double smvp = tealeaf_SMVP(p); 64 | w[index] = smvp; 65 | pw_temp += w[index] * p[index]; 66 | } 67 | } 68 | 69 | *pw += pw_temp; 70 | } 71 | 72 | // Calculates u and r 73 | void cg_calc_ur(const int x, const int y, const int halo_depth, const double alpha, double *rrn, double *u, const double *p, double *r, 74 | const double *w) { 75 | double rrn_temp = 0.0; 76 | 77 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 78 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 79 | const int index = kk + jj * x; 80 | 81 | u[index] += alpha * p[index]; 82 | r[index] -= alpha * w[index]; 83 | rrn_temp += r[index] * r[index]; 84 | } 85 | } 86 | 87 | *rrn += rrn_temp; 88 | } 89 | 90 | // Calculates p 91 | void cg_calc_p(const int x, const int y, const int halo_depth, const double beta, double *p, const double 
*r) { 92 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 93 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 94 | const int index = kk + jj * x; 95 | 96 | p[index] = beta * p[index] + r[index]; 97 | } 98 | } 99 | } 100 | 101 | // CG solver kernels 102 | void run_cg_init(Chunk *chunk, Settings &settings, double rx, double ry, double *rro) { 103 | START_PROFILING(settings.kernel_profile); 104 | cg_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, rro, chunk->density, chunk->energy, chunk->u, chunk->p, 105 | chunk->r, chunk->w, chunk->kx, chunk->ky); 106 | STOP_PROFILING(settings.kernel_profile, __func__); 107 | } 108 | 109 | void run_cg_calc_w(Chunk *chunk, Settings &settings, double *pw) { 110 | START_PROFILING(settings.kernel_profile); 111 | cg_calc_w(chunk->x, chunk->y, settings.halo_depth, pw, chunk->p, chunk->w, chunk->kx, chunk->ky); 112 | STOP_PROFILING(settings.kernel_profile, __func__); 113 | } 114 | 115 | void run_cg_calc_ur(Chunk *chunk, Settings &settings, double alpha, double *rrn) { 116 | START_PROFILING(settings.kernel_profile); 117 | cg_calc_ur(chunk->x, chunk->y, settings.halo_depth, alpha, rrn, chunk->u, chunk->p, chunk->r, chunk->w); 118 | STOP_PROFILING(settings.kernel_profile, __func__); 119 | } 120 | 121 | void run_cg_calc_p(Chunk *chunk, Settings &settings, double beta) { 122 | START_PROFILING(settings.kernel_profile); 123 | cg_calc_p(chunk->x, chunk->y, settings.halo_depth, beta, chunk->p, chunk->r); 124 | STOP_PROFILING(settings.kernel_profile, __func__); 125 | } -------------------------------------------------------------------------------- /src/serial/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * CHEBYSHEV SOLVER KERNEL 6 | */ 7 | 8 | // Calculates the new value for u. 
9 | void cheby_calc_u(const int x, const int y, const int halo_depth, double *u, const double *p) { 10 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 11 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 12 | const int index = kk + jj * x; 13 | u[index] += p[index]; 14 | } 15 | } 16 | } 17 | 18 | // Initialises the Chebyshev solver 19 | void cheby_init(const int x, const int y, const int halo_depth, const double theta, double *u, const double *u0, double *p, double *r, 20 | double *w, const double *kx, const double *ky) { 21 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 22 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 23 | const int index = kk + jj * x; 24 | const double smvp = tealeaf_SMVP(u); 25 | w[index] = smvp; 26 | r[index] = u0[index] - w[index]; 27 | p[index] = r[index] / theta; 28 | } 29 | } 30 | 31 | cheby_calc_u(x, y, halo_depth, u, p); 32 | } 33 | 34 | // The main chebyshev iteration 35 | void cheby_iterate(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, const double *u0, double *p, 36 | double *r, double *w, const double *kx, const double *ky) { 37 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 38 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 39 | const int index = kk + jj * x; 40 | const double smvp = tealeaf_SMVP(u); 41 | w[index] = smvp; 42 | r[index] = u0[index] - w[index]; 43 | p[index] = alpha * p[index] + beta * r[index]; 44 | } 45 | } 46 | 47 | cheby_calc_u(x, y, halo_depth, u, p); 48 | } 49 | 50 | // Chebyshev solver kernels 51 | void run_cheby_init(Chunk *chunk, Settings &settings) { 52 | START_PROFILING(settings.kernel_profile); 53 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 54 | chunk->ky); 55 | STOP_PROFILING(settings.kernel_profile, __func__); 56 | } 57 | 58 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 59 | 
START_PROFILING(settings.kernel_profile); 60 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 61 | chunk->ky); 62 | STOP_PROFILING(settings.kernel_profile, __func__); 63 | } 64 | -------------------------------------------------------------------------------- /src/serial/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | struct ChunkExtension {}; 6 | -------------------------------------------------------------------------------- /src/serial/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "settings.h" 3 | #include "shared.h" 4 | #include <cmath> // for std::fabs in jacobi_iterate 5 | 6 | /* 7 | * JACOBI SOLVER KERNEL 8 | */ 9 | 10 | // Initialises the Jacobi solver 11 | void jacobi_init(const int x, const int y, const int halo_depth, const int coefficient, double rx, double ry, const double *density, 12 | const double *energy, double *u0, double *u, double *kx, double *ky) { 13 | if (coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { // was '<': only under-range values were rejected; match cg_init's check 14 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 15 | } 16 | 17 | for (int jj = 1; jj < y - 1; ++jj) { 18 | for (int kk = 1; kk < x - 1; ++kk) { 19 | const int index = kk + jj * x; 20 | double temp = energy[index] * density[index]; 21 | u0[index] = temp; 22 | u[index] = temp; 23 | } 24 | } 25 | 26 | for (int jj = halo_depth; jj < y - 1; ++jj) { 27 | for (int kk = halo_depth; kk < x - 1; ++kk) { 28 | const int index = kk + jj * x; 29 | double densityCentre = (coefficient == CONDUCTIVITY) ? density[index] : 1.0 / density[index]; 30 | double densityLeft = (coefficient == CONDUCTIVITY) ? density[index - 1] : 1.0 / density[index - 1]; 31 | double densityDown = (coefficient == CONDUCTIVITY) ?
density[index - x] : 1.0 / density[index - x]; 32 | 33 | kx[index] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 34 | ky[index] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 35 | } 36 | } 37 | } 38 | 39 | // The main Jacobi solve step 40 | void jacobi_iterate(const int x, const int y, const int halo_depth, double *error, const double *kx, const double *ky, const double *u0, 41 | double *u, double *r) { 42 | for (int jj = 0; jj < y; ++jj) { 43 | for (int kk = 0; kk < x; ++kk) { 44 | const int index = kk + jj * x; 45 | r[index] = u[index]; 46 | } 47 | } 48 | 49 | double err = 0.0; 50 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 51 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 52 | const int index = kk + jj * x; 53 | u[index] = (u0[index] + (kx[index + 1] * r[index + 1] + kx[index] * r[index - 1]) + 54 | (ky[index + x] * r[index + x] + ky[index] * r[index - x])) / 55 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 56 | 57 | err += std::fabs(u[index] - r[index]); 58 | } 59 | } 60 | 61 | *error = err; 62 | } 63 | 64 | // Jacobi solver kernels 65 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 66 | START_PROFILING(settings.kernel_profile); 67 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, chunk->density, chunk->energy, chunk->u0, chunk->u, 68 | chunk->kx, chunk->ky); 69 | STOP_PROFILING(settings.kernel_profile, __func__); 70 | } 71 | 72 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 73 | START_PROFILING(settings.kernel_profile); 74 | jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, error, chunk->kx, chunk->ky, chunk->u0, chunk->u, chunk->r); 75 | STOP_PROFILING(settings.kernel_profile, __func__); 76 | } -------------------------------------------------------------------------------- /src/serial/local_halos.cpp: 
-------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | // Update left halo. 5 | void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 6 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 7 | for (int kk = 0; kk < depth; ++kk) { 8 | int base = jj * x; 9 | buffer[base + (halo_depth - kk - 1)] = buffer[base + (halo_depth + kk)]; 10 | } 11 | } 12 | } 13 | 14 | // Update right halo. 15 | void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 16 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 17 | for (int kk = 0; kk < depth; ++kk) { 18 | int base = jj * x; 19 | buffer[base + (x - halo_depth + kk)] = buffer[base + (x - halo_depth - 1 - kk)]; 20 | } 21 | } 22 | } 23 | 24 | // Update top halo. 25 | void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 26 | for (int jj = 0; jj < depth; ++jj) { 27 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 28 | int base = kk; 29 | buffer[base + (y - halo_depth + jj) * x] = buffer[base + (y - halo_depth - 1 - jj) * x]; 30 | } 31 | } 32 | } 33 | 34 | // Updates bottom halo. 35 | void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 36 | for (int jj = 0; jj < depth; ++jj) { 37 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 38 | int base = kk; 39 | buffer[base + (halo_depth - jj - 1) * x] = buffer[base + (halo_depth + jj) * x]; 40 | } 41 | } 42 | } 43 | 44 | // Updates faces in turn. 
45 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer) { 46 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 47 | update_left(x, y, halo_depth, depth, buffer); 48 | } 49 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 50 | update_right(x, y, halo_depth, depth, buffer); 51 | } 52 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 53 | update_top(x, y, halo_depth, depth, buffer); 54 | } 55 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 56 | update_bottom(x, y, halo_depth, depth, buffer); 57 | } 58 | } 59 | 60 | // The kernel for updating halos locally 61 | void local_halos(int x, int y, int depth, int halo_depth, const int *chunk_neighbours, const bool *fields_to_exchange, double *density, 62 | double *energy0, double *energy, double *u, double *p, double *sd) { 63 | if (fields_to_exchange[FIELD_DENSITY]) { 64 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 65 | } 66 | if (fields_to_exchange[FIELD_P]) { 67 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 68 | } 69 | if (fields_to_exchange[FIELD_ENERGY0]) { 70 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 71 | } 72 | if (fields_to_exchange[FIELD_ENERGY1]) { 73 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 74 | } 75 | if (fields_to_exchange[FIELD_U]) { 76 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 77 | } 78 | if (fields_to_exchange[FIELD_SD]) { 79 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 80 | } 81 | } 82 | 83 | // Solver-wide kernels 84 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 85 | START_PROFILING(settings.kernel_profile); 86 | local_halos(chunk->x, chunk->y, depth, settings.halo_depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 87 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd); 88 | STOP_PROFILING(settings.kernel_profile, __func__); 89 | } 
90 | -------------------------------------------------------------------------------- /src/serial/model.cmake: -------------------------------------------------------------------------------- 1 | macro(setup) 2 | set(CMAKE_CXX_STANDARD 17) 3 | endmacro() -------------------------------------------------------------------------------- /src/serial/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * PPCG SOLVER KERNEL 6 | */ 7 | 8 | // Initialises the PPCG solver 9 | void ppcg_init(const int x, const int y, const int halo_depth, double theta, const double *r, double *sd) { 10 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 11 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 12 | const int index = kk + jj * x; 13 | sd[index] = r[index] / theta; 14 | } 15 | } 16 | } 17 | 18 | // The PPCG inner iteration 19 | void ppcg_inner_iteration(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, double *r, const double *kx, 20 | const double *ky, double *sd) { 21 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 22 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 23 | const int index = kk + jj * x; 24 | const double smvp = tealeaf_SMVP(sd); 25 | r[index] -= smvp; 26 | u[index] += sd[index]; 27 | } 28 | } 29 | 30 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 31 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 32 | const int index = kk + jj * x; 33 | sd[index] = alpha * sd[index] + beta * r[index]; 34 | } 35 | } 36 | } 37 | 38 | // PPCG solver kernels 39 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 40 | START_PROFILING(settings.kernel_profile); 41 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 42 | STOP_PROFILING(settings.kernel_profile, __func__); 43 | } 44 | 45 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, 
double beta) { 46 | START_PROFILING(settings.kernel_profile); 47 | ppcg_inner_iteration(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->r, chunk->kx, chunk->ky, chunk->sd); 48 | STOP_PROFILING(settings.kernel_profile, __func__); 49 | } 50 | -------------------------------------------------------------------------------- /src/serial/solver_methods.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * SHARED SOLVER METHODS 6 | */ 7 | 8 | // The field summary kernel 9 | void field_summary(const int x, const int y, const int halo_depth, const double *volume, const double *density, const double *energy0, 10 | double *u, double *volOut, double *massOut, double *ieOut, double *tempOut) { 11 | double vol = 0.0; 12 | double ie = 0.0; 13 | double temp = 0.0; 14 | double mass = 0.0; 15 | 16 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 17 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 18 | const int index = kk + jj * x; 19 | double cellVol = volume[index]; 20 | double cellMass = cellVol * density[index]; 21 | vol += cellVol; 22 | mass += cellMass; 23 | ie += cellMass * energy0[index]; 24 | temp += cellMass * u[index]; 25 | } 26 | } 27 | 28 | *volOut += vol; 29 | *ieOut += ie; 30 | *tempOut += temp; 31 | *massOut += mass; 32 | } 33 | 34 | // Store original energy state 35 | void store_energy(int x, int y, const double *energy0, double *energy) { 36 | for (int ii = 0; ii < x * y; ++ii) { 37 | energy[ii] = energy0[ii]; 38 | } 39 | } 40 | 41 | // Copies the current u into u0 42 | void copy_u(const int x, const int y, const int halo_depth, double *u0, const double *u) { 43 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 44 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 45 | const int index = kk + jj * x; 46 | u0[index] = u[index]; 47 | } 48 | } 49 | } 50 | 51 | // Calculates the current value of r 52 | void 
calculate_residual(const int x, const int y, const int halo_depth, const double *u, const double *u0, double *r, const double *kx, 53 | const double *ky) { 54 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 55 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 56 | const int index = kk + jj * x; 57 | const double smvp = tealeaf_SMVP(u); 58 | r[index] = u0[index] - smvp; 59 | } 60 | } 61 | } 62 | 63 | // Calculates the 2 norm of a given buffer 64 | void calculate_2norm(const int x, const int y, const int halo_depth, const double *buffer, double *norm) { 65 | double norm_temp = 0.0; 66 | 67 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 68 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 69 | const int index = kk + jj * x; 70 | norm_temp += buffer[index] * buffer[index]; 71 | } 72 | } 73 | 74 | *norm += norm_temp; 75 | } 76 | 77 | // Finalises the solution 78 | void finalise(const int x, const int y, const int halo_depth, double *energy, const double *density, const double *u) { 79 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 80 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 81 | const int index = kk + jj * x; 82 | energy[index] = u[index] / density[index]; 83 | } 84 | } 85 | } 86 | 87 | void run_store_energy(Chunk *chunk, Settings &settings) { 88 | START_PROFILING(settings.kernel_profile); 89 | store_energy(chunk->x, chunk->y, chunk->energy0, chunk->energy); 90 | STOP_PROFILING(settings.kernel_profile, __func__); 91 | } 92 | 93 | void run_field_summary(Chunk *chunk, Settings &settings, double *vol, double *mass, double *ie, double *temp) { 94 | START_PROFILING(settings.kernel_profile); 95 | field_summary(chunk->x, chunk->y, settings.halo_depth, chunk->volume, chunk->density, chunk->energy0, chunk->u, vol, mass, ie, temp); 96 | STOP_PROFILING(settings.kernel_profile, __func__); 97 | } 98 | 99 | // Shared solver kernels 100 | void run_copy_u(Chunk *chunk, Settings &settings) { 101 | 
START_PROFILING(settings.kernel_profile); 102 | copy_u(chunk->x, chunk->y, settings.halo_depth, chunk->u0, chunk->u); 103 | STOP_PROFILING(settings.kernel_profile, __func__); 104 | } 105 | 106 | void run_calculate_residual(Chunk *chunk, Settings &settings) { 107 | START_PROFILING(settings.kernel_profile); 108 | calculate_residual(chunk->x, chunk->y, settings.halo_depth, chunk->u, chunk->u0, chunk->r, chunk->kx, chunk->ky); 109 | STOP_PROFILING(settings.kernel_profile, __func__); 110 | } 111 | 112 | void run_calculate_2norm(Chunk *chunk, Settings &settings, double *buffer, double *norm) { 113 | START_PROFILING(settings.kernel_profile); 114 | calculate_2norm(chunk->x, chunk->y, settings.halo_depth, buffer, norm); 115 | STOP_PROFILING(settings.kernel_profile, __func__); 116 | } 117 | 118 | void run_finalise(Chunk *chunk, Settings &settings) { 119 | START_PROFILING(settings.kernel_profile); 120 | finalise(chunk->x, chunk->y, settings.halo_depth, chunk->energy, chunk->density, chunk->u); 121 | STOP_PROFILING(settings.kernel_profile, __func__); 122 | } 123 | -------------------------------------------------------------------------------- /src/std-indices/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "dpl_shim.h" 3 | #include "ranged.h" 4 | #include "shared.h" 5 | #include "std_shared.h" 6 | /* 7 | * CHEBYSHEV SOLVER KERNEL 8 | */ 9 | 10 | // Calculates the new value for u. 
11 | void cheby_calc_u(const int x, // 12 | const int y, // 13 | const int halo_depth, // 14 | double *u, // 15 | const double *p) { 16 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 17 | ranged it(0, range.sizeXY()); 18 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 19 | const int index = range.restore(i, x); 20 | u[index] += p[index]; 21 | }); 22 | } 23 | 24 | // Initialises the Chebyshev solver 25 | void cheby_init(const int x, // 26 | const int y, // 27 | const int halo_depth, // 28 | const double theta, // 29 | double *u, // 30 | const double *u0, // 31 | double *p, // 32 | double *r, // 33 | double *w, // 34 | const double *kx, // 35 | const double *ky) { 36 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 37 | ranged it(0, range.sizeXY()); 38 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 39 | const int index = range.restore(i, x); 40 | const double smvp = tealeaf_SMVP(u); 41 | w[index] = smvp; 42 | r[index] = u0[index] - w[index]; 43 | p[index] = r[index] / theta; 44 | }); 45 | 46 | cheby_calc_u(x, y, halo_depth, u, p); 47 | } 48 | 49 | // The main chebyshev iteration 50 | void cheby_iterate(const int x, // 51 | const int y, // 52 | const int halo_depth, // 53 | double alpha, // 54 | double beta, // 55 | double *u, // 56 | const double *u0, // 57 | double *p, // 58 | double *r, // 59 | double *w, // 60 | const double *kx, // 61 | const double *ky) { 62 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 63 | ranged it(0, range.sizeXY()); 64 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 65 | const int index = range.restore(i, x); 66 | const double smvp = tealeaf_SMVP(u); 67 | w[index] = smvp; 68 | r[index] = u0[index] - w[index]; 69 | p[index] = alpha * p[index] + beta * r[index]; 70 | }); 71 | 72 | cheby_calc_u(x, y, halo_depth, u, p); 73 | } 74 | 75 | // Chebyshev solver kernels 76 | void run_cheby_init(Chunk *chunk, Settings 
&settings) { 77 | START_PROFILING(settings.kernel_profile); 78 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 79 | chunk->ky); 80 | STOP_PROFILING(settings.kernel_profile, __func__); 81 | } 82 | 83 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 84 | START_PROFILING(settings.kernel_profile); 85 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 86 | chunk->ky); 87 | STOP_PROFILING(settings.kernel_profile, __func__); 88 | } 89 | -------------------------------------------------------------------------------- /src/std-indices/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | struct ChunkExtension {}; 6 | -------------------------------------------------------------------------------- /src/std-indices/dpl_shim.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef USE_ONEDPL 7 | 8 | // oneDPL C++17 PSTL 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #if ONEDPL_USE_DPCPP_BACKEND 15 | 16 | #include 17 | 18 | const static auto EXEC_POLICY = 19 | oneapi::dpl::execution::device_policy<>{oneapi::dpl::execution::make_device_policy(oneapi::dpl::execution::dpcpp_default)}; 20 | 21 | template T *alloc_raw(size_t size) { return sycl::malloc_shared(size, EXEC_POLICY.queue()); } 22 | 23 | template void dealloc_raw(T *ptr) { sycl::free(ptr, EXEC_POLICY.queue()); } 24 | 25 | #else 26 | 27 | // auto exe_policy = dpl::execution::seq; 28 | // auto exe_policy = dpl::execution::par; 29 | static constexpr auto EXEC_POLICY = dpl::execution::par_unseq; 30 | #define USE_STD_PTR_ALLOC_DEALLOC 31 | 32 | #endif 33 | 34 | #else 35 | 36 | // Normal 
C++17 PSTL 37 | 38 | #include 39 | #include 40 | #include 41 | 42 | // static auto EXEC_POLICY = std::execution::seq; 43 | // auto exe_policy = std::execution::par; 44 | static constexpr auto EXEC_POLICY = std::execution::par_unseq; 45 | #define USE_STD_PTR_ALLOC_DEALLOC 46 | 47 | #endif 48 | 49 | #ifdef USE_STD_PTR_ALLOC_DEALLOC 50 | 51 | template T *alloc_raw(size_t size) { return static_cast(std::malloc(size * sizeof(T))); } 52 | template void dealloc_raw(T *ptr) { std::free(ptr); } 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/std-indices/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "dpl_shim.h" 3 | #include "ranged.h" 4 | #include "shared.h" 5 | #include "std_shared.h" 6 | #include <cmath> // for fabs in jacobi_iterate 7 | 8 | /* 9 | * JACOBI SOLVER KERNEL 10 | */ 11 | 12 | // Initialises the Jacobi solver 13 | void jacobi_init(const int x, // 14 | const int y, // 15 | const int halo_depth, // 16 | const int coefficient, // 17 | double rx, // 18 | double ry, // 19 | const double *density, // 20 | const double *energy, // 21 | double *u0, // 22 | double *u, // 23 | double *kx, // 24 | double *ky) { 25 | if (coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { // was '<': only under-range values were rejected; match cg_init's check 26 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 27 | } 28 | Range2d range(1, 1, x - 1, y - 1); 29 | ranged it(0, range.sizeXY()); 30 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 31 | const int index = range.restore(i, x); 32 | double temp = energy[index] * density[index]; 33 | u0[index] = temp; 34 | u[index] = temp; 35 | }); 36 | 37 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 38 | const int index = range.restore(i, x); 39 | double densityCentre = (coefficient == CONDUCTIVITY) ? density[index] : 1.0 / density[index]; 40 | double densityLeft = (coefficient == CONDUCTIVITY) ?
density[index - 1] : 1.0 / density[index - 1]; 41 | double densityDown = (coefficient == CONDUCTIVITY) ? density[index - x] : 1.0 / density[index - x]; 42 | 43 | kx[index] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 44 | ky[index] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 45 | }); 46 | } 47 | 48 | // The main Jacobi solve step 49 | void jacobi_iterate(const int x, // 50 | const int y, // 51 | const int halo_depth, // 52 | double *error, // 53 | const double *kx, // 54 | const double *ky, // 55 | const double *u0, // 56 | double *u, // 57 | double *r) { 58 | 59 | { 60 | Range2d range(0, 0, x, y); 61 | ranged it(0, range.sizeXY()); 62 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 63 | const int index = range.restore(i, x); 64 | r[index] = u[index]; 65 | }); 66 | } 67 | 68 | { 69 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 70 | ranged it(0, range.sizeXY()); 71 | *error = std::transform_reduce(EXEC_POLICY, it.begin(), it.end(), 0.0, std::plus<>(), [=](int i) { 72 | const int index = range.restore(i, x); 73 | u[index] = (u0[index] + (kx[index + 1] * r[index + 1] + kx[index] * r[index - 1]) + 74 | (ky[index + x] * r[index + x] + ky[index] * r[index - x])) / 75 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 76 | 77 | return fabs(u[index] - r[index]); 78 | }); 79 | } 80 | } 81 | 82 | // Jacobi solver kernels 83 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 84 | START_PROFILING(settings.kernel_profile); 85 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, chunk->density, chunk->energy, chunk->u0, chunk->u, 86 | chunk->kx, chunk->ky); 87 | STOP_PROFILING(settings.kernel_profile, __func__); 88 | } 89 | 90 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 91 | START_PROFILING(settings.kernel_profile); 92 | jacobi_iterate(chunk->x, chunk->y, 
settings.halo_depth, error, chunk->kx, chunk->ky, chunk->u0, chunk->u, chunk->r); 93 | STOP_PROFILING(settings.kernel_profile, __func__); 94 | } -------------------------------------------------------------------------------- /src/std-indices/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection" 4 | "c++") 5 | 6 | register_flag_optional(NVHPC_OFFLOAD 7 | "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. 8 | The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) 9 | 10 | Possible values are: 11 | cc35 - Compile for compute capability 3.5 12 | cc50 - Compile for compute capability 5.0 13 | cc60 - Compile for compute capability 6.0 14 | cc62 - Compile for compute capability 6.2 15 | cc70 - Compile for compute capability 7.0 16 | cc72 - Compile for compute capability 7.2 17 | cc75 - Compile for compute capability 7.5 18 | cc80 - Compile for compute capability 8.0 19 | ccall - Compile for all supported compute capabilities" 20 | "") 21 | 22 | register_flag_optional(USE_TBB 23 | "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 24 | "OFF") 25 | 26 | register_flag_optional(USE_ONEDPL 27 | "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. 28 | 29 | Possible values are: 30 | OPENMP - Implements policies using OpenMP. 31 | CMake will handle any flags needed to enable OpenMP if the compiler supports it. 32 | TBB - Implements policies using TBB. 33 | TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. 34 | DPCPP - Implements policies through SYCL2020. 35 | This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
36 | "OFF") 37 | 38 | macro(setup) 39 | set(CMAKE_CXX_STANDARD 17) # because SYCL oneDPL is C++17, NVHPC isn't bound by this 40 | 41 | if (USE_TBB) 42 | register_link_library(TBB::tbb) 43 | endif () 44 | 45 | if (USE_ONEDPL) 46 | register_definitions(USE_ONEDPL) 47 | register_link_library(oneDPL) 48 | endif () 49 | 50 | if (NVHPC_OFFLOAD) 51 | set(NVHPC_FLAGS 52 | -stdpar 53 | -gpu=${NVHPC_OFFLOAD},fastmath,keep 54 | --restrict 55 | -Mfpapprox 56 | -Mfprelaxed 57 | -Mllvm-fast 58 | -Ktrap=none 59 | -Minfo=accel -Minfo=stdpar) 60 | # propagate flags to linker so that it links with the gpu stuff as well 61 | register_append_cxx_flags(ANY ${NVHPC_FLAGS}) 62 | register_append_link_flags(${NVHPC_FLAGS}) 63 | endif () 64 | 65 | 66 | endmacro() -------------------------------------------------------------------------------- /src/std-indices/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "dpl_shim.h" 3 | #include "ranged.h" 4 | #include "shared.h" 5 | #include "std_shared.h" 6 | /* 7 | * PPCG SOLVER KERNEL 8 | */ 9 | 10 | // Initialises the PPCG solver 11 | void ppcg_init(const int x, // 12 | const int y, // 13 | const int halo_depth, // 14 | double theta, // 15 | const double *r, // 16 | double *sd) { 17 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 18 | ranged it(0, range.sizeXY()); 19 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 20 | const int index = range.restore(i, x); 21 | sd[index] = r[index] / theta; 22 | }); 23 | } 24 | 25 | // The PPCG inner iteration 26 | void ppcg_inner_iteration(const int x, // 27 | const int y, // 28 | const int halo_depth, // 29 | double alpha, // 30 | double beta, // 31 | double *u, // 32 | double *r, // 33 | const double *kx, // 34 | const double *ky, // 35 | double *sd) { 36 | 37 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 38 | ranged it(0, range.sizeXY()); 39 | 40 | 
std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 41 | const int index = range.restore(i, x); 42 | const double smvp = tealeaf_SMVP(sd); 43 | r[index] -= smvp; 44 | u[index] += sd[index]; 45 | }); 46 | 47 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 48 | const int index = range.restore(i, x); 49 | sd[index] = alpha * sd[index] + beta * r[index]; 50 | }); 51 | } 52 | 53 | // PPCG solver kernels 54 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 55 | START_PROFILING(settings.kernel_profile); 56 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 57 | STOP_PROFILING(settings.kernel_profile, __func__); 58 | } 59 | 60 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 61 | START_PROFILING(settings.kernel_profile); 62 | ppcg_inner_iteration(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->r, chunk->kx, chunk->ky, chunk->sd); 63 | STOP_PROFILING(settings.kernel_profile, __func__); 64 | } 65 | -------------------------------------------------------------------------------- /src/std-indices/ranged.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // A lightweight counting iterator which will be used by the STL algorithms 4 | // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this 5 | // implementation doesn't target 6 | template class ranged { 7 | public: 8 | class iterator { 9 | friend class ranged; 10 | 11 | public: 12 | using difference_type = N; 13 | using value_type = N; 14 | using pointer = const N *; 15 | using reference = N; 16 | using iterator_category = std::random_access_iterator_tag; 17 | 18 | // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. 19 | // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. 
20 | // This type is unused for any other STL impl. 21 | using is_passed_directly = std::true_type; 22 | 23 | reference operator*() const { return i_; } 24 | 25 | iterator &operator++() { 26 | ++i_; 27 | return *this; 28 | } 29 | 30 | iterator operator++(int) { 31 | iterator copy(*this); 32 | ++i_; 33 | return copy; 34 | } 35 | 36 | iterator &operator--() { 37 | --i_; 38 | return *this; 39 | } 40 | 41 | iterator operator--(int) { 42 | iterator copy(*this); 43 | --i_; 44 | return copy; 45 | } 46 | 47 | iterator &operator+=(N by) { 48 | i_ += by; 49 | return *this; 50 | } 51 | 52 | value_type operator[](const difference_type &i) const { return i_ + i; } 53 | 54 | difference_type operator-(const iterator &it) const { return i_ - it.i_; } 55 | 56 | iterator operator+(const value_type v) const { return iterator(i_ + v); } 57 | 58 | bool operator==(const iterator &other) const { return i_ == other.i_; } 59 | 60 | bool operator!=(const iterator &other) const { return i_ != other.i_; } 61 | 62 | bool operator<(const iterator &other) const { return i_ < other.i_; } 63 | 64 | protected: 65 | explicit iterator(N start) : i_(start) {} 66 | 67 | private: 68 | N i_; 69 | }; 70 | 71 | [[nodiscard]] iterator begin() const { return begin_; } 72 | 73 | [[nodiscard]] iterator end() const { return end_; } 74 | 75 | ranged(N begin, N end) : begin_(begin), end_(end) {} 76 | 77 | private: 78 | iterator begin_; 79 | iterator end_; 80 | }; -------------------------------------------------------------------------------- /src/std-indices/std_shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template struct Range2d { 7 | const N fromX, toX; 8 | const N fromY, toY; 9 | 10 | constexpr inline Range2d(N fromX, N fromY, N toX, N toY) : fromX(fromX), toX(toX), fromY(fromY), toY(toY) { 11 | assert(fromX < toX); 12 | assert(fromY < toY); 13 | assert(sizeX() >= 0); 14 | assert(sizeY() >= 0); 15 | } 16 | 
[[nodiscard]] constexpr inline N sizeX() const { return toX - fromX; } 17 | [[nodiscard]] constexpr inline N sizeY() const { return toY - fromY; } 18 | [[nodiscard]] constexpr inline N sizeXY() const { return sizeX() * sizeY(); } 19 | 20 | constexpr inline N restore(N i, N xLimit) const { 21 | const int jj = (i / sizeX()) + fromX; 22 | const int kk = (i % sizeX()) + fromY; 23 | return kk + jj * xLimit; 24 | } 25 | 26 | friend std::ostream &operator<<(std::ostream &os, const Range2d &d) { 27 | os << "Range2d{" 28 | << " X[" << d.fromX << "->" << d.toX << " (" << d.sizeX() << ")]" 29 | << " Y[" << d.fromY << "->" << d.toY << " (" << d.sizeY() << ")]" 30 | << "}"; 31 | return os; 32 | } 33 | }; -------------------------------------------------------------------------------- /src/sycl-acc/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | using namespace cl; 6 | 7 | using FieldBufferType = sycl::buffer *; 8 | using StagingBufferType = sycl::buffer *; 9 | 10 | struct ChunkExtension { 11 | sycl::queue *device_queue; 12 | }; 13 | -------------------------------------------------------------------------------- /src/sycl-acc/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" 4 | "c++") 5 | 6 | register_flag_required(SYCL_COMPILER 7 | "Compile using the specified SYCL compiler implementation 8 | Supported values are 9 | ONEAPI-ICPX - icpx as a standalone compiler 10 | ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) 11 | DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) 12 | HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) 13 | COMPUTECPP - ComputeCpp compiler 
(https://developer.codeplay.com/products/computecpp/ce/home)") 14 | 15 | register_flag_optional(SYCL_COMPILER_DIR 16 | "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: 17 | ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) 18 | ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 19 | HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" 20 | "") 21 | 22 | register_flag_optional(USE_HOSTTASK 23 | "Whether to use SYCL2020 host_task for MPI related calls or fallback to queue.wait() not all SYCL compilers support this" 24 | "OFF") 25 | 26 | 27 | register_flag_optional(OpenCL_LIBRARY 28 | "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" 29 | "${OpenCL_LIBRARY}") 30 | 31 | macro(setup) 32 | set(CMAKE_CXX_STANDARD 17) 33 | 34 | if (USE_HOSTTASK) 35 | register_definitions(USE_HOSTTASK) 36 | endif () 37 | 38 | 39 | if (${SYCL_COMPILER} STREQUAL "HIPSYCL") 40 | 41 | 42 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) 43 | 44 | if (NOT EXISTS "${hipSYCL_DIR}") 45 | message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") 46 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) 47 | endif () 48 | if (NOT EXISTS "${hipSYCL_DIR}") 49 | message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") 50 | endif () 51 | 52 | # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) 53 | find_package(hipSYCL CONFIG REQUIRED) 54 | message(STATUS "ok") 55 | 56 | elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") 57 | 58 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) 59 | set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) 60 | 61 | # don't point to the CL dir as the imports already have the CL prefix 62 | set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") 63 | 64 | 
register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) 65 | # ComputeCpp needs OpenCL 66 | find_package(ComputeCpp REQUIRED) 67 | 68 | # this must come after FindComputeCpp (!) 69 | set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) 70 | 71 | elseif (${SYCL_COMPILER} STREQUAL "DPCPP") 72 | set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) 73 | include_directories(${SYCL_COMPILER_DIR}/include/sycl) 74 | register_append_cxx_flags(ANY -fsycl) 75 | register_append_link_flags(-fsycl) 76 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") 77 | set(CMAKE_CXX_COMPILER icpx) 78 | set(CMAKE_C_COMPILER icx) 79 | register_append_cxx_flags(ANY -fsycl) 80 | register_append_link_flags(-fsycl) 81 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") 82 | set(CMAKE_CXX_COMPILER clang++) 83 | set(CMAKE_C_COMPILER clang) 84 | register_append_cxx_flags(ANY -fsycl) 85 | register_append_link_flags(-fsycl) 86 | else () 87 | message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") 88 | endif () 89 | 90 | endmacro() 91 | 92 | 93 | macro(setup_target NAME) 94 | if ( 95 | (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR 96 | (${SYCL_COMPILER} STREQUAL "HIPSYCL")) 97 | # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their 98 | # own custom integration header flags AFTER the target has been specified 99 | # hence this macro here 100 | add_sycl_to_target( 101 | TARGET ${NAME} 102 | SOURCES ${IMPL_SOURCES}) 103 | endif () 104 | endmacro() 105 | -------------------------------------------------------------------------------- /src/sycl-acc/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include "sycl_shared.hpp" 4 | 5 | using namespace cl::sycl; 6 | 7 | // Initialises Sd 8 | void ppcg_init(const int x, const int y, const int halo_depth, const double theta, SyclBuffer &sdBuff, SyclBuffer &rBuff, 9 | queue &device_queue) { 10 | 
device_queue.submit([&](handler &h) { 11 | auto sd = sdBuff.get_access(h); 12 | auto r = rBuff.get_access(h); 13 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 14 | const auto kk = idx[0] % x; 15 | const auto jj = idx[0] / x; 16 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 17 | sd[idx[0]] = r[idx[0]] / theta; 18 | } 19 | }); 20 | }); 21 | #ifdef ENABLE_PROFILING 22 | device_queue.wait_and_throw(); 23 | #endif 24 | } 25 | 26 | // Calculates U and R 27 | void ppcg_calc_ur(const int x, const int y, const int halo_depth, SyclBuffer &sdBuff, SyclBuffer &rBuff, SyclBuffer &uBuff, 28 | SyclBuffer &kxBuff, SyclBuffer &kyBuff, queue &device_queue) { 29 | device_queue.submit([&](handler &h) { 30 | auto sd = sdBuff.get_access(h); 31 | auto r = rBuff.get_access(h); 32 | auto u = uBuff.get_access(h); 33 | auto kx = kxBuff.get_access(h); 34 | auto ky = kyBuff.get_access(h); 35 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 36 | const auto kk = idx[0] % x; 37 | const auto jj = idx[0] / x; 38 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 39 | // smvp uses kx and ky and index 40 | int index = idx[0]; 41 | const double smvp = tealeaf_SMVP(sd); 42 | r[idx[0]] -= smvp; 43 | u[idx[0]] += sd[idx[0]]; 44 | } 45 | }); 46 | }); 47 | #ifdef ENABLE_PROFILING 48 | device_queue.wait_and_throw(); 49 | #endif 50 | } 51 | 52 | // Calculates Sd 53 | void ppcg_calc_sd(const int x, const int y, const int halo_depth, const double theta, const double alpha, const double beta, 54 | SyclBuffer &sdBuff, SyclBuffer &rBuff, queue &device_queue) { 55 | device_queue.submit([&](handler &h) { 56 | auto sd = sdBuff.get_access(h); 57 | auto r = rBuff.get_access(h); 58 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 59 | const auto kk = idx[0] % x; 60 | const auto jj = idx[0] / x; 61 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 62 | sd[idx[0]] = alpha * sd[idx[0]] 
+ beta * r[idx[0]]; 63 | } 64 | }); 65 | }); 66 | #ifdef ENABLE_PROFILING 67 | device_queue.wait_and_throw(); 68 | #endif 69 | } 70 | 71 | // PPCG solver kernels 72 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 73 | START_PROFILING(settings.kernel_profile); 74 | 75 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, *(chunk->sd), *(chunk->r), *(chunk->ext->device_queue)); 76 | 77 | STOP_PROFILING(settings.kernel_profile, __func__); 78 | } 79 | 80 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 81 | START_PROFILING(settings.kernel_profile); 82 | 83 | ppcg_calc_ur(chunk->x, chunk->y, settings.halo_depth, *(chunk->sd), *(chunk->r), *(chunk->u), *(chunk->kx), *(chunk->ky), 84 | *(chunk->ext->device_queue)); 85 | 86 | ppcg_calc_sd(chunk->x, chunk->y, settings.halo_depth, chunk->theta, alpha, beta, *(chunk->sd), *(chunk->r), *(chunk->ext->device_queue)); 87 | 88 | STOP_PROFILING(settings.kernel_profile, __func__); 89 | } 90 | -------------------------------------------------------------------------------- /src/sycl-acc/sycl_shared.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | using namespace cl::sycl; 6 | 7 | using SyclBuffer = buffer; 8 | 9 | template inline auto reduction_shim(buffer &b, sycl::handler &h, T init, BinaryOp f) { 10 | #if defined(__HIPSYCL__) || defined(__OPENSYCL__) 11 | return sycl::reduction(b. 
template get_access(h), init, f); 12 | #else 13 | return sycl::reduction(b, h, init, f, sycl::property::reduction::initialize_to_identity()); 14 | #endif 15 | } -------------------------------------------------------------------------------- /src/sycl-usm/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include "sycl_shared.hpp" 4 | 5 | using namespace cl::sycl; 6 | 7 | // Initialises the Chebyshev solver 8 | void cheby_init(const int x, // 9 | const int y, // 10 | const int halo_depth, // 11 | const double theta, // 12 | SyclBuffer &p, // 13 | SyclBuffer &r, // 14 | SyclBuffer &u, // 15 | SyclBuffer &u0, // 16 | SyclBuffer &w, // 17 | SyclBuffer &kx, // 18 | SyclBuffer &ky, // 19 | queue &device_queue) { 20 | device_queue 21 | .submit([&](handler &h) { 22 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 23 | const auto kk = idx[0] % x; 24 | const auto jj = idx[0] / x; 25 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 26 | // smvp uses kx and ky and index 27 | int index = idx[0]; 28 | const double smvp = tealeaf_SMVP(u); 29 | w[idx[0]] = smvp; 30 | // could make w write only and then use smvp here 31 | r[idx[0]] = u0[idx[0]] - w[idx[0]]; 32 | p[idx[0]] = r[idx[0]] / theta; 33 | } 34 | }); 35 | }) 36 | .wait_and_throw(); 37 | } 38 | 39 | // Calculates U 40 | void cheby_calc_u(const int x, // 41 | const int y, // 42 | const int halo_depth, // 43 | SyclBuffer &p, // 44 | SyclBuffer &u, // 45 | queue &device_queue) { 46 | device_queue.submit([&](handler &h) { 47 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 48 | const auto kk = idx[0] % x; 49 | const auto jj = idx[0] / x; 50 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 51 | u[idx[0]] += p[idx[0]]; 52 | } 53 | }); 54 | }); 55 | #ifdef ENABLE_PROFILING 56 | device_queue.wait_and_throw(); 57 | #endif 58 | } 59 | 60 | // The main 
Cheby iteration step 61 | void cheby_iterate(const int x, // 62 | const int y, // 63 | const int halo_depth, // 64 | const double alpha, // 65 | const double beta, // 66 | SyclBuffer &p, // 67 | SyclBuffer &r, // 68 | SyclBuffer &u, // 69 | SyclBuffer &u0, // 70 | SyclBuffer &w, // 71 | SyclBuffer &kx, // 72 | SyclBuffer &ky, // 73 | queue &device_queue) { 74 | device_queue.submit([&](handler &h) { 75 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 76 | const auto kk = idx[0] % x; 77 | const auto jj = idx[0] / x; 78 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 79 | // smvp uses kx and ky and index 80 | int index = idx[0]; 81 | const double smvp = tealeaf_SMVP(u); 82 | w[index] = smvp; 83 | // could make w write only and then use smvp here 84 | r[index] = u0[index] - w[index]; 85 | p[index] = alpha * p[index] + beta * r[index]; 86 | } 87 | }); 88 | }); 89 | #ifdef ENABLE_PROFILING 90 | device_queue.wait_and_throw(); 91 | #endif 92 | } 93 | 94 | // Chebyshev solver kernels 95 | void run_cheby_init(Chunk *chunk, Settings &settings) { 96 | START_PROFILING(settings.kernel_profile); 97 | 98 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, (chunk->p), (chunk->r), (chunk->u), (chunk->u0), (chunk->w), 99 | (chunk->kx), (chunk->ky), *(chunk->ext->device_queue)); 100 | 101 | STOP_PROFILING(settings.kernel_profile, __func__); 102 | } 103 | 104 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 105 | START_PROFILING(settings.kernel_profile); 106 | 107 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, (chunk->p), (chunk->r), (chunk->u), (chunk->u0), (chunk->w), 108 | (chunk->kx), (chunk->ky), *(chunk->ext->device_queue)); 109 | 110 | cheby_calc_u(chunk->x, chunk->y, settings.halo_depth, (chunk->p), (chunk->u), *(chunk->ext->device_queue)); 111 | 112 | STOP_PROFILING(settings.kernel_profile, __func__); 113 | } 114 | 
-------------------------------------------------------------------------------- /src/sycl-usm/chunk_extension.h: --------------------------------------------------------------------------------
#pragma once

#include <CL/sycl.hpp>

using namespace cl;

// In the USM model, fields and staging areas are plain device pointers.
using FieldBufferType = double *;
using StagingBufferType = double *;

// Payload of the field-summary reduction; addition is component-wise.
struct Summary {
  double vol = 0.0;
  double mass = 0.0;
  double ie = 0.0;
  double temp = 0.0;
  [[nodiscard]] constexpr Summary operator+(const Summary &that) const { //
    return {vol + that.vol, mass + that.mass, ie + that.ie, temp + that.temp};
  }
};

// Per-chunk device state: the SYCL queue plus USM slots that receive the
// results of the various solver reductions.
struct ChunkExtension {
  sycl::queue *device_queue;
  double *reduction_cg_rro;
  double *reduction_cg_pw;
  double *reduction_cg_rrn;
  double *reduction_jacobi_error;
  double *reduction_norm;
  Summary *reduction_field_summary;
};
-------------------------------------------------------------------------------- /src/sycl-usm/jacobi.cpp: --------------------------------------------------------------------------------
#include "chunk.h"
#include "shared.h"
#include "sycl_shared.hpp"

using namespace cl::sycl;

// Initialises the Jacobi solver: u0 = u = energy*density on inner cells, and
// the conduction coefficients kx/ky from density (or its reciprocal when the
// coefficient is not CONDUCTIVITY).
void jacobi_init(const int x,          //
                 const int y,          //
                 const int halo_depth, //
                 const int coefficient, //
                 const double rx,      //
                 const double ry,      //
                 SyclBuffer &u,        //
                 SyclBuffer &u0,       //
                 SyclBuffer &density,  //
                 SyclBuffer &energy,   //
                 SyclBuffer &kx,       //
                 SyclBuffer &ky,       //
                 queue &device_queue) {
  device_queue
      .submit([&](handler &h) {
        h.parallel_for(range<1>(x * y), [=](id<1> idx) {
          const auto kk = idx[0] % x;
          const auto jj = idx[0] / x;
          if (kk > 0 && kk < x - 1 && jj > 0 && jj < y - 1) {
            u0[idx[0]] = energy[idx[0]] * density[idx[0]];
            u[idx[0]] = u0[idx[0]];
          }
          if (jj >= halo_depth && jj < y - 1 && kk >= halo_depth && kk < x - 1) {
            double
densityCentre = (coefficient == CONDUCTIVITY) ? density[idx[0]] : 1.0 / density[idx[0]]; 32 | double densityLeft = (coefficient == CONDUCTIVITY) ? density[idx[0] - 1] : 1.0 / density[idx[0] - 1]; 33 | double densityDown = (coefficient == CONDUCTIVITY) ? density[idx[0] - x] : 1.0 / density[idx[0] - x]; 34 | 35 | kx[idx[0]] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 36 | ky[idx[0]] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 37 | } 38 | }); 39 | }) 40 | .wait_and_throw(); 41 | #ifdef ENABLE_PROFILING 42 | device_queue.wait_and_throw(); 43 | #endif 44 | } 45 | 46 | // Main Jacobi solver method. 47 | void jacobi_iterate(const int x, // 48 | const int y, // 49 | const int halo_depth, // 50 | SyclBuffer &u, // 51 | SyclBuffer &u0, // 52 | SyclBuffer &r, // 53 | SyclBuffer &kx, // 54 | SyclBuffer &ky, // 55 | SyclBuffer &error_temp, // 56 | double *error, // 57 | queue &device_queue) { 58 | auto event = device_queue.submit([&](handler &h) { 59 | h.parallel_for( // 60 | range<1>(x * y), // 61 | reduction_shim(error_temp, *error, sycl::plus()), // 62 | [=](item<1> item, auto &acc) { 63 | const auto kk = item[0] % x; 64 | const auto jj = item[0] / x; 65 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 66 | u[item[0]] = (u0[item[0]] + (kx[item[0] + 1] * r[item[0] + 1] + kx[item[0]] * r[item[0] - 1]) + 67 | (ky[item[0] + x] * r[item[0] + x] + ky[item[0]] * r[item[0] - x])) / 68 | (1.0 + (kx[item[0]] + kx[item[0] + 1]) + (ky[item[0]] + ky[item[0] + x])); 69 | acc += ::fabs((u[item[0]] - r[item[0]])); // fabs is float version of abs 70 | } 71 | }); 72 | }); 73 | device_queue.copy(error_temp, error, 1, event).wait_and_throw(); 74 | #ifdef ENABLE_PROFILING 75 | device_queue.wait_and_throw(); 76 | #endif 77 | } 78 | 79 | // Copies u into r 80 | void jacobi_copy_u(const int x, // 81 | const int y, // 82 | SyclBuffer &r, // 83 | SyclBuffer &u, // 84 | queue &device_queue) { 85 
| device_queue.submit([&](handler &h) { h.parallel_for(range<1>(x * y), [=](id<1> idx) { r[idx[0]] = u[idx[0]]; }); }); 86 | #ifdef ENABLE_PROFILING 87 | device_queue.wait_and_throw(); 88 | #endif 89 | } 90 | 91 | // Jacobi solver kernels 92 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 93 | START_PROFILING(settings.kernel_profile); 94 | 95 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, (chunk->u), (chunk->u0), (chunk->density), 96 | (chunk->energy), (chunk->kx), (chunk->ky), *(chunk->ext->device_queue)); 97 | 98 | STOP_PROFILING(settings.kernel_profile, __func__); 99 | } 100 | 101 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 102 | START_PROFILING(settings.kernel_profile); 103 | 104 | jacobi_copy_u(chunk->x, chunk->y, (chunk->r), (chunk->u), *(chunk->ext->device_queue)); 105 | 106 | jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, (chunk->u), (chunk->u0), (chunk->r), (chunk->kx), (chunk->ky), 107 | (chunk->ext->reduction_jacobi_error), error, *(chunk->ext->device_queue)); 108 | 109 | STOP_PROFILING(settings.kernel_profile, __func__); 110 | } 111 | -------------------------------------------------------------------------------- /src/sycl-usm/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" 4 | "c++") 5 | 6 | register_flag_required(SYCL_COMPILER 7 | "Compile using the specified SYCL compiler implementation 8 | Supported values are 9 | ONEAPI-ICPX - icpx as a standalone compiler 10 | ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) 11 | DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) 12 | HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) 13 
| COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") 14 | 15 | register_flag_optional(SYCL_COMPILER_DIR 16 | "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: 17 | ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) 18 | ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 19 | HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" 20 | "") 21 | 22 | register_flag_optional(USE_HOSTTASK 23 | "Whether to use SYCL2020 host_task for MPI related calls or fallback to queue.wait() not all SYCL compilers support this" 24 | "OFF") 25 | 26 | 27 | register_flag_optional(OpenCL_LIBRARY 28 | "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" 29 | "${OpenCL_LIBRARY}") 30 | 31 | macro(setup) 32 | set(CMAKE_CXX_STANDARD 17) 33 | 34 | if (USE_HOSTTASK) 35 | register_definitions(USE_HOSTTASK) 36 | endif () 37 | 38 | 39 | if (${SYCL_COMPILER} STREQUAL "HIPSYCL") 40 | 41 | 42 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) 43 | 44 | if (NOT EXISTS "${hipSYCL_DIR}") 45 | message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") 46 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) 47 | endif () 48 | if (NOT EXISTS "${hipSYCL_DIR}") 49 | message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") 50 | endif () 51 | 52 | # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) 53 | find_package(hipSYCL CONFIG REQUIRED) 54 | message(STATUS "ok") 55 | 56 | elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") 57 | 58 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) 59 | set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) 60 | 61 | # don't point to the CL dir as the imports already have the CL prefix 62 | set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") 63 | 
64 | register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) 65 | # ComputeCpp needs OpenCL 66 | find_package(ComputeCpp REQUIRED) 67 | 68 | # this must come after FindComputeCpp (!) 69 | set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) 70 | 71 | elseif (${SYCL_COMPILER} STREQUAL "DPCPP") 72 | set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) 73 | include_directories(${SYCL_COMPILER_DIR}/include/sycl) 74 | register_append_cxx_flags(ANY -fsycl) 75 | register_append_link_flags(-fsycl) 76 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") 77 | set(CMAKE_CXX_COMPILER icpx) 78 | set(CMAKE_C_COMPILER icx) 79 | register_append_cxx_flags(ANY -fsycl) 80 | register_append_link_flags(-fsycl) 81 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") 82 | set(CMAKE_CXX_COMPILER clang++) 83 | set(CMAKE_C_COMPILER clang) 84 | register_append_cxx_flags(ANY -fsycl) 85 | register_append_link_flags(-fsycl) 86 | else () 87 | message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") 88 | endif () 89 | 90 | endmacro() 91 | 92 | 93 | macro(setup_target NAME) 94 | if ( 95 | (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR 96 | (${SYCL_COMPILER} STREQUAL "HIPSYCL")) 97 | # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their 98 | # own custom integration header flags AFTER the target has been specified 99 | # hence this macro here 100 | add_sycl_to_target( 101 | TARGET ${NAME} 102 | SOURCES ${IMPL_SOURCES}) 103 | endif () 104 | endmacro() 105 | -------------------------------------------------------------------------------- /src/sycl-usm/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include "sycl_shared.hpp" 4 | 5 | using namespace cl::sycl; 6 | 7 | // Initialises Sd 8 | void ppcg_init(const int x, // 9 | const int y, // 10 | const int halo_depth, // 11 | const double theta, // 12 | SyclBuffer &sd, // 13 | SyclBuffer &r, // 14 | 
queue &device_queue) { 15 | device_queue.submit([&](handler &h) { 16 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 17 | const auto kk = idx[0] % x; 18 | const auto jj = idx[0] / x; 19 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 20 | sd[idx[0]] = r[idx[0]] / theta; 21 | } 22 | }); 23 | }); 24 | #ifdef ENABLE_PROFILING 25 | device_queue.wait_and_throw(); 26 | #endif 27 | } 28 | 29 | // Calculates U and R 30 | void ppcg_calc_ur(const int x, // 31 | const int y, // 32 | const int halo_depth, // 33 | SyclBuffer &sd, // 34 | SyclBuffer &r, // 35 | SyclBuffer &u, // 36 | SyclBuffer &kx, // 37 | SyclBuffer &ky, // 38 | queue &device_queue) { 39 | device_queue.submit([&](handler &h) { 40 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 41 | const auto kk = idx[0] % x; 42 | const auto jj = idx[0] / x; 43 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 44 | // smvp uses kx and ky and index 45 | int index = idx[0]; 46 | const double smvp = tealeaf_SMVP(sd); 47 | r[idx[0]] -= smvp; 48 | u[idx[0]] += sd[idx[0]]; 49 | } 50 | }); 51 | }); 52 | #ifdef ENABLE_PROFILING 53 | device_queue.wait_and_throw(); 54 | #endif 55 | } 56 | 57 | // Calculates Sd 58 | void ppcg_calc_sd(const int x, // 59 | const int y, // 60 | const int halo_depth, // 61 | const double theta, // 62 | const double alpha, // 63 | const double beta, // 64 | SyclBuffer &sd, // 65 | SyclBuffer &r, // 66 | queue &device_queue) { 67 | device_queue.submit([&](handler &h) { 68 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 69 | const auto kk = idx[0] % x; 70 | const auto jj = idx[0] / x; 71 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 72 | sd[idx[0]] = alpha * sd[idx[0]] + beta * r[idx[0]]; 73 | } 74 | }); 75 | }); 76 | #ifdef ENABLE_PROFILING 77 | device_queue.wait_and_throw(); 78 | #endif 79 | } 80 | 81 | // PPCG solver kernels 82 | void run_ppcg_init(Chunk *chunk, 
Settings &settings) { 83 | START_PROFILING(settings.kernel_profile); 84 | 85 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, (chunk->sd), (chunk->r), *(chunk->ext->device_queue)); 86 | 87 | STOP_PROFILING(settings.kernel_profile, __func__); 88 | } 89 | 90 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 91 | START_PROFILING(settings.kernel_profile); 92 | 93 | ppcg_calc_ur(chunk->x, chunk->y, settings.halo_depth, (chunk->sd), (chunk->r), (chunk->u), (chunk->kx), (chunk->ky), 94 | *(chunk->ext->device_queue)); 95 | 96 | ppcg_calc_sd(chunk->x, chunk->y, settings.halo_depth, chunk->theta, alpha, beta, (chunk->sd), (chunk->r), *(chunk->ext->device_queue)); 97 | 98 | STOP_PROFILING(settings.kernel_profile, __func__); 99 | } 100 | -------------------------------------------------------------------------------- /src/sycl-usm/sycl_shared.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | using namespace cl::sycl; 6 | 7 | using SyclBuffer = double *; 8 | 9 | template inline auto reduction_shim(T *b, T init, BinaryOp f) { 10 | #if defined(__HIPSYCL__) || defined(__OPENSYCL__) 11 | return sycl::reduction(b, init, f); 12 | #else 13 | return sycl::reduction(b, init, f, sycl::property::reduction::initialize_to_identity()); 14 | #endif 15 | } 16 | -------------------------------------------------------------------------------- /tea.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=512 8 | 
y_cells=512 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=20 15 | max_iters=10000 16 | #use_chebyshev 17 | #use_ppcg 18 | #use_jacobi 19 | use_cg 20 | eps 1.0e-15 21 | test_problem 5 22 | profiler_on 23 | use_c_kernels 24 | *endtea 25 | -------------------------------------------------------------------------------- /tea.problems: -------------------------------------------------------------------------------- 1 | 512 512 20 1.034697091898282e+02 2 | 4096 4096 1 8.789826115915487e+01 3 | 1024 1024 20 1.012100932683400e+02 4 | 64 64 1 1.084697720003111e+02 5 | 1000 1000 10 9.727733205075556e+01 6 | 2000 2000 10 9.605026999605091e+01 7 | 4000 4000 10 9.5462351582214282e+01 8 | 8000 8000 10 9.517473876862078e+01 9 | 1000 1000 4 9.348844542172745e+01 10 | 2000 2000 4 9.213313338102208e+01 11 | 4000 4000 4 9.150788514428132e+01 12 | 8000 8000 4 9.120746325516782e+01 13 | 1000 1000 2 9.161051549004094e+01 14 | 2000 2000 2 9.010618606381739e+01 15 | 4000 4000 2 8.944258537125111e+01 16 | 8000 8000 2 8.913203173864531e+01 17 | --------------------------------------------------------------------------------