├── .clang-format ├── .gitignore ├── Benchmarks ├── tea_bm_1.in ├── tea_bm_1.out ├── tea_bm_2.in ├── tea_bm_2.out ├── tea_bm_3.in ├── tea_bm_3.out ├── tea_bm_4.in ├── tea_bm_4.out ├── tea_bm_5.in ├── tea_bm_5.out ├── tea_bm_5e_1.in ├── tea_bm_5e_1_2.in ├── tea_bm_5e_1_4.in ├── tea_bm_5e_2.in ├── tea_bm_5e_2_2.in ├── tea_bm_5e_2_4.in ├── tea_bm_5e_4.in ├── tea_bm_5e_4_2.in ├── tea_bm_5e_4_4.in ├── tea_bm_5e_8.in ├── tea_bm_5e_8_2.in ├── tea_bm_5e_8_4.in ├── tea_bm_6.in └── tea_bm_6.out ├── CMakeLists.txt ├── README.md ├── build.sh ├── cmake └── register_models.cmake ├── driver ├── application.h ├── cg_driver.cpp ├── cheby_driver.cpp ├── chunk.cpp ├── chunk.h ├── comms.cpp ├── comms.h ├── diffuse.cpp ├── drivers.h ├── eigenvalue_driver.cpp ├── field_summary_driver.cpp ├── halo_update_driver.cpp ├── initialise.cpp ├── jacobi_driver.cpp ├── kernel_initialise_driver.cpp ├── kernel_interface.h ├── main.cpp ├── mpi_shim.cpp ├── mpi_shim.h ├── parse_config.cpp ├── ppcg_driver.cpp ├── profiler.cpp ├── profiler.h ├── remote_halo_driver.cpp ├── set_chunk_data_driver.cpp ├── set_chunk_state_driver.cpp ├── settings.cpp ├── settings.h ├── shared.cpp ├── shared.h ├── solve_finished_driver.cpp └── store_energy_driver.cpp ├── src ├── cuda │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── cuknl_shared.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── hip │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── cuknl_shared.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── kokkos │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── kokkos_shared.hpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── omp │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── 
diffuse_overload.cpp │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── serial │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ └── solver_methods.cpp ├── std-indices │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── dpl_shim.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ ├── ranged.h │ ├── solver_methods.cpp │ └── std_shared.h ├── sycl-acc │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ ├── solver_methods.cpp │ └── sycl_shared.hpp └── sycl-usm │ ├── cg.cpp │ ├── cheby.cpp │ ├── chunk_extension.h │ ├── jacobi.cpp │ ├── kernel_initialise.cpp │ ├── local_halos.cpp │ ├── model.cmake │ ├── pack_halos.cpp │ ├── ppcg.cpp │ ├── solver_methods.cpp │ └── sycl_shared.hpp ├── tea.in ├── tea.problems └── test.sh /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | AllowShortIfStatementsOnASingleLine: Always 3 | AllowShortCaseLabelsOnASingleLine: true 4 | AllowShortFunctionsOnASingleLine: All 5 | AlignEscapedNewlines: Left 6 | IndentCaseLabels: true 7 | ColumnLimit: 140 8 | CompactNamespaces: true 9 | FixNamespaceComments: true 10 | IndentPPDirectives: BeforeHash 11 | ... 
12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.dat 2 | *.o 3 | *.a 4 | *~ 5 | run 6 | submit 7 | submit.* 8 | *.mod 9 | .*.swp 10 | .DS_STORE 11 | tea.out 12 | *.ptx 13 | *.cub 14 | *.lst 15 | tealeaf 16 | tealeaf.* 17 | .ycm_extra_conf.py* 18 | *.optrpt 19 | src.* 20 | tags 21 | 22 | # CMake 23 | cmake-build-*/ 24 | Build/ 25 | build/ 26 | build_*/ 27 | .idea/ 28 | 29 | # IntelliJ 30 | out/ 31 | 32 | 33 | # KDE directory preferences 34 | .directory 35 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_1.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=10 8 | y_cells=10 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 1 19 | use_c_kernels 20 | *endtea 21 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle 
xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=250 8 | y_cells=250 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 2 19 | use_c_kernels 20 | *endtea 21 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_3.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=500 8 | y_cells=500 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 3 19 | use_c_kernels 20 | *endtea 21 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 4 19 | use_c_kernels 20 | *endtea 21 | 
-------------------------------------------------------------------------------- /Benchmarks/tea_bm_5.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_1.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_1_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 
geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_1_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=1000 8 | y_cells=1000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 
geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=2000 8 | y_cells=2000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_2_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=2000 8 | y_cells=2000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_2_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=2000 8 | y_cells=2000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | 
profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_4_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_4_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 
energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=4000 8 | y_cells=4000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_8.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_8_2.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 
ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=2 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_5e_8_4.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=4 15 | max_iters=10000 16 | use_cg 17 | eps 1.0e-15 18 | test_problem 5 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /Benchmarks/tea_bm_6.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=8000 8 | y_cells=8000 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=10 15 | max_iters=10000 16 | use_cg 17 | eps 
1.0e-15 18 | test_problem 6 19 | profiler_on 20 | use_c_kernels 21 | *endtea 22 | -------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | module load cmake -------------------------------------------------------------------------------- /driver/application.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk.h" 4 | 5 | #define TEALEAF_VERSION "2.000" 6 | 7 | void initialise_model_info(Settings &settings); 8 | void initialise_application(Chunk **chunks, Settings &settings, State * states); 9 | bool diffuse(Chunk *chunk, Settings &settings); 10 | void read_config(Settings &settings, State **states); 11 | 12 | #ifdef DIFFUSE_OVERLOAD 13 | bool diffuse_overload(Chunk *chunk, Settings &settings); 14 | #endif 15 | -------------------------------------------------------------------------------- /driver/cg_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | 6 | // Performs a full solve with the CG solver kernels 7 | void cg_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error) { 8 | int tt; 9 | double rro = 0.0; 10 | 11 | // Perform CG initialisation 12 | cg_init_driver(chunks, settings, rx, ry, &rro); 13 | 14 | // Iterate till convergence 15 | for (tt = 0; tt < settings.max_iters; ++tt) { 16 | cg_main_step_driver(chunks, settings, tt, &rro, error); 17 | 18 | halo_update_driver(chunks, settings, 1); 19 | 20 | if (sqrt(fabs(*error)) < settings.eps) break; 21 | } 22 | 23 | print_and_log(settings, " CG: \t\t\t%d iterations\n", tt); 24 | } 25 | 26 | // Invokes the CG initialisation kernels 27 | void cg_init_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *rro) { 
28 | *rro = 0.0; 29 | 30 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 31 | if (settings.kernel_language == Kernel_Language::C) { 32 | run_cg_init(&(chunks[cc]), settings, rx, ry, rro); 33 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 34 | } 35 | } 36 | 37 | // Need to update for the matvec 38 | reset_fields_to_exchange(settings); 39 | settings.fields_to_exchange[FIELD_U] = true; 40 | settings.fields_to_exchange[FIELD_P] = true; 41 | halo_update_driver(chunks, settings, 1); 42 | 43 | sum_over_ranks(settings, rro); 44 | 45 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 46 | if (settings.kernel_language == Kernel_Language::C) { 47 | run_copy_u(&(chunks[cc]), settings); 48 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 49 | } 50 | } 51 | } 52 | 53 | // Invokes the main CG solve kernels 54 | void cg_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *rro, double *error) { 55 | double pw = 0.0; 56 | 57 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 58 | if (settings.kernel_language == Kernel_Language::C) { 59 | run_cg_calc_w(&(chunks[cc]), settings, &pw); 60 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 61 | } 62 | } 63 | 64 | sum_over_ranks(settings, &pw); 65 | 66 | double alpha = *rro / pw; 67 | double rrn = 0.0; 68 | 69 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 70 | // TODO: Some redundancy across chunks?? 71 | chunks[cc].cg_alphas[tt] = alpha; 72 | 73 | if (settings.kernel_language == Kernel_Language::C) { 74 | run_cg_calc_ur(&(chunks[cc]), settings, alpha, &rrn); 75 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 76 | } 77 | } 78 | 79 | sum_over_ranks(settings, &rrn); 80 | 81 | double beta = rrn / *rro; 82 | 83 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 84 | // TODO: Some redundancy across chunks?? 
85 | chunks[cc].cg_betas[tt] = beta; 86 | 87 | if (settings.kernel_language == Kernel_Language::C) { 88 | run_cg_calc_p(&(chunks[cc]), settings, beta); 89 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 90 | } 91 | } 92 | 93 | *error = rrn; 94 | *rro = rrn; 95 | } 96 | -------------------------------------------------------------------------------- /driver/chunk.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "chunk.h" 6 | 7 | // static void dump_data(FILE *out, const char *name, double *data, int size) { 8 | // auto *host = static_cast(std::malloc(size * sizeof(double))); 9 | // std::memcpy(host, data, size * sizeof(double)); 10 | // 11 | // bool all_zero = true; 12 | // for (int i = 0; i < size; i++) { 13 | // if (host[i] != 0.0) { 14 | // all_zero = false; 15 | // break; 16 | // } 17 | // } 18 | // 19 | // std::fprintf(out, "[%s,+0]", name); 20 | // if (all_zero) { 21 | // std::fprintf(out, "(0.0 * %d)", size); 22 | // } else { 23 | // for (int i = 0; i < size; i++) { 24 | // std::fprintf(out, "%.5f,", host[i]); 25 | // if (i % 20 == 0) { 26 | // std::fprintf(out, "\n[%s,+%d]", name, i); 27 | // } 28 | // } 29 | // } 30 | // std::fprintf(out, "\n"); 31 | // free(host); 32 | // } 33 | // 34 | // void dump_chunk(const char *prefix, const char *suffix, Chunk *chunk, Settings &settings) { 35 | // char name[256] = {}; 36 | // sprintf(name, "%s_rank=%d+%s.txt", prefix, settings.rank, suffix); 37 | // FILE *out = fopen(name, "w"); 38 | // 39 | // std::fprintf(out, "x=%d\n", chunk->x); 40 | // std::fprintf(out, "y=%d\n", chunk->y); 41 | // std::fprintf(out, "dt_init=%f\n", chunk->dt_init); 42 | // 43 | // std::fprintf(out, "left=%d\n", chunk->left); 44 | // std::fprintf(out, "right=%d\n", chunk->right); 45 | // std::fprintf(out, "bottom=%d\n", chunk->bottom); 46 | // std::fprintf(out, "top=%d\n", chunk->top); 47 | // 48 | //// dump_data(out, "density", 
chunk->density, chunk->x * chunk->y); 49 | //// dump_data(out, "energy", chunk->energy, chunk->x * chunk->y); 50 | //// dump_data(out, "u", chunk->u, chunk->x * chunk->y); 51 | //// dump_data(out, "p", chunk->p, chunk->x * chunk->y); 52 | //// dump_data(out, "r", chunk->r, chunk->x * chunk->y); 53 | //// dump_data(out, "w", chunk->w, chunk->x * chunk->y); 54 | //// dump_data(out, "kx", chunk->kx, chunk->x * chunk->y); 55 | //// dump_data(out, "ky", chunk->ky, chunk->x * chunk->y); 56 | // 57 | // std::fclose(out); 58 | //} 59 | 60 | // Initialise the chunk 61 | void initialise_chunk(Chunk *chunk, Settings &settings, int x, int y) { 62 | // Initialise the key variables 63 | chunk->x = x + settings.halo_depth * 2; 64 | chunk->y = y + settings.halo_depth * 2; 65 | chunk->dt_init = settings.dt_init; 66 | 67 | // Allocate the neighbour list 68 | chunk->neighbours = static_cast(std::malloc(sizeof(int) * NUM_FACES)); 69 | 70 | // Allocate the MPI comm buffers 71 | // int lr_len = chunk->y * settings.halo_depth * NUM_FIELDS; 72 | // chunk->left_send = static_cast(std::malloc(sizeof(double) * lr_len)); 73 | // chunk->left_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 74 | // chunk->right_send = static_cast(std::malloc(sizeof(double) * lr_len)); 75 | // chunk->right_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 76 | 77 | // int tb_len = chunk->x * settings.halo_depth * NUM_FIELDS; 78 | // chunk->top_send = static_cast(std::malloc(sizeof(double) * tb_len)); 79 | // chunk->top_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 80 | // chunk->bottom_send = static_cast(std::malloc(sizeof(double) * tb_len)); 81 | // chunk->bottom_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 82 | 83 | // int lr_len = chunk->y * settings.halo_depth * NUM_FIELDS; 84 | // chunk->staging_left_send = static_cast(std::malloc(sizeof(double) * lr_len)); 85 | // chunk->staging_left_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 86 | // 
chunk->staging_right_send = static_cast(std::malloc(sizeof(double) * lr_len)); 87 | // chunk->staging_right_recv = static_cast(std::malloc(sizeof(double) * lr_len)); 88 | 89 | // int tb_len = chunk->x * settings.halo_depth * NUM_FIELDS; 90 | // chunk->staging_top_send = static_cast(std::malloc(sizeof(double) * tb_len)); 91 | // chunk->staging_top_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 92 | // chunk->staging_bottom_send = static_cast(std::malloc(sizeof(double) * tb_len)); 93 | // chunk->staging_bottom_recv = static_cast(std::malloc(sizeof(double) * tb_len)); 94 | 95 | // Initialise the ChunkExtension, which allows composition of extended 96 | // fields specific to individual implementations 97 | chunk->ext = static_cast(std::malloc(sizeof(ChunkExtension))); 98 | } 99 | 100 | // Finalise the chunk 101 | void finalise_chunk(Chunk *chunk) { 102 | free(chunk->neighbours); 103 | free(chunk->ext); 104 | // free(chunk->left_send); 105 | // free(chunk->left_recv); 106 | // free(chunk->right_send); 107 | // free(chunk->right_recv); 108 | // free(chunk->top_send); 109 | // free(chunk->top_recv); 110 | // free(chunk->bottom_send); 111 | // free(chunk->bottom_recv); 112 | } 113 | -------------------------------------------------------------------------------- /driver/chunk.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk_extension.h" 4 | #include "settings.h" 5 | #include 6 | 7 | // The core Tealeaf interface class. 
8 | struct Chunk { 9 | // Solve-wide variables 10 | double dt_init; 11 | 12 | // Neighbouring ranks 13 | int *neighbours; 14 | 15 | // MPI comm buffers 16 | FieldBufferType left_send; 17 | FieldBufferType left_recv; 18 | FieldBufferType right_send; 19 | FieldBufferType right_recv; 20 | FieldBufferType top_send; 21 | FieldBufferType top_recv; 22 | FieldBufferType bottom_send; 23 | FieldBufferType bottom_recv; 24 | 25 | StagingBufferType staging_left_send; 26 | StagingBufferType staging_left_recv; 27 | StagingBufferType staging_right_send; 28 | StagingBufferType staging_right_recv; 29 | StagingBufferType staging_top_send; 30 | StagingBufferType staging_top_recv; 31 | StagingBufferType staging_bottom_send; 32 | StagingBufferType staging_bottom_recv; 33 | 34 | // Mesh chunks 35 | int left; 36 | int right; 37 | int bottom; 38 | int top; 39 | 40 | // Field dimensions 41 | int x; 42 | int y; 43 | 44 | // Field buffers 45 | FieldBufferType density0; 46 | FieldBufferType density; 47 | FieldBufferType energy0; 48 | FieldBufferType energy; 49 | 50 | FieldBufferType u; 51 | FieldBufferType u0; 52 | FieldBufferType p; 53 | FieldBufferType r; 54 | FieldBufferType mi; 55 | FieldBufferType w; 56 | FieldBufferType kx; 57 | FieldBufferType ky; 58 | FieldBufferType sd; 59 | 60 | FieldBufferType cell_x; 61 | FieldBufferType cell_y; 62 | FieldBufferType cell_dx; 63 | FieldBufferType cell_dy; 64 | 65 | FieldBufferType vertex_dx; 66 | FieldBufferType vertex_dy; 67 | FieldBufferType vertex_x; 68 | FieldBufferType vertex_y; 69 | 70 | FieldBufferType volume; 71 | FieldBufferType x_area; 72 | FieldBufferType y_area; 73 | 74 | // Cheby and PPCG 75 | double theta; 76 | double eigmin; 77 | double eigmax; 78 | 79 | double *cg_alphas; 80 | double *cg_betas; 81 | double *cheby_alphas; 82 | double *cheby_betas; 83 | 84 | ChunkExtension *ext; 85 | }; 86 | 87 | struct Settings; 88 | 89 | void dump_chunk(const char *prefix, const char *suffix, Chunk *chunk, Settings &settings); 90 | void 
initialise_chunk(Chunk *chunk, Settings &settings, int x, int y); 91 | void finalise_chunk(Chunk *chunk); 92 | -------------------------------------------------------------------------------- /driver/comms.cpp: -------------------------------------------------------------------------------- 1 | #include "comms.h" 2 | #include "settings.h" 3 | 4 | // Initialise MPI 5 | void initialise_comms(int argc, char **argv) { MPI_Init(&argc, &argv); } 6 | 7 | // Initialise the rank information 8 | void initialise_ranks(Settings &settings) { 9 | MPI_Comm_rank(MPI_COMM_WORLD, &settings.rank); 10 | MPI_Comm_size(MPI_COMM_WORLD, &settings.num_ranks); 11 | } 12 | 13 | // Teardown MPI 14 | void finalise_comms() { MPI_Finalize(); } 15 | 16 | // Sends a message out and receives a message in 17 | void send_recv_message(Settings &settings, double *send_buffer, double *recv_buffer, int buffer_len, int neighbour, int send_tag, 18 | int recv_tag, MPI_Request *send_request, MPI_Request *recv_request) { 19 | START_PROFILING(settings.kernel_profile); 20 | 21 | MPI_Isend(send_buffer, buffer_len, MPI_DOUBLE, neighbour, send_tag, MPI_COMM_WORLD, send_request); 22 | MPI_Irecv(recv_buffer, buffer_len, MPI_DOUBLE, neighbour, recv_tag, MPI_COMM_WORLD, recv_request); 23 | 24 | STOP_PROFILING(settings.kernel_profile, __func__); 25 | } 26 | 27 | // Waits for all requests to complete 28 | void wait_for_requests(Settings &settings, int num_requests, MPI_Request *requests) { 29 | START_PROFILING(settings.kernel_profile); 30 | MPI_Waitall(num_requests, requests, MPI_STATUSES_IGNORE); 31 | STOP_PROFILING(settings.kernel_profile, __func__); 32 | } 33 | 34 | // Reduce over all ranks to get sum 35 | void sum_over_ranks(Settings &settings, double *a) { 36 | START_PROFILING(settings.kernel_profile); 37 | double temp = *a; 38 | MPI_Allreduce(&temp, a, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); 39 | STOP_PROFILING(settings.kernel_profile, __func__); 40 | } 41 | 42 | // Reduce across all ranks to get minimum value 43 
| void min_over_ranks(Settings &settings, double *a) { 44 | START_PROFILING(settings.kernel_profile); 45 | double temp = *a; 46 | MPI_Allreduce(&temp, a, 1, MPI_DOUBLE, MPI_MIN, MPI_COMM_WORLD); 47 | STOP_PROFILING(settings.kernel_profile, __func__); 48 | } 49 | 50 | // Synchronise all ranks 51 | void barrier() { MPI_Barrier(MPI_COMM_WORLD); } 52 | 53 | // End the application 54 | void abort_comms() { MPI_Abort(MPI_COMM_WORLD, 1); } 55 | -------------------------------------------------------------------------------- /driver/comms.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef NO_MPI 4 | // XXX OpenMPI pulls in CXX headers which we don't link against, prevent that: 5 | #define OMPI_SKIP_MPICXX 6 | #include 7 | #if __has_include("mpi-ext.h") // C23, but everyone supports this already 8 | #include "mpi-ext.h" // for CUDA-aware MPI checks 9 | #endif 10 | #else 11 | #include "mpi_shim.h" 12 | #endif 13 | 14 | #include "chunk.h" 15 | #include "settings.h" 16 | 17 | void barrier(); 18 | void abort_comms(); 19 | void finalise_comms(); 20 | void initialise_comms(int argc, char **argv); 21 | void initialise_ranks(Settings &settings); 22 | void sum_over_ranks(Settings &settings, double *a); 23 | void min_over_ranks(Settings &settings, double *a); 24 | void wait_for_requests(Settings &settings, int num_requests, MPI_Request *requests); 25 | void send_recv_message(Settings &settings, double *send_buffer, double *recv_buffer, int buffer_len, int neighbour, int send_tag, 26 | int recv_tag, MPI_Request *send_request, MPI_Request *recv_request); -------------------------------------------------------------------------------- /driver/diffuse.cpp: -------------------------------------------------------------------------------- 1 | #include "application.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | 5 | double calc_dt(Chunk *chunks); 6 | void calc_min_timestep(Chunk *chunks, double *dt, int 
chunks_per_task); 7 | void solve(Chunk *chunks, Settings &settings, int tt, double *wallclock_prev); 8 | 9 | // The main timestep loop 10 | bool diffuse(Chunk *chunks, Settings &settings) { 11 | double wallclock_prev = 0.0; 12 | for (int tt = 0; tt < settings.end_step; ++tt) { 13 | solve(chunks, settings, tt, &wallclock_prev); 14 | } 15 | 16 | return field_summary_driver(chunks, settings, true); 17 | } 18 | 19 | // Performs a solve for a single timestep 20 | void solve(Chunk *chunks, Settings &settings, int tt, double *wallclock_prev) { 21 | print_and_log(settings, "\n Timestep %d\n", tt + 1); 22 | profiler_start_timer(settings.wallclock_profile); 23 | 24 | // Calculate minimum timestep information 25 | double dt = settings.dt_init; 26 | calc_min_timestep(chunks, &dt, settings.num_chunks_per_rank); 27 | 28 | // Pick the smallest timestep across all ranks 29 | min_over_ranks(settings, &dt); 30 | 31 | double rx = dt / (settings.dx * settings.dx); 32 | double ry = dt / (settings.dy * settings.dy); 33 | 34 | // Prepare halo regions for solve 35 | reset_fields_to_exchange(settings); 36 | settings.fields_to_exchange[FIELD_ENERGY1] = true; 37 | settings.fields_to_exchange[FIELD_DENSITY] = true; 38 | halo_update_driver(chunks, settings, 2); 39 | 40 | double error = 1e+10; 41 | 42 | // Perform the solve with one of the integrated solvers 43 | switch (settings.solver) { 44 | case Solver::JACOBI_SOLVER: jacobi_driver(chunks, settings, rx, ry, &error); break; 45 | case Solver::CG_SOLVER: cg_driver(chunks, settings, rx, ry, &error); break; 46 | case Solver::CHEBY_SOLVER: cheby_driver(chunks, settings, rx, ry, &error); break; 47 | case Solver::PPCG_SOLVER: ppcg_driver(chunks, settings, rx, ry, &error); break; 48 | } 49 | 50 | // Perform solve finalisation tasks 51 | solve_finished_driver(chunks, settings); 52 | 53 | if (tt % settings.summary_frequency == 0) { 54 | field_summary_driver(chunks, settings, false); 55 | } 56 | 57 | profiler_end_timer(settings.wallclock_profile, 
"Wallclock"); 58 | 59 | double wallclock = settings.wallclock_profile->profiler_entries[0].time; 60 | print_and_log(settings, " Wallclock: \t\t%.3lfs\n", wallclock); 61 | print_and_log(settings, " Avg. time per cell: \t%.6e\n", (wallclock - *wallclock_prev) / (settings.grid_x_cells * settings.grid_y_cells)); 62 | print_and_log(settings, " Error: \t\t%.6e\n", error); 63 | } 64 | 65 | // Calculate minimum timestep 66 | void calc_min_timestep(Chunk *chunks, double *dt, int chunks_per_task) { 67 | for (int cc = 0; cc < chunks_per_task; ++cc) { 68 | double dtlp = calc_dt(&(chunks[cc])); 69 | 70 | if (dtlp < *dt) { 71 | *dt = dtlp; 72 | } 73 | } 74 | } 75 | 76 | // Calculates a value for dt 77 | double calc_dt(Chunk *chunk) { 78 | // Currently defaults to config provided value 79 | return chunk->dt_init; 80 | } 81 | -------------------------------------------------------------------------------- /driver/drivers.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk.h" 4 | 5 | // Initialisation drivers 6 | void set_chunk_data_driver(Chunk *chunk, Settings &settings); 7 | void set_chunk_state_driver(Chunk *chunk, Settings &settings, State *states); 8 | void kernel_initialise_driver(Chunk *chunks, Settings &settings); 9 | void kernel_finalise_driver(Chunk *chunks, Settings &settings); 10 | 11 | // Halo drivers 12 | void halo_update_driver(Chunk *chunks, Settings &settings, int depth); 13 | void remote_halo_driver(Chunk *chunks, Settings &settings, int depth); 14 | 15 | // Conjugate Gradient solver drivers 16 | void cg_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error); 17 | void cg_init_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *rro); 18 | void cg_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *rro, double *error); 19 | 20 | // Chebyshev solver drivers 21 | void cheby_driver(Chunk *chunks, Settings &settings, double rx, double 
ry, double *error); 22 | void cheby_init_driver(Chunk *chunks, Settings &settings, int num_cg_iters, double *bb); 23 | void cheby_coef_driver(Chunk *chunks, Settings &settings, int max_iters); 24 | void cheby_main_step_driver(Chunk *chunks, Settings &settings, int cheby_iters, bool is_calc_2norm, double *error); 25 | 26 | // PPCG solver drivers 27 | void ppcg_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error); 28 | void ppcg_init_driver(Chunk *chunks, Settings &settings, double *rro); 29 | void ppcg_main_step_driver(Chunk *chunks, Settings &settings, double *rro, double *error); 30 | 31 | // Jacobi solver drivers 32 | void jacobi_driver(Chunk *chunks, Settings &settings, double rx, double ry, double *error); 33 | void jacobi_init_driver(Chunk *chunks, Settings &settings, double rx, double ry); 34 | void jacobi_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *error); 35 | 36 | // Misc drivers 37 | bool field_summary_driver(Chunk *chunks, Settings &settings, bool solve_finished); 38 | void store_energy_driver(Chunk *chunk, Settings &settings); 39 | void solve_finished_driver(Chunk *chunks, Settings &settings); 40 | void eigenvalue_driver_initialise(Chunk *chunks, Settings &settings, int num_cg_iters); 41 | -------------------------------------------------------------------------------- /driver/eigenvalue_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | #include 6 | #include 7 | #include 8 | 9 | void tqli(double *d, double *e, int n); 10 | 11 | // Calculates the eigenvalues from cg_alphas and cg_betas 12 | void eigenvalue_driver_initialise(Chunk *chunks, Settings &settings, int num_cg_iters) { 13 | START_PROFILING(settings.kernel_profile); 14 | 15 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 16 | double diag[num_cg_iters]; 17 | double 
offdiag[num_cg_iters]; 18 | std::memset(diag, 0, sizeof(diag)); 19 | std::memset(offdiag, 0, sizeof(offdiag)); 20 | 21 | // Prepare matrix 22 | for (int ii = 0; ii < num_cg_iters; ++ii) { 23 | diag[ii] = 1.0 / chunks[cc].cg_alphas[ii]; 24 | 25 | if (ii > 0) { 26 | diag[ii] += chunks[cc].cg_betas[ii - 1] / chunks[cc].cg_alphas[ii - 1]; 27 | } 28 | if (ii < num_cg_iters - 1) { 29 | offdiag[ii + 1] = std::sqrt(chunks[cc].cg_betas[ii]) / chunks[cc].cg_alphas[ii]; 30 | } 31 | } 32 | 33 | // Calculate the eigenvalues (ignore eigenvectors) 34 | tqli(diag, offdiag, num_cg_iters); 35 | 36 | chunks[cc].eigmin = DBL_MAX; 37 | chunks[cc].eigmax = -DBL_MAX; // max-reduction sentinel: DBL_MIN is the smallest positive double, not the most negative value 38 | 39 | // Get minimum and maximum eigenvalues 40 | for (int ii = 0; ii < num_cg_iters; ++ii) { 41 | chunks[cc].eigmin = tealeaf_MIN(chunks[cc].eigmin, diag[ii]); 42 | chunks[cc].eigmax = tealeaf_MAX(chunks[cc].eigmax, diag[ii]); 43 | } 44 | 45 | if (chunks[cc].eigmin < 0.0 || chunks[cc].eigmax < 0.0) { 46 | die(__LINE__, __FILE__, "Calculated negative eigenvalues.\n"); 47 | } 48 | 49 | // TODO: Find out the reasoning behind this!? 50 | // Adds some buffer for precision maybe? 51 | chunks[cc].eigmin *= 0.95; 52 | chunks[cc].eigmax *= 1.05; 53 | 54 | print_and_log(settings, "Min. eigenvalue: \t%.12e\nMax.
eigenvalue: \t%.12e\n", chunks[cc].eigmin, chunks[cc].eigmax); 55 | } 56 | 57 | STOP_PROFILING(settings.kernel_profile, __func__); 58 | } 59 | 60 | // Adapted from 61 | // http://ftp.cs.stanford.edu/cs/robotics/scohen/nr/tqli.c 62 | void tqli(double *d, double *e, int n) { 63 | int m, l, iter, i; 64 | double s, r, p, g, f, dd, c, b; 65 | 66 | for (i = 0; i < n - 1; i++) 67 | e[i] = e[i + 1]; 68 | e[n - 1] = 0.0; 69 | for (l = 0; l < n; l++) { 70 | iter = 0; 71 | do { 72 | for (m = l; m < n - 1; m++) { 73 | dd = std::fabs(d[m]) + std::fabs(d[m + 1]); 74 | if (std::fabs(e[m]) + dd == dd) break; 75 | } 76 | 77 | if (m == l) break; 78 | 79 | if (iter++ == 30) { 80 | die(__LINE__, __FILE__, "Too many iterations in TQLI routine\n"); 81 | } 82 | g = (d[l + 1] - d[l]) / (2.0 * e[l]); 83 | r = std::sqrt((g * g) + 1.0); 84 | g = d[m] - d[l] + e[l] / (g + tealeaf_sign(r, g)); 85 | s = c = 1.0; 86 | p = 0.0; 87 | for (i = m - 1; i >= l; i--) { 88 | f = s * e[i]; 89 | b = c * e[i]; 90 | r = std::sqrt(f * f + g * g); 91 | e[i + 1] = r; 92 | if (r == 0.0) { 93 | d[i + 1] -= p; 94 | e[m] = 0.0; 95 | continue; 96 | } 97 | s = f / r; 98 | c = g / r; 99 | g = d[i + 1] - p; 100 | r = (d[i] - g) * s + 2.0 * c * b; 101 | p = s * r; 102 | d[i + 1] = g + p; 103 | g = c * r - b; 104 | } 105 | d[l] = d[l] - p; 106 | e[l] = g; 107 | e[m] = 0.0; 108 | } while (m != l); 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /driver/field_summary_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "kernel_interface.h" 4 | 5 | void get_checking_value(Settings &settings, double *checking_value); 6 | 7 | // Invokes the set chunk data kernel 8 | bool field_summary_driver(Chunk *chunks, Settings &settings, bool is_solve_finished) { 9 | double vol = 0.0; 10 | double ie = 0.0; 11 | double temp = 0.0; 12 | double mass = 0.0; 13 | 14 | for (int cc = 0; cc < 
settings.num_chunks_per_rank; ++cc) { 15 | if (settings.kernel_language == Kernel_Language::C) { 16 | run_field_summary(&(chunks[cc]), settings, &vol, &mass, &ie, &temp); 17 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 18 | } 19 | } 20 | 21 | // Bring all of the results to the master 22 | sum_over_ranks(settings, &vol); 23 | sum_over_ranks(settings, &mass); 24 | sum_over_ranks(settings, &ie); 25 | sum_over_ranks(settings, &temp); 26 | 27 | if (settings.rank == MASTER && settings.check_result && is_solve_finished) { 28 | print_and_log(settings, "\n Checking results...\n"); 29 | 30 | double checking_value = 1.0; 31 | get_checking_value(settings, &checking_value); 32 | 33 | print_and_log(settings, " Expected %.15e\n", checking_value); 34 | print_and_log(settings, " Actual %.15e\n", temp); 35 | 36 | double qa_diff = fabs(100.0 * (temp / checking_value) - 100.0); 37 | if (qa_diff < 0.001 && !std::isnan(temp)) { 38 | print_and_log(settings, " This run PASSED (Difference is within %.8lf%%)\n", qa_diff); 39 | return true; 40 | } else { 41 | print_and_log(settings, " This run FAILED (Difference is %.8lf%%)\n", qa_diff); 42 | return false; 43 | } 44 | } 45 | // only master needs to return validation failure if we see one 46 | return true; 47 | } 48 | 49 | // Fetches the checking value from the test problems file 50 | void get_checking_value(Settings &settings, double *checking_value) { 51 | FILE *test_problem_file = std::fopen(settings.test_problem_filename, "r"); 52 | 53 | if (!test_problem_file) { 54 | print_and_log(settings, "\n WARNING: Could not open the test problem file: %s, expected value will be invalid.\n", 55 | settings.test_problem_filename); 56 | return; 57 | } 58 | 59 | size_t len = 0; 60 | char *line = nullptr; 61 | 62 | // Get the number of states present in the config file 63 | while (getline(&line, &len, test_problem_file) != EOF) { 64 | int x = -1; // initialised: sscanf may fail to match, leaving these otherwise indeterminate (UB when compared below) 65 | int y = -1; 66 | int num_steps = -1; 67 | 68 | std::sscanf(line, "%d %d %d %lf", &x, &y,
&num_steps, checking_value); 69 | 70 | // Found the problem in the file 71 | if (x == settings.grid_x_cells && y == settings.grid_y_cells && num_steps == settings.end_step) { 72 | std::free(line); // getline() allocates; release on the early-return path too 73 | std::fclose(test_problem_file); return; 74 | } 75 | } 76 | 77 | std::free(line); // release the getline() buffer before leaving 78 | *checking_value = 1.0; print_and_log(settings, "\n WARNING: Problem was not found in the test problems file, expected value will be invalid.\n"); 79 | std::fclose(test_problem_file); 80 | } 81 | -------------------------------------------------------------------------------- /driver/halo_update_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "drivers.h" 2 | #include "kernel_interface.h" 3 | #include "settings.h" 4 | 5 | // Invoke the halo update kernels 6 | void halo_update_driver(Chunk *chunks, Settings &settings, int depth) { 7 | // Check that we actually have exchanges to perform 8 | if (!is_fields_to_exchange(settings)) return; 9 | 10 | remote_halo_driver(chunks, settings, depth); 11 | 12 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 13 | if (settings.kernel_language == Kernel_Language::C) { 14 | run_local_halos(&(chunks[cc]), settings, depth); 15 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 16 | // Fortran store energy kernel 17 | } 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /driver/initialise.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "application.h" 5 | #include "chunk.h" 6 | #include "drivers.h" 7 | #include "kernel_interface.h" 8 | #include "settings.h" 9 | 10 | // Decomposes the field into multiple chunks 11 | void decompose_field(Settings &settings, Chunk *chunks) { 12 | // Calculates the num chunks field is to be decomposed into 13 | settings.num_chunks = settings.num_ranks * settings.num_chunks_per_rank; 14 | 15 | int num_chunks = settings.num_chunks; 16 | 17 |
double best_metric = DBL_MAX; 18 | auto x_cells = static_cast(settings.grid_x_cells); 19 | auto y_cells = static_cast(settings.grid_y_cells); 20 | int x_chunks = 0; 21 | int y_chunks = 0; 22 | 23 | // Decompose by minimal area to perimeter 24 | for (int xx = 1; xx <= num_chunks; ++xx) { 25 | if (num_chunks % xx) continue; 26 | 27 | // Calculate number of chunks grouped by x split 28 | int yy = num_chunks / xx; 29 | 30 | if (num_chunks % yy) continue; 31 | 32 | double perimeter = ((x_cells / xx) * (x_cells / xx) + (y_cells / yy) * (y_cells / yy)) * 2; 33 | double area = (x_cells / xx) * (y_cells / yy); 34 | 35 | double current_metric = perimeter / area; 36 | 37 | // Save improved decompositions 38 | if (current_metric < best_metric) { 39 | x_chunks = xx; 40 | y_chunks = yy; 41 | best_metric = current_metric; 42 | } 43 | } 44 | 45 | // Check that the decomposition didn't fail 46 | if (!x_chunks || !y_chunks) { 47 | die(__LINE__, __FILE__, "Failed to decompose the field with given parameters.\n"); 48 | } 49 | 50 | int dx = settings.grid_x_cells / x_chunks; 51 | int dy = settings.grid_y_cells / y_chunks; 52 | 53 | int mod_x = settings.grid_x_cells % x_chunks; 54 | int mod_y = settings.grid_y_cells % y_chunks; 55 | int add_x_prev = 0; 56 | int add_y_prev = 0; 57 | 58 | // Compute the full decomposition on all ranks 59 | for (int yy = 0; yy < y_chunks; ++yy) { 60 | int add_y = (yy < mod_y); 61 | 62 | for (int xx = 0; xx < x_chunks; ++xx) { 63 | int add_x = (xx < mod_x); 64 | 65 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 66 | int chunk = xx + yy * x_chunks; 67 | int rank = cc + settings.rank * settings.num_chunks_per_rank; 68 | 69 | // Store the values for all chunks local to rank 70 | if (rank == chunk) { 71 | initialise_chunk(&(chunks[cc]), settings, dx + add_x, dy + add_y); 72 | 73 | // Set up the mesh ranges 74 | chunks[cc].left = xx * dx + add_x_prev; 75 | chunks[cc].right = chunks[cc].left + dx + add_x; 76 | chunks[cc].bottom = yy * dy + 
add_y_prev; 77 | chunks[cc].top = chunks[cc].bottom + dy + add_y; 78 | 79 | // Set up the chunk connectivity 80 | chunks[cc].neighbours[CHUNK_LEFT] = (xx == 0) ? EXTERNAL_FACE : chunk - 1; 81 | chunks[cc].neighbours[CHUNK_RIGHT] = (xx == x_chunks - 1) ? EXTERNAL_FACE : chunk + 1; 82 | chunks[cc].neighbours[CHUNK_BOTTOM] = (yy == 0) ? EXTERNAL_FACE : chunk - x_chunks; 83 | chunks[cc].neighbours[CHUNK_TOP] = (yy == y_chunks - 1) ? EXTERNAL_FACE : chunk + x_chunks; 84 | } 85 | } 86 | 87 | // If chunks rounded up, maintain relative location 88 | add_x_prev += add_x; 89 | } 90 | add_x_prev = 0; 91 | add_y_prev += add_y; 92 | } 93 | } 94 | 95 | void initialise_model_info(Settings &settings) { run_model_info(settings); } 96 | 97 | // Initialise settings from input file 98 | void initialise_application(Chunk **chunks, Settings &settings, State* states) { 99 | 100 | *chunks = (Chunk *)malloc(sizeof(Chunk) * settings.num_chunks_per_rank); 101 | 102 | decompose_field(settings, *chunks); 103 | kernel_initialise_driver(*chunks, settings); 104 | set_chunk_data_driver(*chunks, settings); 105 | set_chunk_state_driver(*chunks, settings, states); 106 | 107 | // Prime the initial halo data 108 | reset_fields_to_exchange(settings); 109 | settings.fields_to_exchange[FIELD_DENSITY] = true; // start.f90:111 110 | settings.fields_to_exchange[FIELD_ENERGY0] = true; // start.f90:112 111 | settings.fields_to_exchange[FIELD_ENERGY1] = true; // start.f90:113 112 | halo_update_driver(*chunks, settings, 2); 113 | 114 | store_energy_driver(*chunks, settings); 115 | } 116 | -------------------------------------------------------------------------------- /driver/jacobi_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | 6 | // Performs a full solve with the Jacobi solver kernels 7 | void jacobi_driver(Chunk *chunks, Settings &settings, double rx, 
double ry, double *error) { 8 | jacobi_init_driver(chunks, settings, rx, ry); 9 | 10 | // Iterate till convergence 11 | int tt; 12 | for (tt = 0; tt < settings.max_iters; ++tt) { 13 | jacobi_main_step_driver(chunks, settings, tt, error); 14 | 15 | halo_update_driver(chunks, settings, 1); 16 | 17 | if (fabs(*error) < settings.eps) break; 18 | } 19 | 20 | print_and_log(settings, "Jacobi: \t\t%d iterations\n", tt); 21 | } 22 | 23 | // Invokes the CG initialisation kernels 24 | void jacobi_init_driver(Chunk *chunks, Settings &settings, double rx, double ry) { 25 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 26 | if (settings.kernel_language == Kernel_Language::C) { 27 | run_jacobi_init(&(chunks[cc]), settings, rx, ry); 28 | 29 | run_copy_u(&(chunks[cc]), settings); 30 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 31 | } 32 | } 33 | 34 | // Need to update for the matvec 35 | reset_fields_to_exchange(settings); 36 | settings.fields_to_exchange[FIELD_U] = true; 37 | } 38 | 39 | // Invokes the main Jacobi solve kernels 40 | void jacobi_main_step_driver(Chunk *chunks, Settings &settings, int tt, double *error) { 41 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 42 | if (settings.kernel_language == Kernel_Language::C) { 43 | run_jacobi_iterate(&(chunks[cc]), settings, error); 44 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 45 | } 46 | } 47 | 48 | if (tt % 50 == 0) { 49 | halo_update_driver(chunks, settings, 1); 50 | 51 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 52 | if (settings.kernel_language == Kernel_Language::C) { 53 | run_calculate_residual(&(chunks[cc]), settings); 54 | 55 | run_calculate_2norm(&(chunks[cc]), settings, chunks[cc].r, error); 56 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 57 | } 58 | } 59 | } 60 | 61 | sum_over_ranks(settings, error); 62 | } 63 | -------------------------------------------------------------------------------- 
/driver/kernel_initialise_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | 4 | // Invokes the kernel initialisation kernels 5 | void kernel_initialise_driver(Chunk *chunks, Settings &settings) { 6 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 7 | if (settings.kernel_language == Kernel_Language::C) { 8 | int lr_len = chunks[cc].y * settings.halo_depth * NUM_FIELDS; 9 | int tb_len = chunks[cc].x * settings.halo_depth * NUM_FIELDS; 10 | run_kernel_initialise(&(chunks[cc]), settings, lr_len, tb_len); 11 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 12 | } 13 | } 14 | } 15 | 16 | // Invokes the kernel finalisation drivers 17 | void kernel_finalise_driver(Chunk *chunks, Settings &settings) { 18 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 19 | if (settings.kernel_language == Kernel_Language::C) { 20 | run_kernel_finalise(&(chunks[cc]), settings); 21 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 22 | } 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /driver/kernel_interface.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "chunk.h" 4 | #include "comms.h" 5 | #include "settings.h" 6 | 7 | /* 8 | * This is the main interface file for C based implementations. 
9 | */ 10 | 11 | // Initialisation kernels 12 | void run_model_info(Settings &settings); 13 | void run_set_chunk_data(Chunk *chunk, Settings &settings); 14 | void run_set_chunk_state(Chunk *chunk, Settings &settings, State *states); 15 | void run_kernel_initialise(Chunk *chunk, Settings &settings, int comms_lr_len, int comms_tb_len); 16 | void run_kernel_finalise(Chunk *chunk, Settings &settings); 17 | 18 | // Solver-wide kernels 19 | void run_local_halos(Chunk *chunk, Settings &settings, int depth); 20 | 21 | // void run_pack_or_unpack(Chunk *chunk, Settings &settings, int depth, int face, bool pack, FieldBufferType field, 22 | // FieldBufferType destination); 23 | 24 | void run_pack_or_unpack(Chunk *chunk, Settings &settings, int depth, int face, bool pack, FieldBufferType field, 25 | FieldBufferType destination, int offset); 26 | // 27 | void run_send_recv_halo(Chunk *chunk, Settings &settings, // 28 | FieldBufferType src_send_buffer, FieldBufferType src_recv_buffer, // 29 | StagingBufferType dest_staging_send_buffer, StagingBufferType dest_staging_recv_buffer, // 30 | int buffer_len, int neighbour, // 31 | int send_tag, int recv_tag, // 32 | MPI_Request *send_request, MPI_Request *recv_request); 33 | void run_before_waitall_halo(Chunk *chunk, Settings &settings); 34 | void run_restore_recv_halo(Chunk *chunk, Settings &settings, // 35 | FieldBufferType dest_recv_buffer, StagingBufferType src_staging_recv_buffer, int buffer_len); 36 | 37 | void run_store_energy(Chunk *chunk, Settings &settings); 38 | void run_field_summary(Chunk *chunk, Settings &settings, double *vol, double *mass, double *ie, double *temp); 39 | 40 | // CG solver kernels 41 | void run_cg_init(Chunk *chunk, Settings &settings, double rx, double ry, double *rro); 42 | void run_cg_calc_w(Chunk *chunk, Settings &settings, double *pw); 43 | void run_cg_calc_ur(Chunk *chunk, Settings &settings, double alpha, double *rrn); 44 | void run_cg_calc_p(Chunk *chunk, Settings &settings, double beta); 45 | 46 
| // Chebyshev solver kernels 47 | void run_cheby_init(Chunk *chunk, Settings &settings); 48 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta); 49 | 50 | // Jacobi solver kernels 51 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry); 52 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error); 53 | 54 | // PPCG solver kernels 55 | void run_ppcg_init(Chunk *chunk, Settings &settings); 56 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta); 57 | 58 | // Shared solver kernels 59 | void run_copy_u(Chunk *chunk, Settings &settings); 60 | void run_calculate_residual(Chunk *chunk, Settings &settings); 61 | void run_calculate_2norm(Chunk *chunk, Settings &settings, FieldBufferType buffer, double *norm); 62 | void run_finalise(Chunk *chunk, Settings &settings); 63 | -------------------------------------------------------------------------------- /driver/mpi_shim.cpp: -------------------------------------------------------------------------------- 1 | #include "mpi_shim.h" 2 | #include 3 | 4 | #ifdef NO_MPI 5 | 6 | int MPI_Init(int *, char ***) { return MPI_SUCCESS; } 7 | int MPI_Comm_rank(MPI_Comm, int *rank) { 8 | *rank = 0; 9 | return MPI_SUCCESS; 10 | } 11 | int MPI_Comm_size(MPI_Comm, int *size) { 12 | *size = 1; 13 | return MPI_SUCCESS; 14 | } 15 | int MPI_Abort(MPI_Comm, int errorcode) { 16 | std::exit(errorcode); 17 | return MPI_SUCCESS; 18 | } 19 | int MPI_Finalize() { return MPI_SUCCESS; } 20 | 21 | int MPI_Barrier(MPI_Comm) { 22 | // XXX no-op, correct for 1 rank only 23 | return MPI_SUCCESS; 24 | } 25 | int MPI_Allgather(const void *, int, MPI_Datatype, void *, int, MPI_Datatype, MPI_Comm) { 26 | // XXX no-op, correct for 1 rank only 27 | return MPI_SUCCESS; 28 | } 29 | int MPI_Reduce(const void *, void *, int, MPI_Datatype, MPI_Op, int, MPI_Comm) { 30 | // XXX no-op, correct for 1 rank only 31 | return MPI_SUCCESS; 32 | } 33 | int 
MPI_Allreduce(const void *, void *, int, MPI_Datatype, MPI_Op, MPI_Comm) { 34 | // XXX no-op, correct for 1 rank only 35 | return MPI_SUCCESS; 36 | } 37 | int MPI_Waitall(int, MPI_Request[], MPI_Status[]) { 38 | // XXX no-op, correct for 1 rank only 39 | return MPI_SUCCESS; 40 | } 41 | 42 | int MPI_Isend(const void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Request *) { 43 | fprintf(stderr, "MPI disabled, stub: %s\n", __func__); 44 | std::abort(); 45 | return MPI_ERR_COMM; 46 | } 47 | int MPI_Irecv(void *, int, MPI_Datatype, int, int, MPI_Comm, MPI_Request *) { 48 | fprintf(stderr, "MPI disabled, stub: %s\n", __func__); 49 | std::abort(); 50 | return MPI_ERR_COMM; 51 | } 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /driver/mpi_shim.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #ifdef NO_MPI 5 | 6 | #define MPI_SUCCESS (0) 7 | #define MPI_ERR_COMM (1) 8 | #define MPI_ERR_COUNT (2) 9 | #define MPI_ERR_TYPE (3) 10 | #define MPI_ERR_BUFFER (4) 11 | 12 | #define MPI_INT (0) 13 | #define MPI_LONG (0) 14 | #define MPI_DOUBLE (0) 15 | #define MPI_SUM (0) 16 | #define MPI_MIN (0) 17 | #define MPI_MAX (0) 18 | #define MPI_STATUS_IGNORE (0) 19 | #define MPI_STATUSES_IGNORE (0) 20 | 21 | #define MPI_COMM_WORLD (0) 22 | 23 | using MPI_Comm = int; 24 | using MPI_Request = int; 25 | using MPI_Datatype = int; 26 | using MPI_Op = int; 27 | using MPI_Status = int; 28 | 29 | int MPI_Init(int *argc, char ***argv); 30 | int MPI_Comm_rank(MPI_Comm comm, int *rank); 31 | int MPI_Comm_size(MPI_Comm comm, int *size); 32 | int MPI_Abort(MPI_Comm comm, int errorcode); 33 | int MPI_Barrier(MPI_Comm comm); 34 | int MPI_Finalize(); 35 | 36 | int MPI_Reduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, int root, MPI_Comm comm); 37 | int MPI_Allreduce(const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op 
op, MPI_Comm comm); 38 | int MPI_Isend(const void *buf, int count, MPI_Datatype datatype, int dest, int tag, MPI_Comm comm, MPI_Request *request); 39 | int MPI_Irecv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Request *request); 40 | int MPI_Allgather(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, 41 | MPI_Comm comm); 42 | int MPI_Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]); 43 | 44 | #endif -------------------------------------------------------------------------------- /driver/profiler.cpp: -------------------------------------------------------------------------------- 1 | #include "profiler.h" 2 | #include 3 | #include 4 | #include 5 | 6 | #define tealeaf_strmatch(a, b) (strcmp(a, b) == 0) 7 | 8 | struct Profile *profiler_initialise() { 9 | auto *profile = static_cast(std::malloc(sizeof(Profile))); 10 | std::memset(profile, 0, sizeof(Profile)); 11 | return profile; 12 | } 13 | 14 | void profiler_finalise(Profile **profile) { 15 | std::free(*profile); 16 | *profile = nullptr; 17 | } 18 | 19 | // Internally start the profiling timer 20 | void profiler_start_timer(Profile *profile) { 21 | #ifdef __APPLE__ 22 | profile->profiler_start = mach_absolute_time(); 23 | #else 24 | clock_gettime(CLOCK_MONOTONIC, &profile->profiler_start); 25 | #endif 26 | } 27 | 28 | // Internally end the profiling timer and store results 29 | void profiler_end_timer(Profile *profile, const char *entry_name) { 30 | #ifdef __APPLE__ 31 | profile->profiler_end = mach_absolute_time(); 32 | #else 33 | clock_gettime(CLOCK_MONOTONIC, &profile->profiler_end); 34 | #endif 35 | 36 | // Check if an entry exists 37 | int ii; 38 | for (ii = 0; ii < profile->profiler_entry_count; ++ii) { 39 | if (tealeaf_strmatch(profile->profiler_entries[ii].name, entry_name)) { 40 | break; 41 | } 42 | } 43 | 44 | // Don't overrun 45 | if (ii >= PROFILER_MAX_ENTRIES) { 46 | 
printf("Attempted to profile too many entries, maximum is %d\n", PROFILER_MAX_ENTRIES); 47 | exit(1); 48 | } 49 | 50 | // Create new entry 51 | if (ii == profile->profiler_entry_count) { 52 | profile->profiler_entry_count++; 53 | strncpy(profile->profiler_entries[ii].name, entry_name, PROFILER_MAX_NAME - 1); profile->profiler_entries[ii].name[PROFILER_MAX_NAME - 1] = '\0'; // bounded copy: entry_name longer than PROFILER_MAX_NAME must not overflow the fixed buffer 54 | profile->profiler_entries[ii].time = 0; 55 | profile->profiler_entries[ii].calls = 0; 56 | } 57 | 58 | // Update number of calls and time 59 | #ifdef __APPLE__ 60 | double elapsed = (profile->profiler_end - profile->profiler_start) * 1.0E-9; 61 | #else 62 | double elapsed = (profile->profiler_end.tv_sec - profile->profiler_start.tv_sec) + 63 | (profile->profiler_end.tv_nsec - profile->profiler_start.tv_nsec) * 1.0E-9; 64 | #endif 65 | 66 | profile->profiler_entries[ii].time += elapsed; 67 | profile->profiler_entries[ii].calls++; 68 | } 69 | 70 | // Print the profiling results to output 71 | void profiler_print_full_profile(Profile *profile) { 72 | printf("\n -------------------------------------------------------------\n"); 73 | printf("\n Profiling Results:\n\n"); 74 | printf(" %-30s%8s%20s\n", "Kernel Name", "Calls", "Runtime (s)"); 75 | 76 | double total_elapsed_time = 0.0; 77 | for (int ii = 0; ii < profile->profiler_entry_count; ++ii) { 78 | total_elapsed_time += profile->profiler_entries[ii].time; 79 | printf(" %-30s%8d%20.03F\n", profile->profiler_entries[ii].name, profile->profiler_entries[ii].calls, 80 | profile->profiler_entries[ii].time); 81 | } 82 | 83 | printf("\n Total elapsed time: %.03Fs, entries * are excluded.\n", total_elapsed_time); 84 | printf("\n -------------------------------------------------------------\n\n"); 85 | } 86 | 87 | // Prints profile without extra details 88 | void profiler_print_simple_profile(Profile *profile) { 89 | for (int ii = 0; ii < profile->profiler_entry_count; ++ii) { 90 | printf("\033[1m\033[30m%s\033[0m: %.3lfs (%d calls)\n", profile->profiler_entries[ii].name, profile->profiler_entries[ii].time, 91 |
profile->profiler_entries[ii].calls); 92 | } 93 | } 94 | 95 | // Gets an individual profile entry 96 | int profiler_get_profile_entry(Profile *profile, const char *entry_name) { 97 | for (int ii = 0; ii < profile->profiler_entry_count; ++ii) { 98 | if (tealeaf_strmatch(profile->profiler_entries[ii].name, entry_name)) { 99 | return ii; 100 | } 101 | } 102 | 103 | printf("Attempted to retrieve missing profile entry %s\n", entry_name); 104 | exit(1); 105 | } 106 | -------------------------------------------------------------------------------- /driver/profiler.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __APPLE__ 4 | #include 5 | #include 6 | #else 7 | #include 8 | #endif 9 | 10 | /* 11 | * PROFILING TOOL 12 | * Not thread safe. 13 | */ 14 | 15 | #define PROFILER_MAX_NAME 128 16 | #define PROFILER_MAX_ENTRIES 2048 17 | 18 | #ifdef __cplusplus 19 | extern "C" { 20 | #endif 21 | 22 | struct ProfileEntry { 23 | int calls; 24 | double time; 25 | char name[PROFILER_MAX_NAME]; 26 | }; 27 | 28 | struct Profile { 29 | #ifdef __APPLE__ 30 | uint64_t profiler_start; 31 | uint64_t profiler_end; 32 | #else 33 | struct timespec profiler_start; 34 | struct timespec profiler_end; 35 | #endif 36 | 37 | int profiler_entry_count; 38 | ProfileEntry profiler_entries[PROFILER_MAX_ENTRIES]; 39 | }; 40 | 41 | Profile *profiler_initialise(); 42 | void profiler_finalise(Profile **profile); 43 | 44 | void profiler_start_timer(Profile *profile); 45 | void profiler_end_timer(Profile *profile, const char *entry_name); 46 | void profiler_print_simple_profile(Profile *profile); 47 | void profiler_print_full_profile(Profile *profile); 48 | int profiler_get_profile_entry(Profile *profile, const char *entry_name); 49 | 50 | #ifdef __cplusplus 51 | } 52 | #endif 53 | 54 | // Allows compile-time optimised conditional profiling 55 | #ifdef ENABLE_PROFILING 56 | 57 | #define START_PROFILING(profile) profiler_start_timer(profile) 58 | 
59 | #define STOP_PROFILING(profile, name) profiler_end_timer(profile, name) 60 | 61 | #define PRINT_PROFILING_RESULTS(profile) profiler_print_full_profile(profile) 62 | 63 | #else 64 | 65 | #define START_PROFILING(profile) \ 66 | do { \ 67 | } while (false) 68 | #define STOP_PROFILING(profile, name) \ 69 | do { \ 70 | } while (false) 71 | #define PRINT_PROFILING_RESULTS(profile) \ 72 | do { \ 73 | } while (false) 74 | 75 | #endif 76 | -------------------------------------------------------------------------------- /driver/set_chunk_data_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | #include "settings.h" 4 | 5 | // Invokes the set chunk data kernel 6 | void set_chunk_data_driver(Chunk *chunks, Settings &settings) { 7 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 8 | if (settings.kernel_language == Kernel_Language::C) { 9 | run_set_chunk_data(&(chunks[cc]), settings); 10 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 11 | // Fortran store energy kernel 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /driver/set_chunk_state_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | 4 | // Invokes the set chunk state kernel 5 | void set_chunk_state_driver(Chunk *chunks, Settings &settings, State *states) { 6 | // Issue kernel to all local chunks 7 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 8 | if (settings.kernel_language == Kernel_Language::C) { 9 | run_set_chunk_state(&(chunks[cc]), settings, states); 10 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 11 | // Fortran store energy kernel 12 | } 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /driver/settings.cpp: 
-------------------------------------------------------------------------------- 1 | #include "settings.h" 2 | #include 3 | 4 | #define MAX_CHAR_LEN 256 5 | 6 | void set_default_settings(Settings &settings) { 7 | settings.test_problem_filename = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 8 | strncpy(settings.test_problem_filename, DEF_TEST_PROBLEM_FILENAME, MAX_CHAR_LEN); 9 | 10 | settings.tea_in_filename = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 11 | strncpy(settings.tea_in_filename, DEF_TEA_IN_FILENAME, MAX_CHAR_LEN); 12 | 13 | settings.tea_out_filename = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 14 | strncpy(settings.tea_out_filename, DEF_TEA_OUT_FILENAME, MAX_CHAR_LEN); 15 | 16 | settings.tea_out_fp = nullptr; 17 | settings.grid_x_min = DEF_GRID_X_MIN; 18 | settings.grid_y_min = DEF_GRID_Y_MIN; 19 | settings.grid_x_max = DEF_GRID_X_MAX; 20 | settings.grid_y_max = DEF_GRID_Y_MAX; 21 | settings.grid_x_cells = DEF_GRID_X_CELLS; 22 | settings.grid_y_cells = DEF_GRID_Y_CELLS; 23 | settings.dt_init = DEF_DT_INIT; 24 | settings.max_iters = DEF_MAX_ITERS; 25 | settings.eps = DEF_EPS; 26 | settings.end_time = DEF_END_TIME; 27 | settings.end_step = DEF_END_STEP; 28 | settings.summary_frequency = DEF_SUMMARY_FREQUENCY; 29 | settings.solver = DEF_SOLVER; 30 | settings.staging_buffer_preference = DEF_STAGING_BUFFER; 31 | settings.model_name = ""; 32 | settings.model_kind = ModelKind::Host; 33 | settings.coefficient = DEF_COEFFICIENT; 34 | settings.error_switch = DEF_ERROR_SWITCH; 35 | settings.presteps = DEF_PRESTEPS; 36 | settings.eps_lim = DEF_EPS_LIM; 37 | settings.check_result = DEF_CHECK_RESULT; 38 | settings.ppcg_inner_steps = DEF_PPCG_INNER_STEPS; 39 | settings.preconditioner = DEF_PRECONDITIONER; 40 | settings.num_states = DEF_NUM_STATES; 41 | settings.num_chunks = DEF_NUM_CHUNKS; 42 | settings.num_chunks_per_rank = DEF_NUM_CHUNKS_PER_RANK; 43 | settings.num_ranks = DEF_NUM_RANKS; 44 | settings.halo_depth = DEF_HALO_DEPTH; 45 | settings.is_offload = 
DEF_IS_OFFLOAD; 46 | settings.kernel_profile = profiler_initialise(); 47 | settings.application_profile = profiler_initialise(); 48 | settings.wallclock_profile = profiler_initialise(); 49 | settings.fields_to_exchange = (bool *)malloc(sizeof(bool) * NUM_FIELDS); 50 | settings.solver_name = (char *)malloc(sizeof(char) * MAX_CHAR_LEN); 51 | settings.device_selector = nullptr; 52 | } 53 | 54 | // Resets all of the fields to be exchanged 55 | void reset_fields_to_exchange(Settings &settings) { 56 | for (int ii = 0; ii < NUM_FIELDS; ++ii) { 57 | settings.fields_to_exchange[ii] = false; 58 | } 59 | } 60 | 61 | // Checks if any of the fields are to be exchanged 62 | bool is_fields_to_exchange(Settings &settings) { 63 | for (int ii = 0; ii < NUM_FIELDS; ++ii) { 64 | if (settings.fields_to_exchange[ii]) { 65 | return true; 66 | } 67 | } 68 | 69 | return false; 70 | } 71 | -------------------------------------------------------------------------------- /driver/settings.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "shared.h" 4 | #include 5 | #include 6 | 7 | #define NUM_FIELDS 6 8 | 9 | // Default settings 10 | #define DEF_TEA_IN_FILENAME "tea.in" 11 | #define DEF_TEA_OUT_FILENAME "tea.out" 12 | #define DEF_TEST_PROBLEM_FILENAME "tea.problems" 13 | #define DEF_GRID_X_MIN 0.0 14 | #define DEF_GRID_Y_MIN 0.0 15 | #define DEF_GRID_Z_MIN 0.0 16 | #define DEF_GRID_X_MAX 100.0 17 | #define DEF_GRID_Y_MAX 100.0 18 | #define DEF_GRID_Z_MAX 100.0 19 | #define DEF_GRID_X_CELLS 10 20 | #define DEF_GRID_Y_CELLS 10 21 | #define DEF_GRID_Z_CELLS 10 22 | #define DEF_DT_INIT 0.1 23 | #define DEF_MAX_ITERS 10000 24 | #define DEF_EPS 1.0E-15 25 | #define DEF_END_TIME 10.0 26 | #define DEF_END_STEP INT32_MAX 27 | #define DEF_SUMMARY_FREQUENCY 10 28 | #define DEF_KERNEL_LANGUAGE C 29 | #define DEF_COEFFICIENT CONDUCTIVITY 30 | #define DEF_ERROR_SWITCH 0 31 | #define DEF_PRESTEPS 30 32 | #define DEF_EPS_LIM 1E-5 33 | 
#define DEF_CHECK_RESULT 1 34 | #define DEF_PPCG_INNER_STEPS 10 35 | #define DEF_PRECONDITIONER 0 36 | #define DEF_SOLVER Solver::CG_SOLVER 37 | #define DEF_STAGING_BUFFER StagingBuffer::AUTO 38 | #define DEF_NUM_STATES 0 39 | #define DEF_NUM_CHUNKS 1 40 | #define DEF_NUM_CHUNKS_PER_RANK 1 41 | #define DEF_NUM_RANKS 1 42 | #define DEF_HALO_DEPTH 2 43 | #define DEF_RANK 0 44 | #define DEF_IS_OFFLOAD false 45 | 46 | // The type of solver to be run 47 | enum class Solver { JACOBI_SOLVER, CG_SOLVER, CHEBY_SOLVER, PPCG_SOLVER }; 48 | 49 | // The language of the kernels to be run 50 | enum class Kernel_Language { C, FORTRAN }; 51 | 52 | enum class StagingBuffer { ENABLE, DISABLE, AUTO }; 53 | 54 | enum class ModelKind { Host, Offload, Unified }; 55 | 56 | // The main settings structure 57 | struct Settings { 58 | // Set of system-wide profiles 59 | Profile *kernel_profile; 60 | Profile *application_profile; 61 | Profile *wallclock_profile; 62 | 63 | // Log files 64 | FILE *tea_out_fp; 65 | 66 | // Solve-wide constants 67 | int rank; 68 | int end_step; 69 | int presteps; 70 | int max_iters; 71 | int coefficient; 72 | int ppcg_inner_steps; 73 | int summary_frequency; 74 | int halo_depth; 75 | int num_states; 76 | int num_chunks; 77 | int num_chunks_per_rank; 78 | int num_ranks; 79 | bool *fields_to_exchange; 80 | 81 | bool is_offload; 82 | 83 | bool error_switch; 84 | bool check_result; 85 | bool preconditioner; 86 | 87 | double eps; 88 | double dt_init; 89 | double end_time; 90 | double eps_lim; 91 | 92 | // Input-Output files 93 | char *tea_in_filename; 94 | char *tea_out_filename; 95 | char *test_problem_filename; 96 | 97 | Solver solver; 98 | char *solver_name; 99 | 100 | Kernel_Language kernel_language; 101 | char *device_selector; 102 | std::string model_name; 103 | ModelKind model_kind; 104 | StagingBuffer staging_buffer_preference; 105 | bool staging_buffer; 106 | 107 | // Field dimensions 108 | int grid_x_cells; 109 | int grid_y_cells; 110 | 111 | double 
grid_x_min; 112 | double grid_y_min; 113 | double grid_x_max; 114 | double grid_y_max; 115 | 116 | double dx; 117 | double dy; 118 | }; 119 | 120 | // The accepted types of state geometry 121 | enum class Geometry { RECTANGULAR, CIRCULAR, POINT }; 122 | 123 | // State list 124 | struct State { 125 | bool defined; 126 | double density; 127 | double energy; 128 | double x_min; 129 | double y_min; 130 | double x_max; 131 | double y_max; 132 | double radius; 133 | Geometry geometry; 134 | }; 135 | 136 | void set_default_settings(Settings &settings); 137 | void reset_fields_to_exchange(Settings &settings); 138 | bool is_fields_to_exchange(Settings &settings); 139 | -------------------------------------------------------------------------------- /driver/shared.cpp: -------------------------------------------------------------------------------- 1 | #include "shared.h" 2 | #include "comms.h" 3 | 4 | // Initialises the log file pointer 5 | void initialise_log(Settings &settings) { 6 | // Only write to log in master rank 7 | if (settings.rank != MASTER) { 8 | return; 9 | } 10 | 11 | std::printf("# Opening %s as log file.\n", settings.tea_out_filename); 12 | std::fflush(stdout); 13 | settings.tea_out_fp = std::fopen(settings.tea_out_filename, "w"); 14 | 15 | if (!settings.tea_out_fp) { 16 | die(__LINE__, __FILE__, "Could not open log %s\n", settings.tea_out_filename); 17 | } 18 | } 19 | 20 | // Prints to stdout and then logs message in log file 21 | void print_and_log(Settings &settings, const char *format, ...) 
{ 22 | // Only master rank should print 23 | if (settings.rank != MASTER) { 24 | return; 25 | } 26 | 27 | va_list arglist; 28 | va_start(arglist, format); 29 | std::vprintf(format, arglist); 30 | va_end(arglist); 31 | std::fflush(stdout); 32 | 33 | if (!settings.tea_out_fp) { 34 | die(__LINE__, __FILE__, "Attempted to write to log before it was initialised\n"); 35 | } 36 | 37 | // Obtuse, but necessary 38 | va_list arglist2; 39 | va_start(arglist2, format); 40 | std::vfprintf(settings.tea_out_fp, format, arglist2); 41 | va_end(arglist2); 42 | std::fflush(settings.tea_out_fp); 43 | } 44 | 45 | // Logs message in log file 46 | void print_to_log(Settings &settings, const char *format, ...) { 47 | // Only master rank should log 48 | if (settings.rank != MASTER) { 49 | return; 50 | } 51 | 52 | if (!settings.tea_out_fp) { 53 | die(__LINE__, __FILE__, "Attempted to write to log before it was initialised\n"); 54 | } 55 | 56 | va_list arglist; 57 | va_start(arglist, format); 58 | std::vfprintf(settings.tea_out_fp, format, arglist); 59 | va_end(arglist); 60 | std::fflush(settings.tea_out_fp); 61 | } 62 | 63 | // Plots a two-dimensional dat file. 64 | void plot_2d(int x, int y, const double *buffer, const char *name) { 65 | // Open the plot file 66 | FILE *fp = std::fopen("plot2d.dat", "wb"); 67 | if (!fp) { 68 | std::printf("Could not open plot file.\n"); 69 | } 70 | 71 | double b_sum = 0.0; 72 | 73 | for (int jj = 0; jj < y; ++jj) { 74 | for (int kk = 0; kk < x; ++kk) { 75 | double val = buffer[kk + jj * x]; 76 | std::fprintf(fp, "%d %d %.12E\n", kk, jj, val); 77 | b_sum += val; 78 | } 79 | } 80 | 81 | std::printf("%s: %.12E\n", name, b_sum); 82 | std::fclose(fp); 83 | } 84 | 85 | // Aborts the application. 86 | void die(int lineNum, const char *file, const char *format, ...) 
{ 87 | // Print location of error 88 | std::printf("\x1b[31m"); 89 | std::printf("\nError at line %d in %s:", lineNum, file); 90 | std::printf("\x1b[0m \n"); 91 | 92 | va_list arglist; 93 | va_start(arglist, format); 94 | std::vprintf(format, arglist); 95 | va_end(arglist); 96 | std::fflush(stdout); 97 | 98 | abort_comms(); 99 | } 100 | 101 | // Write out data for visualisation in visit 102 | void write_to_visit(const int nx, const int ny, const int x_off, const int y_off, const double *data, const char *name, const int step, 103 | const double time) { 104 | char bovname[256]{}; 105 | char datname[256]{}; 106 | std::sprintf(bovname, "%s%d.bov", name, step); 107 | std::sprintf(datname, "%s%d.dat", name, step); 108 | 109 | FILE *bovfp = std::fopen(bovname, "w"); 110 | 111 | if (!bovfp) { 112 | std::printf("Could not open file %s\n", bovname); 113 | std::exit(1); 114 | } 115 | 116 | std::fprintf(bovfp, "TIME: %.4f\n", time); 117 | std::fprintf(bovfp, "DATA_FILE: %s\n", datname); 118 | std::fprintf(bovfp, "DATA_SIZE: %d %d 1\n", nx, ny); 119 | std::fprintf(bovfp, "DATA_FORMAT: DOUBLE\n"); 120 | std::fprintf(bovfp, "VARIABLE: density\n"); 121 | std::fprintf(bovfp, "DATA_ENDIAN: LITTLE\n"); 122 | std::fprintf(bovfp, "CENTERING: zone\n"); 123 | std::fprintf(bovfp, "BRICK_ORIGIN: 0. 0. 
0.\n"); 124 | 125 | std::fprintf(bovfp, "BRICK_SIZE: %d %d 1\n", nx, ny); 126 | std::fclose(bovfp); 127 | 128 | FILE *datfp = std::fopen(datname, "wb"); 129 | if (!datfp) { 130 | std::printf("Could not open file %s\n", datname); 131 | std::exit(1); 132 | } 133 | 134 | std::fwrite(data, sizeof(double), nx * ny, datfp); 135 | std::fclose(datfp); 136 | } 137 | -------------------------------------------------------------------------------- /driver/shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "profiler.h" 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | 13 | struct Settings; 14 | 15 | // Shared function declarations 16 | void initialise_log(Settings &settings); 17 | void print_to_log(Settings &settings, const char *format, ...); 18 | void print_and_log(Settings &settings, const char *format, ...); 19 | void plot_2d(int x, int y, const double *buffer, const char *name); 20 | void die(int lineNum, const char *file, const char *format, ...); 21 | 22 | // Write out data for visualisation in visit 23 | void write_to_visit(int nx, int ny, int x_off, int y_off, const double *data, const char *name, int step, double time); 24 | 25 | #ifdef __cplusplus 26 | } 27 | #endif 28 | 29 | // Global constants 30 | #define MASTER 0 31 | 32 | #define NUM_FACES 4 33 | #define CHUNK_LEFT 0 34 | #define CHUNK_RIGHT 1 35 | #define CHUNK_BOTTOM 2 36 | #define CHUNK_TOP 3 37 | #define EXTERNAL_FACE -1 38 | 39 | #define FIELD_DENSITY 0 40 | #define FIELD_ENERGY0 1 41 | #define FIELD_ENERGY1 2 42 | #define FIELD_U 3 43 | #define FIELD_P 4 44 | #define FIELD_SD 5 45 | 46 | #define CONDUCTIVITY 1 47 | #define RECIP_CONDUCTIVITY 2 48 | 49 | #define CG_ITERS_FOR_EIGENVALUES 20 50 | #define ERROR_SWITCH_MAX 1.0 51 | 52 | #define tealeaf_MIN(a, b) ((a < b) ? a : b) 53 | #define tealeaf_MAX(a, b) ((a > b) ? 
a : b) 54 | #define tealeaf_strmatch(a, b) (strcmp(a, b) == 0) 55 | #define tealeaf_sign(a, b) ((b) < 0 ? -fabs(a) : fabs(a)) 56 | 57 | // Sparse Matrix Vector Product 58 | #define tealeaf_SMVP(a) \ 59 | (1.0 + (kx[index + 1] + kx[index]) + (ky[index + x] + ky[index])) * a[index] - \ 60 | (kx[index + 1] * a[index + 1] + kx[index] * a[index - 1]) - (ky[index + x] * a[index + x] + ky[index] * a[index - x]) 61 | 62 | #define GET_ARRAY_VALUE(len, buffer) \ 63 | temp = 0.0; \ 64 | for (int ii = 0; ii < len; ++ii) { \ 65 | temp += buffer[ii]; \ 66 | } \ 67 | printf("%s = %.12E\n", #buffer, temp); 68 | -------------------------------------------------------------------------------- /driver/solve_finished_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "comms.h" 3 | #include "drivers.h" 4 | #include "kernel_interface.h" 5 | 6 | // Calls all kernels that wrap up a solve regardless of solver 7 | void solve_finished_driver(Chunk *chunks, Settings &settings) { 8 | double exact_error = 0.0; 9 | 10 | if (settings.check_result) { 11 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 12 | if (settings.kernel_language == Kernel_Language::C) { 13 | run_calculate_residual(&(chunks[cc]), settings); 14 | 15 | run_calculate_2norm(&(chunks[cc]), settings, chunks[cc].r, &exact_error); 16 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 17 | } 18 | } 19 | 20 | sum_over_ranks(settings, &exact_error); 21 | } 22 | 23 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 24 | if (settings.kernel_language == Kernel_Language::C) { 25 | run_finalise(&(chunks[cc]), settings); 26 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 27 | } 28 | } 29 | 30 | settings.fields_to_exchange[FIELD_ENERGY1] = true; 31 | halo_update_driver(chunks, settings, 1); 32 | } 33 | -------------------------------------------------------------------------------- 
/driver/store_energy_driver.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kernel_interface.h" 3 | 4 | // Invokes the store energy kernel 5 | void store_energy_driver(Chunk *chunks, Settings &settings) { 6 | for (int cc = 0; cc < settings.num_chunks_per_rank; ++cc) { 7 | if (settings.kernel_language == Kernel_Language::C) { 8 | run_store_energy(&(chunks[cc]), settings); 9 | } else if (settings.kernel_language == Kernel_Language::FORTRAN) { 10 | // Fortran store energy kernel 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/cuda/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | #include "shared.h" 4 | 5 | __global__ void cheby_init(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, const double *kx, 6 | const double *ky, const double theta, double *p, double *r, double *w) { 7 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 8 | if (gid >= x_inner * y_inner) return; 9 | 10 | const int x = x_inner + 2 * halo_depth; 11 | const int col = gid % x_inner; 12 | const int row = gid / x_inner; 13 | const int off0 = halo_depth * (x + 1); 14 | const int index = off0 + col + row * x; 15 | 16 | const double smvp = tealeaf_SMVP(u); 17 | w[index] = smvp; 18 | r[index] = u0[index] - w[index]; 19 | p[index] = r[index] / theta; 20 | } 21 | 22 | __global__ void cheby_calc_u(const int x_inner, const int y_inner, const int halo_depth, const double *p, double *u) { 23 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 24 | if (gid >= x_inner * y_inner) return; 25 | 26 | const int x = x_inner + 2 * halo_depth; 27 | const int col = gid % x_inner; 28 | const int row = gid / x_inner; 29 | const int off0 = halo_depth * (x + 1); 30 | const int index = off0 + col + row * x; 31 | 32 | u[index] += 
p[index]; 33 | } 34 | 35 | __global__ void cheby_calc_p(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, 36 | const double *kx, const double *ky, const double alpha, const double beta, double *p, double *r, double *w) { 37 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 38 | if (gid >= x_inner * y_inner) return; 39 | 40 | const int x = x_inner + 2 * halo_depth; 41 | const int col = gid % x_inner; 42 | const int row = gid / x_inner; 43 | const int off0 = halo_depth * (x + 1); 44 | const int index = off0 + col + row * x; 45 | 46 | const double smvp = tealeaf_SMVP(u); 47 | w[index] = smvp; 48 | r[index] = u0[index] - w[index]; 49 | p[index] = alpha * p[index] + beta * r[index]; 50 | } 51 | 52 | // Chebyshev solver kernels 53 | void run_cheby_init(Chunk *chunk, Settings &settings) { 54 | KERNELS_START(2 * settings.halo_depth); 55 | cheby_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, chunk->theta, 56 | chunk->p, chunk->r, chunk->w); 57 | KERNELS_END(); 58 | } 59 | 60 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 61 | KERNELS_START(2 * settings.halo_depth); 62 | cheby_calc_p<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, alpha, beta, 63 | chunk->p, chunk->r, chunk->w); 64 | cheby_calc_u<<>>(x_inner, y_inner, settings.halo_depth, chunk->p, chunk->u); 65 | KERNELS_END(); 66 | } -------------------------------------------------------------------------------- /src/cuda/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | 6 | struct ChunkExtension { 7 | double *d_reduce_buffer; 8 | double *d_reduce_buffer2; 9 | double *d_reduce_buffer3; 10 | double *d_reduce_buffer4; 11 | }; 12 | 
-------------------------------------------------------------------------------- /src/cuda/cuknl_shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "shared.h" 4 | #ifndef BLOCK_SIZE 5 | #define BLOCK_SIZE 256 6 | #endif 7 | 8 | #ifdef CLOVER_MANAGED_ALLOC 9 | #define CLOVER_MEMCPY_KIND_D2H (cudaMemcpyDefault) 10 | #define CLOVER_MEMCPY_KIND_H2D (cudaMemcpyDefault) 11 | #else 12 | #define CLOVER_MEMCPY_KIND_D2H (cudaMemcpyDeviceToHost) 13 | #define CLOVER_MEMCPY_KIND_H2D (cudaMemcpyHostToDevice) 14 | #endif 15 | 16 | __device__ inline double SUM(double a, double b) { return a + b; } 17 | 18 | template class reduce { 19 | public: 20 | __device__ inline static void run(T *array, T *out, T (*func)(T, T)) { 21 | // only need to sync if not working within a warp 22 | if (offset > 16) { 23 | __syncthreads(); 24 | } 25 | 26 | // only continue if it's in the lower half 27 | if (threadIdx.x < offset) { 28 | array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]); 29 | reduce::run(array, out, func); 30 | } 31 | } 32 | }; 33 | 34 | template class reduce { 35 | public: 36 | __device__ inline static void run(T *array, T *out, T (*)(T, T)) { out[blockIdx.x] = array[0]; } 37 | }; 38 | 39 | inline void check_errors(int line_num, const char *file) { 40 | cudaDeviceSynchronize(); 41 | if (auto result = cudaGetLastError(); result != cudaSuccess) { 42 | die(line_num, file, "Error in %s - return code %d (%s)\n", file, result, cudaGetErrorName(result)); 43 | } 44 | } 45 | 46 | void sum_reduce_buffer(double *buffer, double *result, int len); 47 | 48 | #define KERNELS_START(pad) \ 49 | START_PROFILING(settings.kernel_profile); \ 50 | int x_inner = chunk->x - (pad); \ 51 | int y_inner = chunk->y - (pad); \ 52 | int num_blocks = ceil((double)(x_inner * y_inner) / double(BLOCK_SIZE)); \ 53 | do { \ 54 | } while (false) 55 | #ifdef CLOVER_SYNC_ALL_KERNELS 56 | #define KERNELS_END() \ 57 | 
check_errors(__LINE__, __FILE__); \ 58 | STOP_PROFILING(settings.kernel_profile, __func__); 59 | #else 60 | #define KERNELS_END() STOP_PROFILING(settings.kernel_profile, __func__) 61 | #endif 62 | -------------------------------------------------------------------------------- /src/cuda/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | #include "shared.h" 4 | 5 | // Core computation for Jacobi solver. 6 | __global__ void jacobi_iterate(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 7 | const double *u0, const double *r, double *u, double *error) { 8 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 9 | __shared__ double error_local[BLOCK_SIZE]; 10 | 11 | const int x = x_inner + 2 * halo_depth; 12 | const int col = gid % x_inner; 13 | const int row = gid / x_inner; 14 | const int off0 = halo_depth * (x + 1); 15 | const int index = off0 + col + row * x; 16 | 17 | if (gid < x_inner * y_inner) { 18 | u[index] = 19 | (u0[index] + kx[index + 1] * r[index + 1] + kx[index] * r[index - 1] + ky[index + x] * r[index + x] + ky[index] * r[index - x]) / 20 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 21 | 22 | error_local[threadIdx.x] = fabs(u[index] - r[index]); 23 | } else { 24 | error_local[threadIdx.x] = 0.0; 25 | } 26 | 27 | reduce::run(error_local, error, SUM); 28 | } 29 | 30 | __global__ void jacobi_init(const int x_inner, const int y_inner, const int halo_depth, const double *density, const double *energy, 31 | const double rx, const double ry, double *kx, double *ky, double *u0, double *u, const int coefficient) { 32 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 33 | if (gid >= x_inner * y_inner) return; 34 | 35 | const int x = x_inner + 2 * halo_depth; 36 | const int col = gid % x_inner; 37 | const int row = gid / x_inner; 38 | const int off0 = halo_depth * (x + 1); 39 | const int 
index = off0 + col + row * x; 40 | 41 | const double u_temp = energy[index] * density[index]; 42 | u0[index] = u_temp; 43 | u[index] = u_temp; 44 | 45 | if (row == 0 || col == 0) return; 46 | 47 | double density_center; 48 | double density_left; 49 | double density_down; 50 | 51 | if (coefficient == CONDUCTIVITY) { 52 | density_center = density[index]; 53 | density_left = density[index - 1]; 54 | density_down = density[index - x]; 55 | } else if (coefficient == RECIP_CONDUCTIVITY) { 56 | density_center = 1.0 / density[index]; 57 | density_left = 1.0 / density[index - 1]; 58 | density_down = 1.0 / density[index - x]; 59 | } 60 | 61 | kx[index] = rx * (density_left + density_center) / (2.0 * density_left * density_center); 62 | ky[index] = ry * (density_down + density_center) / (2.0 * density_down * density_center); 63 | } 64 | 65 | __global__ void jacobi_copy_u(const int x_inner, const int y_inner, const double *src, double *dest) { 66 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 67 | 68 | if (gid < x_inner * y_inner) { 69 | dest[gid] = src[gid]; 70 | } 71 | } 72 | 73 | // Jacobi solver kernels 74 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 75 | KERNELS_START(2 * settings.halo_depth); 76 | jacobi_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->density, chunk->energy, rx, ry, chunk->kx, 77 | chunk->ky, chunk->u0, chunk->u, settings.coefficient); 78 | KERNELS_END(); 79 | } 80 | 81 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 82 | KERNELS_START(2 * settings.halo_depth); 83 | jacobi_iterate<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->u0, chunk->r, chunk->u, 84 | chunk->ext->d_reduce_buffer); 85 | sum_reduce_buffer(chunk->ext->d_reduce_buffer, error, num_blocks); 86 | KERNELS_END(); 87 | } 88 | -------------------------------------------------------------------------------- /src/cuda/local_halos.cpp: 
-------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | #include "shared.h" 4 | 5 | __global__ void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 6 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 7 | if (gid >= x * depth) return; 8 | 9 | const int lines = gid / x; 10 | const int offset = x * halo_depth; 11 | const int from_index = offset + gid; 12 | const int to_index = from_index - (1 + lines * 2) * x; 13 | buffer[to_index] = buffer[from_index]; 14 | } 15 | 16 | __global__ void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 17 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 18 | if (gid >= x * depth) return; 19 | 20 | const int lines = gid / x; 21 | const int offset = x * (y - halo_depth); 22 | const int to_index = offset + gid; 23 | const int from_index = to_index - (1 + lines * 2) * x; 24 | buffer[to_index] = buffer[from_index]; 25 | } 26 | 27 | __global__ void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 28 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 29 | if (gid >= y * depth) return; 30 | 31 | const int flip = gid % depth; 32 | const int lines = gid / depth; 33 | const int offset = halo_depth + lines * (x - depth); 34 | const int from_index = offset + gid; 35 | const int to_index = from_index - (1 + flip * 2); 36 | 37 | buffer[to_index] = buffer[from_index]; 38 | } 39 | 40 | __global__ void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 41 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 42 | if (gid >= y * depth) return; 43 | 44 | const int flip = gid % depth; 45 | const int lines = gid / depth; 46 | const int offset = x - halo_depth + lines * (x - depth); 47 | const int to_index = offset + gid; 48 | const int from_index = to_index - (1 + flip * 
2); 49 | 50 | buffer[to_index] = buffer[from_index]; 51 | } 52 | 53 | // Updates faces in turn. 54 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer) { 55 | int num_blocks = std::ceil((x * depth) / (double)BLOCK_SIZE); 56 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 57 | update_top<<>>(x, y, halo_depth, depth, buffer); 58 | check_errors(__LINE__, __FILE__); 59 | } 60 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 61 | update_bottom<<>>(x, y, halo_depth, depth, buffer); 62 | check_errors(__LINE__, __FILE__); 63 | } 64 | 65 | num_blocks = std::ceil((y * depth) / (float)BLOCK_SIZE); 66 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 67 | update_right<<>>(x, y, halo_depth, depth, buffer); 68 | check_errors(__LINE__, __FILE__); 69 | } 70 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 71 | update_left<<>>(x, y, halo_depth, depth, buffer); 72 | check_errors(__LINE__, __FILE__); 73 | } 74 | } 75 | 76 | // The kernel for updating halos locally 77 | void local_halos(const int x, const int y, const int halo_depth, const int depth, const int *chunk_neighbours, 78 | const bool *fields_to_exchange, double *density, double *energy0, double *energy, double *u, double *p, double *sd) { 79 | if (fields_to_exchange[FIELD_DENSITY]) { 80 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 81 | } 82 | if (fields_to_exchange[FIELD_P]) { 83 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 84 | } 85 | if (fields_to_exchange[FIELD_ENERGY0]) { 86 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 87 | } 88 | if (fields_to_exchange[FIELD_ENERGY1]) { 89 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 90 | } 91 | if (fields_to_exchange[FIELD_U]) { 92 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 93 | } 94 | if (fields_to_exchange[FIELD_SD]) { 95 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 96 
| } 97 | } 98 | 99 | // Solver-wide kernels 100 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 101 | START_PROFILING(settings.kernel_profile); 102 | 103 | local_halos(chunk->x, chunk->y, settings.halo_depth, depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 104 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd); 105 | 106 | STOP_PROFILING(settings.kernel_profile, __func__); 107 | } 108 | -------------------------------------------------------------------------------- /src/cuda/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection, this is used for host compilation" 4 | "c++") 5 | 6 | register_flag_required(CMAKE_CUDA_COMPILER 7 | "Path to the CUDA nvcc compiler") 8 | 9 | # XXX we may want to drop this eventually and use CMAKE_CUDA_ARCHITECTURES directly 10 | register_flag_required(CUDA_ARCH 11 | "Nvidia architecture, will be passed in via `-arch=` (e.g `sm_70`) for nvcc") 12 | 13 | register_flag_optional(CUDA_EXTRA_FLAGS 14 | "Additional CUDA flags passed to nvcc, this is appended after `CUDA_ARCH`" 15 | "") 16 | 17 | 18 | register_flag_optional(MANAGED_ALLOC "Use UVM (cudaMallocManaged) instead of the device-only allocation (cudaMalloc)" 19 | "OFF") 20 | 21 | register_flag_optional(SYNC_ALL_KERNELS 22 | "Fully synchronise all kernels after launch, this also enables synchronous error checking with line and file name" 23 | "OFF") 24 | 25 | 26 | macro(setup) 27 | 28 | # XXX CMake 3.18 supports CMAKE_CUDA_ARCHITECTURES/CUDA_ARCHITECTURES but we support older CMakes 29 | if (POLICY CMP0104) 30 | cmake_policy(SET CMP0104 OLD) 31 | endif () 32 | 33 | set(CMAKE_CXX_STANDARD 17) 34 | enable_language(CUDA) 35 | 36 | # add -forward-unknown-to-host-compiler for compatibility reasons 37 | # add -std=c++17 manually as older CMake seems to omit this (source gets treated as C 
otherwise) 38 | set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -std=c++17 -forward-unknown-to-host-compiler -arch=${CUDA_ARCH} -use_fast_math -restrict -keep ${CUDA_EXTRA_FLAGS}") 39 | 40 | # CMake defaults to -O2 for CUDA at Release, let's wipe that and use the global RELEASE_FLAG 41 | # appended later 42 | wipe_gcc_style_optimisation_flags(CMAKE_CUDA_FLAGS_${BUILD_TYPE}) 43 | 44 | if (MANAGED_ALLOC) 45 | register_definitions(CLOVER_MANAGED_ALLOC) 46 | endif () 47 | 48 | if (SYNC_ALL_KERNELS) 49 | register_definitions(CLOVER_SYNC_ALL_KERNELS) 50 | endif () 51 | 52 | message(STATUS "NVCC flags: ${CMAKE_CUDA_FLAGS} ${CMAKE_CUDA_FLAGS_${BUILD_TYPE}}") 53 | endmacro() 54 | 55 | 56 | macro(setup_target NAME) 57 | # Treat everything as CUDA source 58 | get_target_property(PROJECT_SRC "${NAME}" SOURCES) 59 | foreach (SRC ${PROJECT_SRC}) 60 | set_source_files_properties("${SRC}" PROPERTIES LANGUAGE CUDA) 61 | endforeach () 62 | endmacro() 63 | -------------------------------------------------------------------------------- /src/cuda/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "cuknl_shared.h" 3 | 4 | __global__ void ppcg_init(const int x_inner, const int y_inner, const int halo_depth, const double theta, const double *r, double *sd) { 5 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 6 | if (gid >= x_inner * y_inner) return; 7 | 8 | const int x = x_inner + 2 * halo_depth; 9 | const int col = gid % x_inner; 10 | const int row = gid / x_inner; 11 | const int off0 = halo_depth * (x + 1); 12 | const int index = off0 + col + row * x; 13 | 14 | sd[index] = r[index] / theta; 15 | } 16 | 17 | __global__ void ppcg_calc_ur(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 18 | const double *sd, double *u, double *r) { 19 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 20 | if (gid >= x_inner * y_inner) return; 21 | 22 | const int x = x_inner 
+ 2 * halo_depth; 23 | const int col = gid % x_inner; 24 | const int row = gid / x_inner; 25 | const int off0 = halo_depth * (x + 1); 26 | const int index = off0 + col + row * x; 27 | 28 | const double smvp = (1.0 + (kx[index + 1] + kx[index]) + (ky[index + x] + ky[index])) * sd[index] - 29 | (kx[index + 1] * sd[index + 1] + kx[index] * sd[index - 1]) - 30 | (ky[index + x] * sd[index + x] + ky[index] * sd[index - x]); 31 | 32 | r[index] -= smvp; 33 | u[index] += sd[index]; 34 | } 35 | 36 | __global__ void ppcg_calc_sd(const int x_inner, const int y_inner, const int halo_depth, const double alpha, const double beta, 37 | const double *r, double *sd) { 38 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 39 | if (gid >= x_inner * y_inner) return; 40 | 41 | const int x = x_inner + 2 * halo_depth; 42 | const int col = gid % x_inner; 43 | const int row = gid / x_inner; 44 | const int off0 = halo_depth * (x + 1); 45 | const int index = off0 + col + row * x; 46 | 47 | sd[index] = alpha * sd[index] + beta * r[index]; 48 | } 49 | 50 | // PPCG solver kernels 51 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 52 | KERNELS_START(2 * settings.halo_depth); 53 | ppcg_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 54 | KERNELS_END(); 55 | } 56 | 57 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 58 | KERNELS_START(2 * settings.halo_depth); 59 | ppcg_calc_ur<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->sd, chunk->u, chunk->r); 60 | ppcg_calc_sd<<>>(x_inner, y_inner, settings.halo_depth, alpha, beta, chunk->r, chunk->sd); 61 | KERNELS_END(); 62 | } -------------------------------------------------------------------------------- /src/hip/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | #include "shared.h" 6 | 7 | __global__ 
void cheby_init(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, const double *kx, 8 | const double *ky, const double theta, double *p, double *r, double *w) { 9 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 10 | if (gid >= x_inner * y_inner) return; 11 | 12 | const int x = x_inner + 2 * halo_depth; 13 | const int col = gid % x_inner; 14 | const int row = gid / x_inner; 15 | const int off0 = halo_depth * (x + 1); 16 | const int index = off0 + col + row * x; 17 | 18 | const double smvp = tealeaf_SMVP(u); 19 | w[index] = smvp; 20 | r[index] = u0[index] - w[index]; 21 | p[index] = r[index] / theta; 22 | } 23 | 24 | __global__ void cheby_calc_u(const int x_inner, const int y_inner, const int halo_depth, const double *p, double *u) { 25 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 26 | if (gid >= x_inner * y_inner) return; 27 | 28 | const int x = x_inner + 2 * halo_depth; 29 | const int col = gid % x_inner; 30 | const int row = gid / x_inner; 31 | const int off0 = halo_depth * (x + 1); 32 | const int index = off0 + col + row * x; 33 | 34 | u[index] += p[index]; 35 | } 36 | 37 | __global__ void cheby_calc_p(const int x_inner, const int y_inner, const int halo_depth, const double *u, const double *u0, 38 | const double *kx, const double *ky, const double alpha, const double beta, double *p, double *r, double *w) { 39 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 40 | if (gid >= x_inner * y_inner) return; 41 | 42 | const int x = x_inner + 2 * halo_depth; 43 | const int col = gid % x_inner; 44 | const int row = gid / x_inner; 45 | const int off0 = halo_depth * (x + 1); 46 | const int index = off0 + col + row * x; 47 | 48 | const double smvp = tealeaf_SMVP(u); 49 | w[index] = smvp; 50 | r[index] = u0[index] - w[index]; 51 | p[index] = alpha * p[index] + beta * r[index]; 52 | } 53 | 54 | // Chebyshev solver kernels 55 | void run_cheby_init(Chunk *chunk, Settings &settings) { 56 | KERNELS_START(2 * 
settings.halo_depth); 57 | cheby_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, chunk->theta, 58 | chunk->p, chunk->r, chunk->w); 59 | KERNELS_END(); 60 | } 61 | 62 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 63 | KERNELS_START(2 * settings.halo_depth); 64 | cheby_calc_p<<>>(x_inner, y_inner, settings.halo_depth, chunk->u, chunk->u0, chunk->kx, chunk->ky, alpha, beta, 65 | chunk->p, chunk->r, chunk->w); 66 | cheby_calc_u<<>>(x_inner, y_inner, settings.halo_depth, chunk->p, chunk->u); 67 | KERNELS_END(); 68 | } -------------------------------------------------------------------------------- /src/hip/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | 6 | struct ChunkExtension { 7 | double *d_reduce_buffer; 8 | double *d_reduce_buffer2; 9 | double *d_reduce_buffer3; 10 | double *d_reduce_buffer4; 11 | }; 12 | -------------------------------------------------------------------------------- /src/hip/cuknl_shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "hip/hip_runtime.h" 4 | 5 | #include "shared.h" 6 | #ifndef BLOCK_SIZE 7 | #define BLOCK_SIZE 1024 // XXX anything less than 512 would break reduction 8 | #endif 9 | 10 | #ifdef CLOVER_MANAGED_ALLOC 11 | #define CLOVER_MEMCPY_KIND_D2H (hipMemcpyDefault) 12 | #define CLOVER_MEMCPY_KIND_H2D (hipMemcpyDefault) 13 | #else 14 | #define CLOVER_MEMCPY_KIND_D2H (hipMemcpyDeviceToHost) 15 | #define CLOVER_MEMCPY_KIND_H2D (hipMemcpyHostToDevice) 16 | #endif 17 | 18 | __device__ inline double SUM(double a, double b) { return a + b; } 19 | 20 | template class reduce { 21 | public: 22 | __device__ inline static void run(T *array, T *out, T (*func)(T, T)) { 23 | __syncthreads(); // don't optimise for sub-warp, always sync 24 
| // only continue if it's in the lower half 25 | if (threadIdx.x < offset) { 26 | array[threadIdx.x] = func(array[threadIdx.x], array[threadIdx.x + offset]); 27 | reduce::run(array, out, func); 28 | } 29 | } 30 | }; 31 | 32 | template class reduce { 33 | public: 34 | __device__ inline static void run(T *array, T *out, T (*)(T, T)) { out[blockIdx.x] = array[0]; } 35 | }; 36 | 37 | inline void check_errors(int line_num, const char *file) { 38 | hipDeviceSynchronize(); 39 | if (auto result = hipGetLastError(); result != hipSuccess) { 40 | die(line_num, file, "Error in %s - return code %d (%s)\n", file, result, hipGetErrorName(result)); 41 | } 42 | } 43 | 44 | void sum_reduce_buffer(double *buffer, double *result, int len); 45 | 46 | #define KERNELS_START(pad) \ 47 | START_PROFILING(settings.kernel_profile); \ 48 | int x_inner = chunk->x - (pad); \ 49 | int y_inner = chunk->y - (pad); \ 50 | int num_blocks = ceil((double)(x_inner * y_inner) / double(BLOCK_SIZE)); \ 51 | do { \ 52 | } while (false) 53 | #ifdef CLOVER_SYNC_ALL_KERNELS 54 | #define KERNELS_END() \ 55 | check_errors(__LINE__, __FILE__); \ 56 | STOP_PROFILING(settings.kernel_profile, __func__); 57 | #else 58 | #define KERNELS_END() STOP_PROFILING(settings.kernel_profile, __func__) 59 | #endif 60 | -------------------------------------------------------------------------------- /src/hip/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | #include "shared.h" 6 | 7 | // Core computation for Jacobi solver. 
8 | __global__ void jacobi_iterate(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 9 | const double *u0, const double *r, double *u, double *error) { 10 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 11 | __shared__ double error_local[BLOCK_SIZE]; 12 | 13 | const int x = x_inner + 2 * halo_depth; 14 | const int col = gid % x_inner; 15 | const int row = gid / x_inner; 16 | const int off0 = halo_depth * (x + 1); 17 | const int index = off0 + col + row * x; 18 | 19 | if (gid < x_inner * y_inner) { 20 | u[index] = 21 | (u0[index] + kx[index + 1] * r[index + 1] + kx[index] * r[index - 1] + ky[index + x] * r[index + x] + ky[index] * r[index - x]) / 22 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 23 | 24 | error_local[threadIdx.x] = fabs(u[index] - r[index]); 25 | } else { 26 | error_local[threadIdx.x] = 0.0; 27 | } 28 | 29 | reduce::run(error_local, error, SUM); 30 | } 31 | 32 | __global__ void jacobi_init(const int x_inner, const int y_inner, const int halo_depth, const double *density, const double *energy, 33 | const double rx, const double ry, double *kx, double *ky, double *u0, double *u, const int coefficient) { 34 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 35 | if (gid >= x_inner * y_inner) return; 36 | 37 | const int x = x_inner + 2 * halo_depth; 38 | const int col = gid % x_inner; 39 | const int row = gid / x_inner; 40 | const int off0 = halo_depth * (x + 1); 41 | const int index = off0 + col + row * x; 42 | 43 | const double u_temp = energy[index] * density[index]; 44 | u0[index] = u_temp; 45 | u[index] = u_temp; 46 | 47 | if (row == 0 || col == 0) return; 48 | 49 | double density_center; 50 | double density_left; 51 | double density_down; 52 | 53 | if (coefficient == CONDUCTIVITY) { 54 | density_center = density[index]; 55 | density_left = density[index - 1]; 56 | density_down = density[index - x]; 57 | } else if (coefficient == RECIP_CONDUCTIVITY) { 58 | 
density_center = 1.0 / density[index]; 59 | density_left = 1.0 / density[index - 1]; 60 | density_down = 1.0 / density[index - x]; 61 | } 62 | 63 | kx[index] = rx * (density_left + density_center) / (2.0 * density_left * density_center); 64 | ky[index] = ry * (density_down + density_center) / (2.0 * density_down * density_center); 65 | } 66 | 67 | __global__ void jacobi_copy_u(const int x_inner, const int y_inner, const double *src, double *dest) { 68 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 69 | 70 | if (gid < x_inner * y_inner) { 71 | dest[gid] = src[gid]; 72 | } 73 | } 74 | 75 | // Jacobi solver kernels 76 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 77 | KERNELS_START(2 * settings.halo_depth); 78 | jacobi_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->density, chunk->energy, rx, ry, chunk->kx, 79 | chunk->ky, chunk->u0, chunk->u, settings.coefficient); 80 | KERNELS_END(); 81 | } 82 | 83 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 84 | KERNELS_START(2 * settings.halo_depth); 85 | jacobi_iterate<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->u0, chunk->r, chunk->u, 86 | chunk->ext->d_reduce_buffer); 87 | sum_reduce_buffer(chunk->ext->d_reduce_buffer, error, num_blocks); 88 | KERNELS_END(); 89 | } 90 | -------------------------------------------------------------------------------- /src/hip/local_halos.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | #include "shared.h" 6 | 7 | __global__ void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 8 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 9 | if (gid >= x * depth) return; 10 | 11 | const int lines = gid / x; 12 | const int offset = x * halo_depth; 13 | const int from_index = offset + gid; 14 | const int to_index 
= from_index - (1 + lines * 2) * x; 15 | buffer[to_index] = buffer[from_index]; 16 | } 17 | 18 | __global__ void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 19 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 20 | if (gid >= x * depth) return; 21 | 22 | const int lines = gid / x; 23 | const int offset = x * (y - halo_depth); 24 | const int to_index = offset + gid; 25 | const int from_index = to_index - (1 + lines * 2) * x; 26 | buffer[to_index] = buffer[from_index]; 27 | } 28 | 29 | __global__ void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 30 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 31 | if (gid >= y * depth) return; 32 | 33 | const int flip = gid % depth; 34 | const int lines = gid / depth; 35 | const int offset = halo_depth + lines * (x - depth); 36 | const int from_index = offset + gid; 37 | const int to_index = from_index - (1 + flip * 2); 38 | 39 | buffer[to_index] = buffer[from_index]; 40 | } 41 | 42 | __global__ void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 43 | const int gid = threadIdx.x + blockDim.x * blockIdx.x; 44 | if (gid >= y * depth) return; 45 | 46 | const int flip = gid % depth; 47 | const int lines = gid / depth; 48 | const int offset = x - halo_depth + lines * (x - depth); 49 | const int to_index = offset + gid; 50 | const int from_index = to_index - (1 + flip * 2); 51 | 52 | buffer[to_index] = buffer[from_index]; 53 | } 54 | 55 | // Updates faces in turn. 
56 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer) { 57 | int num_blocks = std::ceil((x * depth) / (double)BLOCK_SIZE); 58 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 59 | update_top<<>>(x, y, halo_depth, depth, buffer); 60 | check_errors(__LINE__, __FILE__); 61 | } 62 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 63 | update_bottom<<>>(x, y, halo_depth, depth, buffer); 64 | check_errors(__LINE__, __FILE__); 65 | } 66 | 67 | num_blocks = std::ceil((y * depth) / (float)BLOCK_SIZE); 68 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 69 | update_right<<>>(x, y, halo_depth, depth, buffer); 70 | check_errors(__LINE__, __FILE__); 71 | } 72 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 73 | update_left<<>>(x, y, halo_depth, depth, buffer); 74 | check_errors(__LINE__, __FILE__); 75 | } 76 | } 77 | 78 | // The kernel for updating halos locally 79 | void local_halos(const int x, const int y, const int halo_depth, const int depth, const int *chunk_neighbours, 80 | const bool *fields_to_exchange, double *density, double *energy0, double *energy, double *u, double *p, double *sd) { 81 | if (fields_to_exchange[FIELD_DENSITY]) { 82 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 83 | } 84 | if (fields_to_exchange[FIELD_P]) { 85 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 86 | } 87 | if (fields_to_exchange[FIELD_ENERGY0]) { 88 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 89 | } 90 | if (fields_to_exchange[FIELD_ENERGY1]) { 91 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 92 | } 93 | if (fields_to_exchange[FIELD_U]) { 94 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 95 | } 96 | if (fields_to_exchange[FIELD_SD]) { 97 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 98 | } 99 | } 100 | 101 | // Solver-wide kernels 102 | void run_local_halos(Chunk *chunk, Settings 
&settings, int depth) { 103 | START_PROFILING(settings.kernel_profile); 104 | 105 | local_halos(chunk->x, chunk->y, settings.halo_depth, depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 106 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd); 107 | 108 | STOP_PROFILING(settings.kernel_profile, __func__); 109 | } 110 | -------------------------------------------------------------------------------- /src/hip/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_required(CMAKE_CXX_COMPILER 3 | "Absolute path to the AMD HIP C++ compiler") 4 | 5 | register_flag_optional(MANAGED_ALLOC "Use UVM (hipMallocManaged) instead of the device-only allocation (hipMalloc)" 6 | "OFF") 7 | 8 | register_flag_optional(SYNC_ALL_KERNELS 9 | "Fully synchronise all kernels after launch, this also enables synchronous error checking with line and file name" 10 | "OFF") 11 | 12 | 13 | macro(setup) 14 | 15 | set(CMAKE_CXX_STANDARD 17) 16 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 17 | 18 | if (MANAGED_ALLOC) 19 | register_definitions(CLOVER_MANAGED_ALLOC) 20 | endif () 21 | 22 | if (SYNC_ALL_KERNELS) 23 | register_definitions(CLOVER_SYNC_ALL_KERNELS) 24 | endif () 25 | 26 | 27 | endmacro() -------------------------------------------------------------------------------- /src/hip/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "hip/hip_runtime.h" 2 | 3 | #include "chunk.h" 4 | #include "cuknl_shared.h" 5 | 6 | __global__ void ppcg_init(const int x_inner, const int y_inner, const int halo_depth, const double theta, const double *r, double *sd) { 7 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 8 | if (gid >= x_inner * y_inner) return; 9 | 10 | const int x = x_inner + 2 * halo_depth; 11 | const int col = gid % x_inner; 12 | const int row = gid / x_inner; 13 | const int off0 = halo_depth * (x + 1); 14 | const int index = off0 + col + row * 
x; 15 | 16 | sd[index] = r[index] / theta; 17 | } 18 | 19 | __global__ void ppcg_calc_ur(const int x_inner, const int y_inner, const int halo_depth, const double *kx, const double *ky, 20 | const double *sd, double *u, double *r) { 21 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 22 | if (gid >= x_inner * y_inner) return; 23 | 24 | const int x = x_inner + 2 * halo_depth; 25 | const int col = gid % x_inner; 26 | const int row = gid / x_inner; 27 | const int off0 = halo_depth * (x + 1); 28 | const int index = off0 + col + row * x; 29 | 30 | const double smvp = (1.0 + (kx[index + 1] + kx[index]) + (ky[index + x] + ky[index])) * sd[index] - 31 | (kx[index + 1] * sd[index + 1] + kx[index] * sd[index - 1]) - 32 | (ky[index + x] * sd[index + x] + ky[index] * sd[index - x]); 33 | 34 | r[index] -= smvp; 35 | u[index] += sd[index]; 36 | } 37 | 38 | __global__ void ppcg_calc_sd(const int x_inner, const int y_inner, const int halo_depth, const double alpha, const double beta, 39 | const double *r, double *sd) { 40 | const int gid = threadIdx.x + blockIdx.x * blockDim.x; 41 | if (gid >= x_inner * y_inner) return; 42 | 43 | const int x = x_inner + 2 * halo_depth; 44 | const int col = gid % x_inner; 45 | const int row = gid / x_inner; 46 | const int off0 = halo_depth * (x + 1); 47 | const int index = off0 + col + row * x; 48 | 49 | sd[index] = alpha * sd[index] + beta * r[index]; 50 | } 51 | 52 | // PPCG solver kernels 53 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 54 | KERNELS_START(2 * settings.halo_depth); 55 | ppcg_init<<>>(x_inner, y_inner, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 56 | KERNELS_END(); 57 | } 58 | 59 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 60 | KERNELS_START(2 * settings.halo_depth); 61 | ppcg_calc_ur<<>>(x_inner, y_inner, settings.halo_depth, chunk->kx, chunk->ky, chunk->sd, chunk->u, chunk->r); 62 | ppcg_calc_sd<<>>(x_inner, y_inner, settings.halo_depth, alpha, 
beta, chunk->r, chunk->sd); 63 | KERNELS_END(); 64 | } -------------------------------------------------------------------------------- /src/kokkos/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Initialises the Chebyshev solver 6 | void cheby_init(const int x, const int y, const int halo_depth, const double theta, KView &p, KView &r, KView &u, KView &u0, KView &w, 7 | KView &kx, KView &ky) { 8 | Kokkos::parallel_for( 9 | x * y, KOKKOS_LAMBDA(const int index) { 10 | const int kk = index % x; 11 | const int jj = index / x; 12 | 13 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 14 | const double smvp = tealeaf_SMVP(u); 15 | w[index] = smvp; 16 | r[index] = u0[index] - w[index]; 17 | p[index] = r[index] / theta; 18 | } 19 | }); 20 | } 21 | 22 | // Calculates U 23 | void cheby_calc_u(const int x, const int y, const int halo_depth, KView &p, KView &u) { 24 | Kokkos::parallel_for( 25 | x * y, KOKKOS_LAMBDA(const int index) { 26 | const int kk = index % x; 27 | const int jj = index / x; 28 | 29 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 30 | u[index] += p[index]; 31 | } 32 | }); 33 | } 34 | 35 | // The main Cheby iteration step 36 | void cheby_iterate(const int x, const int y, const int halo_depth, const double alpha, const double beta, KView &p, KView &r, KView &u, 37 | KView &u0, KView &w, KView &kx, KView &ky) { 38 | Kokkos::parallel_for( 39 | x * y, KOKKOS_LAMBDA(const int index) { 40 | const int kk = index % x; 41 | const int jj = index / x; 42 | 43 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 44 | const double smvp = tealeaf_SMVP(u); 45 | w[index] = smvp; 46 | r[index] = u0[index] - w[index]; 47 | p[index] = alpha * p[index] + beta * r[index]; 48 | } 49 | }); 50 | } 51 | 52 | // Chebyshev solver 
kernels 53 | void run_cheby_init(Chunk *chunk, Settings &settings) { 54 | START_PROFILING(settings.kernel_profile); 55 | 56 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, *chunk->p, *chunk->r, *chunk->u, *chunk->u0, *chunk->w, *chunk->kx, 57 | *chunk->ky); 58 | 59 | STOP_PROFILING(settings.kernel_profile, __func__); 60 | } 61 | 62 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 63 | START_PROFILING(settings.kernel_profile); 64 | 65 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, *chunk->p, *chunk->r, *chunk->u, *chunk->u0, *chunk->w, *chunk->kx, 66 | *chunk->ky); 67 | 68 | cheby_calc_u(chunk->x, chunk->y, settings.halo_depth, *chunk->p, *chunk->u); 69 | 70 | STOP_PROFILING(settings.kernel_profile, __func__); 71 | } -------------------------------------------------------------------------------- /src/kokkos/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kokkos_shared.hpp" 4 | #include 5 | 6 | using FieldBufferType = Kokkos::View *; 7 | using StagingBufferType = Kokkos::View::HostMirror *; 8 | struct ChunkExtension {}; 9 | -------------------------------------------------------------------------------- /src/kokkos/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Initialises the Jacobi solver 6 | void jacobi_init(const int x, const int y, const int halo_depth, const int coefficient, const double rx, const double ry, KView &u, 7 | KView &u0, KView &density, KView &energy, KView &kx, KView &ky) { 8 | Kokkos::parallel_for( 9 | x * y, KOKKOS_LAMBDA(const int index) { 10 | const int kk = index % x; 11 | const int jj = index / x; 12 | 13 | if (kk > 0 && kk < x - 1 && jj > 0 && jj < y - 1) { 14 | u0(index) = energy(index) * density(index); 15 | u(index) = u0(index); 16 | } 
17 | 18 | if (jj >= halo_depth && jj < y - 1 && kk >= halo_depth && kk < x - 1) { 19 | double densityCentre = (coefficient == CONDUCTIVITY) ? density(index) : 1.0 / density(index); 20 | double densityLeft = (coefficient == CONDUCTIVITY) ? density(index - 1) : 1.0 / density(index - 1); 21 | double densityDown = (coefficient == CONDUCTIVITY) ? density(index - x) : 1.0 / density(index - x); 22 | 23 | kx(index) = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 24 | ky(index) = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 25 | } 26 | }); 27 | } 28 | 29 | // Main Jacobi solver method. 30 | void jacobi_iterate(const int x, const int y, const int halo_depth, KView &u, KView &u0, KView &r, KView &kx, KView &ky, double *error) { 31 | Kokkos::parallel_reduce( 32 | x * y, 33 | KOKKOS_LAMBDA(const int index, double &temp_error) { 34 | const int kk = index % x; 35 | const int jj = index / x; 36 | 37 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 38 | u(index) = (u0(index) + (kx(index + 1) * r(index + 1) + kx(index) * r(index - 1)) + 39 | (ky(index + x) * r(index + x) + ky(index) * r(index - x))) / 40 | (1.0 + (kx(index) + kx(index + 1)) + (ky(index) + ky(index + x))); 41 | 42 | temp_error += Kokkos::fabs(u(index) - r(index)); 43 | } 44 | }, 45 | *error); 46 | } 47 | 48 | // Copies u into r 49 | void jacobi_copy_u(const int x, const int y, KView &r, KView &u) { 50 | Kokkos::parallel_for( 51 | x * y, KOKKOS_LAMBDA(const int index) { r(index) = u(index); }); 52 | } 53 | 54 | // Jacobi solver kernels 55 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 56 | START_PROFILING(settings.kernel_profile); 57 | 58 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, *chunk->u, *chunk->u0, *chunk->density, *chunk->energy, 59 | *chunk->kx, *chunk->ky); 60 | 61 | STOP_PROFILING(settings.kernel_profile, __func__); 62 | } 63 | 64 | 
// Runs one Jacobi step: snapshot u into r, then relax u against it.
void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) {
  START_PROFILING(settings.kernel_profile);

  jacobi_copy_u(chunk->x, chunk->y, *chunk->r, *chunk->u);

  jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->u0, *chunk->r, *chunk->kx, *chunk->ky, error);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// ---------------------------------------------------------------------------
// src/kokkos/kokkos_shared.hpp
// ---------------------------------------------------------------------------
#pragma once
#include // NOTE(review): include target lost in this dump (angle brackets stripped) — presumably <Kokkos_Core.hpp>; confirm upstream.
// NOTE(review): template arguments also stripped; presumably Kokkos::View<double *>.
using KView = Kokkos::View;

// ---------------------------------------------------------------------------
// src/kokkos/local_halos.cpp
// ---------------------------------------------------------------------------
#include "chunk.h"
#include "kokkos_shared.hpp"
#include "shared.h"

// Updates the local left halo region(s).
// For each of the y rows, mirrors interior columns [halo_depth, halo_depth+depth)
// into halo columns [halo_depth-depth, halo_depth) — a reflection about the
// halo/interior boundary.
void update_left(const int x, const int y, const int depth, const int halo_depth, KView &buffer) {
  Kokkos::parallel_for(
      y * depth, KOKKOS_LAMBDA(const int index) {
        const int flip = index % depth;  // position within the depth-wide band
        const int lines = index / depth; // row
        const int offset = lines * (x - depth);
        // to_index == lines*x + (halo_depth - depth) + flip
        const int to_index = offset + halo_depth - depth + index;
        // from_index mirrors to column halo_depth + (depth - flip) - 1
        const int from_index = to_index + 2 * (depth - flip) - 1;
        buffer(to_index) = buffer(from_index);
      });
}

// Updates the local right halo region(s).
// For each row, mirrors interior columns [x-halo_depth-depth, x-halo_depth)
// into halo columns [x-halo_depth, x-halo_depth+depth).
void update_right(const int x, const int y, const int depth, const int halo_depth, KView &buffer) {
  Kokkos::parallel_for(
      y * depth, KOKKOS_LAMBDA(const int index) {
        const int flip = index % depth;
        const int lines = index / depth;
        const int offset = x - halo_depth + lines * (x - depth);
        // to_index == lines*x + (x - halo_depth) + flip
        const int to_index = offset + index;
        // mirrored interior column: (x - halo_depth) - flip - 1
        const int from_index = to_index - (1 + flip * 2);
        buffer(to_index) = buffer(from_index);
      });
}
the local top halo region(s) 32 | void update_top(const int x, const int y, const int depth, const int halo_depth, KView &buffer) { 33 | Kokkos::parallel_for( 34 | x * depth, KOKKOS_LAMBDA(const int index) { 35 | const int lines = index / x; 36 | const int offset = x * (y - halo_depth); 37 | 38 | const int to_index = offset + index; 39 | const int from_index = to_index - (1 + lines * 2) * x; 40 | buffer(to_index) = buffer(from_index); 41 | }); 42 | } 43 | 44 | // Updates the local bottom halo region(s) 45 | void update_bottom(const int x, const int y, const int depth, const int halo_depth, KView &buffer) { 46 | Kokkos::parallel_for( 47 | x * depth, KOKKOS_LAMBDA(const int index) { 48 | const int lines = index / x; 49 | const int offset = x * halo_depth; 50 | 51 | const int from_index = offset + index; 52 | const int to_index = from_index - (1 + lines * 2) * x; 53 | buffer(to_index) = buffer(from_index); 54 | }); 55 | } 56 | 57 | // Updates faces in turn. 58 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, KView &buffer) { 59 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 60 | update_left(x, y, depth, halo_depth, buffer); 61 | } 62 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 63 | update_right(x, y, depth, halo_depth, buffer); 64 | } 65 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 66 | update_top(x, y, depth, halo_depth, buffer); 67 | } 68 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 69 | update_bottom(x, y, depth, halo_depth, buffer); 70 | } 71 | } 72 | 73 | // The kernel for updating halos locally 74 | void local_halos(const int x, const int y, const int depth, const int halo_depth, const int *chunk_neighbours, 75 | const bool *fields_to_exchange, KView &density, KView &energy0, KView &energy, KView &u, KView &p, KView &sd) { 76 | if (fields_to_exchange[FIELD_DENSITY]) { 77 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 78 | } 79 | if 
(fields_to_exchange[FIELD_P]) { 80 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 81 | } 82 | if (fields_to_exchange[FIELD_ENERGY0]) { 83 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 84 | } 85 | if (fields_to_exchange[FIELD_ENERGY1]) { 86 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 87 | } 88 | if (fields_to_exchange[FIELD_U]) { 89 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 90 | } 91 | if (fields_to_exchange[FIELD_SD]) { 92 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 93 | } 94 | } 95 | 96 | // Solver-wide kernels 97 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 98 | START_PROFILING(settings.kernel_profile); 99 | local_halos(chunk->x, chunk->y, depth, settings.halo_depth, chunk->neighbours, settings.fields_to_exchange, *chunk->density, 100 | *chunk->energy0, *chunk->energy, *chunk->u, *chunk->p, *chunk->sd); 101 | STOP_PROFILING(settings.kernel_profile, __func__); 102 | } 103 | -------------------------------------------------------------------------------- /src/kokkos/model.cmake: -------------------------------------------------------------------------------- 1 | register_flag_optional(CMAKE_CXX_COMPILER 2 | "Any CXX compiler that is supported by CMake detection and RAJA. 3 | See https://github.com/kokkos/kokkos#primary-tested-compilers-on-x86-are" 4 | "c++") 5 | 6 | register_flag_optional(KOKKOS_IN_TREE 7 | "Absolute path to the *source* distribution directory of Kokkos. 8 | Remember to append Kokkos specific flags as well, for example: 9 | -DKOKKOS_IN_TREE=... -DKokkos_ENABLE_OPENMP=ON -DKokkos_ARCH_ZEN=ON ... 10 | See https://github.com/kokkos/kokkos/blob/master/BUILD.md for all available options" "") 11 | 12 | register_flag_optional(KOKKOS_IN_PACKAGE 13 | "Absolute path to package R-Path containing Kokkos libs. 14 | Use this instead of KOKKOS_IN_TREE if Kokkos is from a package manager like Spack." 
"") 15 | 16 | # compiler vendor and arch specific flags 17 | set(KOKKOS_FLAGS_CPU_INTEL -qopt-streaming-stores=always) 18 | 19 | macro(setup) 20 | 21 | set(CMAKE_CXX_STANDARD 17) # Kokkos 4+ requires CXX >= 17 22 | cmake_policy(SET CMP0074 NEW) #see https://github.com/kokkos/kokkos/blob/master/BUILD.md 23 | 24 | 25 | if (EXISTS "${KOKKOS_IN_TREE}") 26 | message(STATUS "Build using in-tree Kokkos source at `${KOKKOS_IN_TREE}`") 27 | add_subdirectory(${KOKKOS_IN_TREE} ${CMAKE_BINARY_DIR}/kokkos) 28 | register_link_library(Kokkos::kokkos) 29 | elseif (EXISTS "${KOKKOS_IN_PACKAGE}") 30 | message(STATUS "Build using packaged Kokkos at `${KOKKOS_IN_PACKAGE}`") 31 | set(Kokkos_DIR "${KOKKOS_IN_PACKAGE}/lib64/cmake/Kokkos") 32 | find_package(Kokkos REQUIRED) 33 | register_link_library(Kokkos::kokkos) 34 | else () 35 | message(FATAL_ERROR "Neither `KOKKOS_IN_TREE`, or `KOKKOS_IN_PACKAGE` was set!") 36 | endif () 37 | 38 | register_append_compiler_and_arch_specific_cxx_flags( 39 | KOKKOS_FLAGS_CPU 40 | ${CMAKE_CXX_COMPILER_ID} 41 | ${CMAKE_SYSTEM_PROCESSOR} 42 | ) 43 | 44 | endmacro() 45 | -------------------------------------------------------------------------------- /src/kokkos/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Initialises Sd 6 | void ppcg_init(const int x, const int y, const int halo_depth, const double theta, KView &sd, KView &r) { 7 | Kokkos::parallel_for( 8 | x * y, KOKKOS_LAMBDA(const int index) { 9 | const int kk = index % x; 10 | const int jj = index / x; 11 | 12 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 13 | sd[index] = r[index] / theta; 14 | } 15 | }); 16 | } 17 | 18 | // Calculates U and R 19 | void ppcg_calc_ur(const int x, const int y, const int halo_depth, KView &sd, KView &r, KView &u, KView &kx, KView &ky) { 20 | Kokkos::parallel_for( 21 | x * y, 
KOKKOS_LAMBDA(const int index) { 22 | const int kk = index % x; 23 | const int jj = index / x; 24 | 25 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 26 | const double smvp = tealeaf_SMVP(sd); 27 | r[index] -= smvp; 28 | u[index] += sd[index]; 29 | } 30 | }); 31 | } 32 | 33 | // Calculates Sd 34 | void ppcg_calc_sd(const int x, const int y, const int halo_depth, const double theta, const double alpha, const double beta, KView &sd, 35 | KView &r) { 36 | Kokkos::parallel_for( 37 | x * y, KOKKOS_LAMBDA(const int index) { 38 | const int kk = index % x; 39 | const int jj = index / x; 40 | 41 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 42 | sd[index] = alpha * sd[index] + beta * r[index]; 43 | } 44 | }); 45 | } 46 | 47 | // PPCG solver kernels 48 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 49 | START_PROFILING(settings.kernel_profile); 50 | 51 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, *chunk->sd, *chunk->r); 52 | 53 | STOP_PROFILING(settings.kernel_profile, __func__); 54 | } 55 | 56 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 57 | START_PROFILING(settings.kernel_profile); 58 | 59 | ppcg_calc_ur(chunk->x, chunk->y, settings.halo_depth, *chunk->sd, *chunk->r, *chunk->u, *chunk->kx, *chunk->ky); 60 | 61 | ppcg_calc_sd(chunk->x, chunk->y, settings.halo_depth, chunk->theta, alpha, beta, *chunk->sd, *chunk->r); 62 | 63 | STOP_PROFILING(settings.kernel_profile, __func__); 64 | } -------------------------------------------------------------------------------- /src/kokkos/solver_methods.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "kokkos_shared.hpp" 3 | #include "shared.h" 4 | 5 | // Copies energy0 into energy1. 
6 | void store_energy(const int x, const int y, KView &energy, KView &energy0) { 7 | Kokkos::parallel_for( 8 | x * y, KOKKOS_LAMBDA(const int index) { energy(index) = energy0(index); }); 9 | } 10 | 11 | // Copies the inner u into u0. 12 | void copy_u(const int x, const int y, const int halo_depth, KView &u, KView &u0) { 13 | Kokkos::parallel_for( 14 | x * y, KOKKOS_LAMBDA(const int index) { 15 | const int kk = index % x; 16 | const int jj = index / x; 17 | 18 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 19 | u0(index) = u(index); 20 | } 21 | }); 22 | } 23 | 24 | // Calculates the residual r. 25 | void calculate_residual(const int x, const int y, const int halo_depth, KView &u, KView &u0, KView &r, KView &kx, KView &ky) { 26 | Kokkos::parallel_for( 27 | x * y, KOKKOS_LAMBDA(const int index) { 28 | const int kk = index % x; 29 | const int jj = index / x; 30 | 31 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 32 | const double smvp = tealeaf_SMVP(u); 33 | r(index) = u0(index) - smvp; 34 | } 35 | }); 36 | } 37 | 38 | // Calculates the 2 norm of the provided buffer. 39 | void calculate_2norm(const int x, const int y, const int halo_depth, KView &buffer, double *norm) { 40 | Kokkos::parallel_reduce( 41 | x * y, 42 | KOKKOS_LAMBDA(const int index, double &norm_temp) { 43 | const int kk = index % x; 44 | const int jj = index / x; 45 | 46 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 47 | norm_temp += buffer(index) * buffer(index); 48 | } 49 | }, 50 | *norm); 51 | } 52 | 53 | // Finalises the energy field. 
54 | void finalise(const int x, const int y, const int halo_depth, KView &u, KView &density, KView &energy) { 55 | Kokkos::parallel_for( 56 | x * y, KOKKOS_LAMBDA(const int index) { 57 | const int kk = index % x; 58 | const int jj = index / x; 59 | 60 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 61 | energy(index) = u(index) / density(index); 62 | } 63 | }); 64 | } 65 | 66 | void run_store_energy(Chunk *chunk, Settings &settings) { 67 | START_PROFILING(settings.kernel_profile); 68 | store_energy(chunk->x, chunk->y, *chunk->energy, *chunk->energy0); 69 | STOP_PROFILING(settings.kernel_profile, __func__); 70 | } 71 | 72 | void run_field_summary(Chunk *chunk, Settings &settings, double *vol, double *mass, double *ie, double *temp) { 73 | START_PROFILING(settings.kernel_profile); 74 | int x = chunk->x; 75 | int y = chunk->y; 76 | int halo_depth = settings.halo_depth; 77 | auto &u = *chunk->u; 78 | auto &density = *chunk->density; 79 | auto &energy0 = *chunk->energy0; 80 | auto &volume = *chunk->volume; 81 | 82 | Kokkos::parallel_reduce( 83 | chunk->x * chunk->y, 84 | KOKKOS_LAMBDA(const int index, double &vol, double &mass, double &ie, double &temp) { 85 | const int kk = index % x; 86 | const int jj = index / x; 87 | 88 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 89 | const double cellVol = volume[index]; 90 | const double cellMass = cellVol * density[index]; 91 | vol += cellVol; 92 | mass += cellMass; 93 | ie += cellMass * energy0[index]; 94 | temp += cellMass * u[index]; 95 | } 96 | }, 97 | *vol, *mass, *ie, *temp); 98 | STOP_PROFILING(settings.kernel_profile, __func__); 99 | } 100 | 101 | // Shared solver kernels 102 | void run_copy_u(Chunk *chunk, Settings &settings) { 103 | START_PROFILING(settings.kernel_profile); 104 | 105 | copy_u(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->u0); 106 | 107 | STOP_PROFILING(settings.kernel_profile, __func__); 108 | } 109 
// Profiled driver-facing wrapper for calculate_residual.
void run_calculate_residual(Chunk *chunk, Settings &settings) {
  START_PROFILING(settings.kernel_profile);

  calculate_residual(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->u0, *chunk->r, *chunk->kx, *chunk->ky);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// Profiled wrapper for calculate_2norm on an arbitrary field buffer.
void run_calculate_2norm(Chunk *chunk, Settings &settings, FieldBufferType buffer, double *norm) {
  START_PROFILING(settings.kernel_profile);

  calculate_2norm(chunk->x, chunk->y, settings.halo_depth, *buffer, norm);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// Profiled wrapper for finalise (energy = u / density over the interior).
void run_finalise(Chunk *chunk, Settings &settings) {
  START_PROFILING(settings.kernel_profile);

  finalise(chunk->x, chunk->y, settings.halo_depth, *chunk->u, *chunk->density, *chunk->energy);

  STOP_PROFILING(settings.kernel_profile, __func__);
}

// ---------------------------------------------------------------------------
// src/omp/cheby.cpp
// ---------------------------------------------------------------------------
#include "chunk.h"
#include "shared.h"

/*
 * CHEBYSHEV SOLVER KERNEL
 */

// Calculates the new value for u.
9 | void cheby_calc_u(const int x, const int y, const int halo_depth, double *u, const double *p) { 10 | #ifdef OMP_TARGET 11 | #pragma omp target teams distribute parallel for simd collapse(2) 12 | #else 13 | #pragma omp parallel for 14 | #endif 15 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 16 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 17 | const int index = kk + jj * x; 18 | u[index] += p[index]; 19 | } 20 | } 21 | } 22 | 23 | // Initialises the Chebyshev solver 24 | void cheby_init(const int x, const int y, const int halo_depth, const double theta, double *u, const double *u0, double *p, double *r, 25 | double *w, const double *kx, const double *ky) { 26 | #ifdef OMP_TARGET 27 | #pragma omp target teams distribute parallel for simd collapse(2) 28 | #else 29 | #pragma omp parallel for 30 | #endif 31 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 32 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 33 | const int index = kk + jj * x; 34 | const double smvp = tealeaf_SMVP(u); 35 | w[index] = smvp; 36 | r[index] = u0[index] - w[index]; 37 | p[index] = r[index] / theta; 38 | } 39 | } 40 | 41 | cheby_calc_u(x, y, halo_depth, u, p); 42 | } 43 | 44 | // The main chebyshev iteration 45 | void cheby_iterate(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, const double *u0, double *p, 46 | double *r, double *w, const double *kx, const double *ky) { 47 | #ifdef OMP_TARGET 48 | #pragma omp target teams distribute parallel for simd collapse(2) 49 | #else 50 | #pragma omp parallel for 51 | #endif 52 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 53 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 54 | const int index = kk + jj * x; 55 | const double smvp = tealeaf_SMVP(u); 56 | w[index] = smvp; 57 | r[index] = u0[index] - w[index]; 58 | p[index] = alpha * p[index] + beta * r[index]; 59 | } 60 | } 61 | 62 | cheby_calc_u(x, y, halo_depth, u, p); 63 | } 64 | 65 | // Chebyshev 
// Chebyshev solver kernels (OpenMP variant): profiled driver-facing wrappers.
void run_cheby_init(Chunk *chunk, Settings &settings) {
  START_PROFILING(settings.kernel_profile);
  cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx,
             chunk->ky);
  STOP_PROFILING(settings.kernel_profile, __func__);
}

void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) {
  START_PROFILING(settings.kernel_profile);
  cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx,
                chunk->ky);
  STOP_PROFILING(settings.kernel_profile, __func__);
}

// ---------------------------------------------------------------------------
// src/omp/chunk_extension.h
// ---------------------------------------------------------------------------
#pragma once

// OpenMP model stores fields and staging buffers as raw host pointers.
using FieldBufferType = double *;
using StagingBufferType = double *;
struct ChunkExtension {};

// ---------------------------------------------------------------------------
// src/omp/diffuse_overload.cpp
// ---------------------------------------------------------------------------
#ifdef OMP_TARGET

#include "application.h"
#include "drivers.h"

// Defined elsewhere in the application; runs a single timestep.
void solve(Chunk *chunks, Settings &settings, int tt, double *wallclock_prev);

// An implementation specific overload of the main timestep loop.
// Maps all field and communication buffers onto the target device once,
// keeps them resident across every timestep, and copies the field data
// back only after the loop finishes.
bool diffuse_overload(Chunk *chunks, Settings &settings) {
  int n = chunks->x * chunks->y;

  print_and_log(settings, "This implementation overloads the diffuse function.\n");

  // Currently have to place all structure enclose pointers
  // into local variables for OMP 4.0 to accept them in mapping clauses
  double *r = chunks->r;
  double *sd = chunks->sd;
  double *kx = chunks->kx;
  double *ky = chunks->ky;
  double *w = chunks->w;
  double *p = chunks->p;
  double *cheby_alphas = chunks->cheby_alphas;
  double *cheby_betas = chunks->cheby_betas;
  double *cg_alphas = chunks->cg_alphas;
  double *cg_betas = chunks->cg_betas;
  double *energy = chunks->energy;
  double *density = chunks->density;
  double *energy0 = chunks->energy0;
  double *density0 = chunks->density0;
  double *u = chunks->u;
  double *u0 = chunks->u0;

  double *left_send = chunks->left_send;
  double *left_recv = chunks->left_recv;
  double *right_send = chunks->right_send;
  double *right_recv = chunks->right_recv;
  double *top_send = chunks->top_send;
  double *top_recv = chunks->top_recv;
  double *bottom_send = chunks->bottom_send;
  double *bottom_recv = chunks->bottom_recv;

  // Tells the kernels to run their `if (is_offload)` target variants.
  settings.is_offload = true;

  // Halo staging buffer sizes: one strip per field per halo row/column.
  int lr_len = chunks->y * settings.halo_depth * NUM_FIELDS;
  int tb_len = chunks->x * settings.halo_depth * NUM_FIELDS;

  // NOTE: the comma between map clauses below is permitted — OpenMP clauses
  // may be separated by commas or whitespace.
  #pragma omp target enter data map(to : r[ : n], sd[ : n], kx[ : n], ky[ : n], w[ : n], p[ : n], cheby_alphas[ : settings.max_iters], \
                                        cheby_betas[ : settings.max_iters], cg_alphas[ : settings.max_iters], \
                                        cg_betas[ : settings.max_iters]) \
      map(to : density[ : n], energy[ : n], density0[ : n], energy0[ : n], u[ : n], u0[ : n]), \
      map(alloc : left_send[ : lr_len], left_recv[ : lr_len], right_send[ : lr_len], right_recv[ : lr_len], top_send[ : tb_len], \
                  top_recv[ : tb_len], bottom_send[ : tb_len], bottom_recv[ : tb_len])

  double wallclock_prev = 0.0;
  for (int tt = 0; tt < settings.end_step; ++tt) {
    solve(chunks, settings, tt, &wallclock_prev);
  }

  // Only the primary field data is copied back; work arrays are discarded.
  #pragma omp target exit data map(from : density[ : n], energy[ : n], density0[ : n], energy0[ : n], u[ : n], u0[ : n])

  settings.is_offload = false;

  return field_summary_driver(chunks, settings, true);
}

#endif
-------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include 4 | 5 | /* 6 | * JACOBI SOLVER KERNEL 7 | */ 8 | 9 | // Initialises the Jacobi solver 10 | void jacobi_init(const int x, const int y, const int halo_depth, const int coefficient, double rx, double ry, const double *density, 11 | const double *energy, double *u0, double *u, double *kx, double *ky) { 12 | if (coefficient < CONDUCTIVITY && coefficient < RECIP_CONDUCTIVITY) { 13 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 14 | } 15 | 16 | #ifdef OMP_TARGET 17 | #pragma omp target teams distribute parallel for simd collapse(2) 18 | #else 19 | #pragma omp parallel for 20 | #endif 21 | for (int jj = 1; jj < y - 1; ++jj) { 22 | for (int kk = 1; kk < x - 1; ++kk) { 23 | const int index = kk + jj * x; 24 | double temp = energy[index] * density[index]; 25 | u0[index] = temp; 26 | u[index] = temp; 27 | } 28 | } 29 | 30 | #ifdef OMP_TARGET 31 | #pragma omp target teams distribute parallel for simd collapse(2) 32 | #else 33 | #pragma omp parallel for 34 | #endif 35 | for (int jj = halo_depth; jj < y - 1; ++jj) { 36 | for (int kk = halo_depth; kk < x - 1; ++kk) { 37 | const int index = kk + jj * x; 38 | double densityCentre = (coefficient == CONDUCTIVITY) ? density[index] : 1.0 / density[index]; 39 | double densityLeft = (coefficient == CONDUCTIVITY) ? density[index - 1] : 1.0 / density[index - 1]; 40 | double densityDown = (coefficient == CONDUCTIVITY) ? 
density[index - x] : 1.0 / density[index - x]; 41 | 42 | kx[index] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 43 | ky[index] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 44 | } 45 | } 46 | } 47 | 48 | // The main Jacobi solve step 49 | void jacobi_iterate(const int x, const int y, const int halo_depth, double *error, const double *kx, const double *ky, const double *u0, 50 | double *u, double *r) { 51 | #ifdef OMP_TARGET 52 | #pragma omp target teams distribute parallel for simd collapse(2) 53 | #else 54 | #pragma omp parallel for 55 | #endif 56 | for (int jj = 0; jj < y; ++jj) { 57 | for (int kk = 0; kk < x; ++kk) { 58 | const int index = kk + jj * x; 59 | r[index] = u[index]; 60 | } 61 | } 62 | 63 | double err = 0.0; 64 | 65 | #ifdef OMP_TARGET 66 | #pragma omp target teams distribute parallel for simd reduction(+ : err) collapse(2) 67 | #else 68 | #pragma omp parallel for reduction(+ : err) 69 | #endif 70 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 71 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 72 | const int index = kk + jj * x; 73 | u[index] = (u0[index] + (kx[index + 1] * r[index + 1] + kx[index] * r[index - 1]) + 74 | (ky[index + x] * r[index + x] + ky[index] * r[index - x])) / 75 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 76 | 77 | err += fabs(u[index] - r[index]); 78 | } 79 | } 80 | 81 | *error = err; 82 | } 83 | 84 | // Jacobi solver kernels 85 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 86 | START_PROFILING(settings.kernel_profile); 87 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, chunk->density, chunk->energy, chunk->u0, chunk->u, 88 | chunk->kx, chunk->ky); 89 | STOP_PROFILING(settings.kernel_profile, __func__); 90 | } 91 | 92 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 93 | START_PROFILING(settings.kernel_profile); 94 | 
jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, error, chunk->kx, chunk->ky, chunk->u0, chunk->u, chunk->r); 95 | STOP_PROFILING(settings.kernel_profile, __func__); 96 | } -------------------------------------------------------------------------------- /src/omp/local_halos.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | // Update left halo. 5 | void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 6 | 7 | #ifdef OMP_TARGET 8 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 9 | #else 10 | #pragma omp parallel for 11 | #endif 12 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 13 | for (int kk = 0; kk < depth; ++kk) { 14 | int base = jj * x; 15 | buffer[base + (halo_depth - kk - 1)] = buffer[base + (halo_depth + kk)]; 16 | } 17 | } 18 | } 19 | 20 | // Update right halo. 21 | void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 22 | #ifdef OMP_TARGET 23 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 24 | #else 25 | #pragma omp parallel for 26 | #endif 27 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 28 | for (int kk = 0; kk < depth; ++kk) { 29 | int base = jj * x; 30 | buffer[base + (x - halo_depth + kk)] = buffer[base + (x - halo_depth - 1 - kk)]; 31 | } 32 | } 33 | } 34 | 35 | // Update top halo. 
36 | void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 37 | #ifdef OMP_TARGET 38 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 39 | #endif 40 | for (int jj = 0; jj < depth; ++jj) { 41 | #ifndef OMP_TARGET 42 | #pragma omp parallel for 43 | #endif 44 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 45 | int base = kk; 46 | buffer[base + (y - halo_depth + jj) * x] = buffer[base + (y - halo_depth - 1 - jj) * x]; 47 | } 48 | } 49 | } 50 | 51 | // Updates bottom halo. 52 | void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer, bool is_offload) { 53 | #ifdef OMP_TARGET 54 | #pragma omp target teams distribute parallel for simd if (is_offload) collapse(2) 55 | #endif 56 | for (int jj = 0; jj < depth; ++jj) { 57 | #ifndef OMP_TARGET 58 | #pragma omp parallel for 59 | #endif 60 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 61 | int base = kk; 62 | buffer[base + (halo_depth - jj - 1) * x] = buffer[base + (halo_depth + jj) * x]; 63 | } 64 | } 65 | } 66 | 67 | // Updates faces in turn. 
68 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer, 69 | bool is_offload) { 70 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 71 | update_left(x, y, halo_depth, depth, buffer, is_offload); 72 | } 73 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 74 | update_right(x, y, halo_depth, depth, buffer, is_offload); 75 | } 76 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 77 | update_top(x, y, halo_depth, depth, buffer, is_offload); 78 | } 79 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 80 | update_bottom(x, y, halo_depth, depth, buffer, is_offload); 81 | } 82 | } 83 | 84 | // The kernel for updating halos locally 85 | void local_halos(const int x, const int y, const int depth, const int halo_depth, const int *chunk_neighbours, 86 | const bool *fields_to_exchange, double *density, double *energy0, double *energy, double *u, double *p, double *sd, 87 | bool is_offload) { 88 | if (fields_to_exchange[FIELD_DENSITY]) { 89 | update_face(x, y, halo_depth, chunk_neighbours, depth, density, is_offload); 90 | } 91 | if (fields_to_exchange[FIELD_P]) { 92 | update_face(x, y, halo_depth, chunk_neighbours, depth, p, is_offload); 93 | } 94 | if (fields_to_exchange[FIELD_ENERGY0]) { 95 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0, is_offload); 96 | } 97 | if (fields_to_exchange[FIELD_ENERGY1]) { 98 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy, is_offload); 99 | } 100 | if (fields_to_exchange[FIELD_U]) { 101 | update_face(x, y, halo_depth, chunk_neighbours, depth, u, is_offload); 102 | } 103 | if (fields_to_exchange[FIELD_SD]) { 104 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd, is_offload); 105 | } 106 | } 107 | 108 | // Solver-wide kernels 109 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 110 | START_PROFILING(settings.kernel_profile); 111 | local_halos(chunk->x, chunk->y, depth, 
settings.halo_depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 112 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd, settings.is_offload); 113 | STOP_PROFILING(settings.kernel_profile, __func__); 114 | } 115 | -------------------------------------------------------------------------------- /src/omp/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * PPCG SOLVER KERNEL 6 | */ 7 | 8 | // Initialises the PPCG solver 9 | void ppcg_init(const int x, const int y, const int halo_depth, double theta, const double *r, double *sd) { 10 | #ifdef OMP_TARGET 11 | #pragma omp target teams distribute parallel for simd collapse(2) 12 | #else 13 | #pragma omp parallel for 14 | #endif 15 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 16 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 17 | const int index = kk + jj * x; 18 | sd[index] = r[index] / theta; 19 | } 20 | } 21 | } 22 | 23 | // The PPCG inner iteration 24 | void ppcg_inner_iteration(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, double *r, const double *kx, 25 | const double *ky, double *sd) { 26 | #ifdef OMP_TARGET 27 | #pragma omp target teams distribute parallel for simd collapse(2) 28 | #else 29 | #pragma omp parallel for 30 | #endif 31 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 32 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 33 | const int index = kk + jj * x; 34 | const double smvp = tealeaf_SMVP(sd); 35 | r[index] -= smvp; 36 | u[index] += sd[index]; 37 | } 38 | } 39 | 40 | #ifdef OMP_TARGET 41 | #pragma omp target teams distribute parallel for simd collapse(2) 42 | #else 43 | #pragma omp parallel for 44 | #endif 45 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 46 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 47 | const int index = kk + jj * x; 48 | sd[index] = alpha * 
sd[index] + beta * r[index]; 49 | } 50 | } 51 | } 52 | 53 | // PPCG solver kernels 54 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 55 | START_PROFILING(settings.kernel_profile); 56 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 57 | STOP_PROFILING(settings.kernel_profile, __func__); 58 | } 59 | 60 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 61 | START_PROFILING(settings.kernel_profile); 62 | ppcg_inner_iteration(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->r, chunk->kx, chunk->ky, chunk->sd); 63 | STOP_PROFILING(settings.kernel_profile, __func__); 64 | } 65 | -------------------------------------------------------------------------------- /src/serial/cg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * CONJUGATE GRADIENT SOLVER KERNEL 6 | */ 7 | 8 | // Initialises the CG solver 9 | void cg_init(const int x, const int y, const int halo_depth, const int coefficient, double rx, double ry, double *rro, 10 | const double *density, const double *energy, double *u, double *p, double *r, double *w, double *kx, double *ky) { 11 | if (coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { 12 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 13 | } 14 | 15 | for (int jj = 0; jj < y; ++jj) { 16 | for (int kk = 0; kk < x; ++kk) { 17 | const int index = kk + jj * x; 18 | p[index] = 0.0; 19 | r[index] = 0.0; 20 | u[index] = energy[index] * density[index]; 21 | } 22 | } 23 | 24 | for (int jj = 1; jj < y - 1; ++jj) { 25 | for (int kk = 1; kk < x - 1; ++kk) { 26 | const int index = kk + jj * x; 27 | w[index] = (coefficient == CONDUCTIVITY) ? 
density[index] : 1.0 / density[index]; 28 | } 29 | } 30 | 31 | for (int jj = halo_depth; jj < y - 1; ++jj) { 32 | for (int kk = halo_depth; kk < x - 1; ++kk) { 33 | const int index = kk + jj * x; 34 | kx[index] = rx * (w[index - 1] + w[index]) / (2.0 * w[index - 1] * w[index]); 35 | ky[index] = ry * (w[index - x] + w[index]) / (2.0 * w[index - x] * w[index]); 36 | } 37 | } 38 | 39 | double rro_temp = 0.0; 40 | 41 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 42 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 43 | const int index = kk + jj * x; 44 | const double smvp = tealeaf_SMVP(u); 45 | w[index] = smvp; 46 | r[index] = u[index] - w[index]; 47 | p[index] = r[index]; 48 | rro_temp += r[index] * p[index]; 49 | } 50 | } 51 | 52 | // Sum locally 53 | *rro += rro_temp; 54 | } 55 | 56 | // Calculates w 57 | void cg_calc_w(const int x, const int y, const int halo_depth, double *pw, const double *p, double *w, const double *kx, const double *ky) { 58 | double pw_temp = 0.0; 59 | 60 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 61 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 62 | const int index = kk + jj * x; 63 | const double smvp = tealeaf_SMVP(p); 64 | w[index] = smvp; 65 | pw_temp += w[index] * p[index]; 66 | } 67 | } 68 | 69 | *pw += pw_temp; 70 | } 71 | 72 | // Calculates u and r 73 | void cg_calc_ur(const int x, const int y, const int halo_depth, const double alpha, double *rrn, double *u, const double *p, double *r, 74 | const double *w) { 75 | double rrn_temp = 0.0; 76 | 77 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 78 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 79 | const int index = kk + jj * x; 80 | 81 | u[index] += alpha * p[index]; 82 | r[index] -= alpha * w[index]; 83 | rrn_temp += r[index] * r[index]; 84 | } 85 | } 86 | 87 | *rrn += rrn_temp; 88 | } 89 | 90 | // Calculates p 91 | void cg_calc_p(const int x, const int y, const int halo_depth, const double beta, double *p, const double 
*r) { 92 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 93 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 94 | const int index = kk + jj * x; 95 | 96 | p[index] = beta * p[index] + r[index]; 97 | } 98 | } 99 | } 100 | 101 | // CG solver kernels 102 | void run_cg_init(Chunk *chunk, Settings &settings, double rx, double ry, double *rro) { 103 | START_PROFILING(settings.kernel_profile); 104 | cg_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, rro, chunk->density, chunk->energy, chunk->u, chunk->p, 105 | chunk->r, chunk->w, chunk->kx, chunk->ky); 106 | STOP_PROFILING(settings.kernel_profile, __func__); 107 | } 108 | 109 | void run_cg_calc_w(Chunk *chunk, Settings &settings, double *pw) { 110 | START_PROFILING(settings.kernel_profile); 111 | cg_calc_w(chunk->x, chunk->y, settings.halo_depth, pw, chunk->p, chunk->w, chunk->kx, chunk->ky); 112 | STOP_PROFILING(settings.kernel_profile, __func__); 113 | } 114 | 115 | void run_cg_calc_ur(Chunk *chunk, Settings &settings, double alpha, double *rrn) { 116 | START_PROFILING(settings.kernel_profile); 117 | cg_calc_ur(chunk->x, chunk->y, settings.halo_depth, alpha, rrn, chunk->u, chunk->p, chunk->r, chunk->w); 118 | STOP_PROFILING(settings.kernel_profile, __func__); 119 | } 120 | 121 | void run_cg_calc_p(Chunk *chunk, Settings &settings, double beta) { 122 | START_PROFILING(settings.kernel_profile); 123 | cg_calc_p(chunk->x, chunk->y, settings.halo_depth, beta, chunk->p, chunk->r); 124 | STOP_PROFILING(settings.kernel_profile, __func__); 125 | } -------------------------------------------------------------------------------- /src/serial/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * CHEBYSHEV SOLVER KERNEL 6 | */ 7 | 8 | // Calculates the new value for u. 
9 | void cheby_calc_u(const int x, const int y, const int halo_depth, double *u, const double *p) { 10 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 11 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 12 | const int index = kk + jj * x; 13 | u[index] += p[index]; 14 | } 15 | } 16 | } 17 | 18 | // Initialises the Chebyshev solver 19 | void cheby_init(const int x, const int y, const int halo_depth, const double theta, double *u, const double *u0, double *p, double *r, 20 | double *w, const double *kx, const double *ky) { 21 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 22 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 23 | const int index = kk + jj * x; 24 | const double smvp = tealeaf_SMVP(u); 25 | w[index] = smvp; 26 | r[index] = u0[index] - w[index]; 27 | p[index] = r[index] / theta; 28 | } 29 | } 30 | 31 | cheby_calc_u(x, y, halo_depth, u, p); 32 | } 33 | 34 | // The main chebyshev iteration 35 | void cheby_iterate(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, const double *u0, double *p, 36 | double *r, double *w, const double *kx, const double *ky) { 37 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 38 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 39 | const int index = kk + jj * x; 40 | const double smvp = tealeaf_SMVP(u); 41 | w[index] = smvp; 42 | r[index] = u0[index] - w[index]; 43 | p[index] = alpha * p[index] + beta * r[index]; 44 | } 45 | } 46 | 47 | cheby_calc_u(x, y, halo_depth, u, p); 48 | } 49 | 50 | // Chebyshev solver kernels 51 | void run_cheby_init(Chunk *chunk, Settings &settings) { 52 | START_PROFILING(settings.kernel_profile); 53 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 54 | chunk->ky); 55 | STOP_PROFILING(settings.kernel_profile, __func__); 56 | } 57 | 58 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 59 | 
START_PROFILING(settings.kernel_profile); 60 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 61 | chunk->ky); 62 | STOP_PROFILING(settings.kernel_profile, __func__); 63 | } 64 | -------------------------------------------------------------------------------- /src/serial/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | struct ChunkExtension {}; 6 | -------------------------------------------------------------------------------- /src/serial/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "settings.h" 3 | #include "shared.h" 4 | #include <cmath> // for std::fabs in jacobi_iterate 5 | 6 | /* 7 | * JACOBI SOLVER KERNEL 8 | */ 9 | 10 | // Initialises the Jacobi solver 11 | void jacobi_init(const int x, const int y, const int halo_depth, const int coefficient, double rx, double ry, const double *density, 12 | const double *energy, double *u0, double *u, double *kx, double *ky) { 13 | if (coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { // was '<': only under-range values were rejected; match cg_init's check 14 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 15 | } 16 | 17 | for (int jj = 1; jj < y - 1; ++jj) { 18 | for (int kk = 1; kk < x - 1; ++kk) { 19 | const int index = kk + jj * x; 20 | double temp = energy[index] * density[index]; 21 | u0[index] = temp; 22 | u[index] = temp; 23 | } 24 | } 25 | 26 | for (int jj = halo_depth; jj < y - 1; ++jj) { 27 | for (int kk = halo_depth; kk < x - 1; ++kk) { 28 | const int index = kk + jj * x; 29 | double densityCentre = (coefficient == CONDUCTIVITY) ? density[index] : 1.0 / density[index]; 30 | double densityLeft = (coefficient == CONDUCTIVITY) ? density[index - 1] : 1.0 / density[index - 1]; 31 | double densityDown = (coefficient == CONDUCTIVITY) ?
density[index - x] : 1.0 / density[index - x]; 32 | 33 | kx[index] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 34 | ky[index] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 35 | } 36 | } 37 | } 38 | 39 | // The main Jacobi solve step 40 | void jacobi_iterate(const int x, const int y, const int halo_depth, double *error, const double *kx, const double *ky, const double *u0, 41 | double *u, double *r) { 42 | for (int jj = 0; jj < y; ++jj) { 43 | for (int kk = 0; kk < x; ++kk) { 44 | const int index = kk + jj * x; 45 | r[index] = u[index]; 46 | } 47 | } 48 | 49 | double err = 0.0; 50 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 51 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 52 | const int index = kk + jj * x; 53 | u[index] = (u0[index] + (kx[index + 1] * r[index + 1] + kx[index] * r[index - 1]) + 54 | (ky[index + x] * r[index + x] + ky[index] * r[index - x])) / 55 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 56 | 57 | err += std::fabs(u[index] - r[index]); 58 | } 59 | } 60 | 61 | *error = err; 62 | } 63 | 64 | // Jacobi solver kernels 65 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 66 | START_PROFILING(settings.kernel_profile); 67 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, chunk->density, chunk->energy, chunk->u0, chunk->u, 68 | chunk->kx, chunk->ky); 69 | STOP_PROFILING(settings.kernel_profile, __func__); 70 | } 71 | 72 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 73 | START_PROFILING(settings.kernel_profile); 74 | jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, error, chunk->kx, chunk->ky, chunk->u0, chunk->u, chunk->r); 75 | STOP_PROFILING(settings.kernel_profile, __func__); 76 | } -------------------------------------------------------------------------------- /src/serial/local_halos.cpp: 
-------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | // Update left halo. 5 | void update_left(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 6 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 7 | for (int kk = 0; kk < depth; ++kk) { 8 | int base = jj * x; 9 | buffer[base + (halo_depth - kk - 1)] = buffer[base + (halo_depth + kk)]; 10 | } 11 | } 12 | } 13 | 14 | // Update right halo. 15 | void update_right(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 16 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 17 | for (int kk = 0; kk < depth; ++kk) { 18 | int base = jj * x; 19 | buffer[base + (x - halo_depth + kk)] = buffer[base + (x - halo_depth - 1 - kk)]; 20 | } 21 | } 22 | } 23 | 24 | // Update top halo. 25 | void update_top(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 26 | for (int jj = 0; jj < depth; ++jj) { 27 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 28 | int base = kk; 29 | buffer[base + (y - halo_depth + jj) * x] = buffer[base + (y - halo_depth - 1 - jj) * x]; 30 | } 31 | } 32 | } 33 | 34 | // Updates bottom halo. 35 | void update_bottom(const int x, const int y, const int halo_depth, const int depth, double *buffer) { 36 | for (int jj = 0; jj < depth; ++jj) { 37 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 38 | int base = kk; 39 | buffer[base + (halo_depth - jj - 1) * x] = buffer[base + (halo_depth + jj) * x]; 40 | } 41 | } 42 | } 43 | 44 | // Updates faces in turn. 
45 | void update_face(const int x, const int y, const int halo_depth, const int *chunk_neighbours, const int depth, double *buffer) { 46 | if (chunk_neighbours[CHUNK_LEFT] == EXTERNAL_FACE) { 47 | update_left(x, y, halo_depth, depth, buffer); 48 | } 49 | if (chunk_neighbours[CHUNK_RIGHT] == EXTERNAL_FACE) { 50 | update_right(x, y, halo_depth, depth, buffer); 51 | } 52 | if (chunk_neighbours[CHUNK_TOP] == EXTERNAL_FACE) { 53 | update_top(x, y, halo_depth, depth, buffer); 54 | } 55 | if (chunk_neighbours[CHUNK_BOTTOM] == EXTERNAL_FACE) { 56 | update_bottom(x, y, halo_depth, depth, buffer); 57 | } 58 | } 59 | 60 | // The kernel for updating halos locally 61 | void local_halos(int x, int y, int depth, int halo_depth, const int *chunk_neighbours, const bool *fields_to_exchange, double *density, 62 | double *energy0, double *energy, double *u, double *p, double *sd) { 63 | if (fields_to_exchange[FIELD_DENSITY]) { 64 | update_face(x, y, halo_depth, chunk_neighbours, depth, density); 65 | } 66 | if (fields_to_exchange[FIELD_P]) { 67 | update_face(x, y, halo_depth, chunk_neighbours, depth, p); 68 | } 69 | if (fields_to_exchange[FIELD_ENERGY0]) { 70 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy0); 71 | } 72 | if (fields_to_exchange[FIELD_ENERGY1]) { 73 | update_face(x, y, halo_depth, chunk_neighbours, depth, energy); 74 | } 75 | if (fields_to_exchange[FIELD_U]) { 76 | update_face(x, y, halo_depth, chunk_neighbours, depth, u); 77 | } 78 | if (fields_to_exchange[FIELD_SD]) { 79 | update_face(x, y, halo_depth, chunk_neighbours, depth, sd); 80 | } 81 | } 82 | 83 | // Solver-wide kernels 84 | void run_local_halos(Chunk *chunk, Settings &settings, int depth) { 85 | START_PROFILING(settings.kernel_profile); 86 | local_halos(chunk->x, chunk->y, depth, settings.halo_depth, chunk->neighbours, settings.fields_to_exchange, chunk->density, 87 | chunk->energy0, chunk->energy, chunk->u, chunk->p, chunk->sd); 88 | STOP_PROFILING(settings.kernel_profile, __func__); 89 | } 
90 | -------------------------------------------------------------------------------- /src/serial/model.cmake: -------------------------------------------------------------------------------- 1 | macro(setup) 2 | set(CMAKE_CXX_STANDARD 17) 3 | endmacro() -------------------------------------------------------------------------------- /src/serial/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * PPCG SOLVER KERNEL 6 | */ 7 | 8 | // Initialises the PPCG solver 9 | void ppcg_init(const int x, const int y, const int halo_depth, double theta, const double *r, double *sd) { 10 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 11 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 12 | const int index = kk + jj * x; 13 | sd[index] = r[index] / theta; 14 | } 15 | } 16 | } 17 | 18 | // The PPCG inner iteration 19 | void ppcg_inner_iteration(const int x, const int y, const int halo_depth, double alpha, double beta, double *u, double *r, const double *kx, 20 | const double *ky, double *sd) { 21 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 22 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 23 | const int index = kk + jj * x; 24 | const double smvp = tealeaf_SMVP(sd); 25 | r[index] -= smvp; 26 | u[index] += sd[index]; 27 | } 28 | } 29 | 30 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 31 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 32 | const int index = kk + jj * x; 33 | sd[index] = alpha * sd[index] + beta * r[index]; 34 | } 35 | } 36 | } 37 | 38 | // PPCG solver kernels 39 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 40 | START_PROFILING(settings.kernel_profile); 41 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 42 | STOP_PROFILING(settings.kernel_profile, __func__); 43 | } 44 | 45 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, 
double beta) { 46 | START_PROFILING(settings.kernel_profile); 47 | ppcg_inner_iteration(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->r, chunk->kx, chunk->ky, chunk->sd); 48 | STOP_PROFILING(settings.kernel_profile, __func__); 49 | } 50 | -------------------------------------------------------------------------------- /src/serial/solver_methods.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | 4 | /* 5 | * SHARED SOLVER METHODS 6 | */ 7 | 8 | // The field summary kernel 9 | void field_summary(const int x, const int y, const int halo_depth, const double *volume, const double *density, const double *energy0, 10 | double *u, double *volOut, double *massOut, double *ieOut, double *tempOut) { 11 | double vol = 0.0; 12 | double ie = 0.0; 13 | double temp = 0.0; 14 | double mass = 0.0; 15 | 16 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 17 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 18 | const int index = kk + jj * x; 19 | double cellVol = volume[index]; 20 | double cellMass = cellVol * density[index]; 21 | vol += cellVol; 22 | mass += cellMass; 23 | ie += cellMass * energy0[index]; 24 | temp += cellMass * u[index]; 25 | } 26 | } 27 | 28 | *volOut += vol; 29 | *ieOut += ie; 30 | *tempOut += temp; 31 | *massOut += mass; 32 | } 33 | 34 | // Store original energy state 35 | void store_energy(int x, int y, const double *energy0, double *energy) { 36 | for (int ii = 0; ii < x * y; ++ii) { 37 | energy[ii] = energy0[ii]; 38 | } 39 | } 40 | 41 | // Copies the current u into u0 42 | void copy_u(const int x, const int y, const int halo_depth, double *u0, const double *u) { 43 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 44 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 45 | const int index = kk + jj * x; 46 | u0[index] = u[index]; 47 | } 48 | } 49 | } 50 | 51 | // Calculates the current value of r 52 | void 
calculate_residual(const int x, const int y, const int halo_depth, const double *u, const double *u0, double *r, const double *kx, 53 | const double *ky) { 54 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 55 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 56 | const int index = kk + jj * x; 57 | const double smvp = tealeaf_SMVP(u); 58 | r[index] = u0[index] - smvp; 59 | } 60 | } 61 | } 62 | 63 | // Calculates the 2 norm of a given buffer 64 | void calculate_2norm(const int x, const int y, const int halo_depth, const double *buffer, double *norm) { 65 | double norm_temp = 0.0; 66 | 67 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 68 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 69 | const int index = kk + jj * x; 70 | norm_temp += buffer[index] * buffer[index]; 71 | } 72 | } 73 | 74 | *norm += norm_temp; 75 | } 76 | 77 | // Finalises the solution 78 | void finalise(const int x, const int y, const int halo_depth, double *energy, const double *density, const double *u) { 79 | for (int jj = halo_depth; jj < y - halo_depth; ++jj) { 80 | for (int kk = halo_depth; kk < x - halo_depth; ++kk) { 81 | const int index = kk + jj * x; 82 | energy[index] = u[index] / density[index]; 83 | } 84 | } 85 | } 86 | 87 | void run_store_energy(Chunk *chunk, Settings &settings) { 88 | START_PROFILING(settings.kernel_profile); 89 | store_energy(chunk->x, chunk->y, chunk->energy0, chunk->energy); 90 | STOP_PROFILING(settings.kernel_profile, __func__); 91 | } 92 | 93 | void run_field_summary(Chunk *chunk, Settings &settings, double *vol, double *mass, double *ie, double *temp) { 94 | START_PROFILING(settings.kernel_profile); 95 | field_summary(chunk->x, chunk->y, settings.halo_depth, chunk->volume, chunk->density, chunk->energy0, chunk->u, vol, mass, ie, temp); 96 | STOP_PROFILING(settings.kernel_profile, __func__); 97 | } 98 | 99 | // Shared solver kernels 100 | void run_copy_u(Chunk *chunk, Settings &settings) { 101 | 
START_PROFILING(settings.kernel_profile); 102 | copy_u(chunk->x, chunk->y, settings.halo_depth, chunk->u0, chunk->u); 103 | STOP_PROFILING(settings.kernel_profile, __func__); 104 | } 105 | 106 | void run_calculate_residual(Chunk *chunk, Settings &settings) { 107 | START_PROFILING(settings.kernel_profile); 108 | calculate_residual(chunk->x, chunk->y, settings.halo_depth, chunk->u, chunk->u0, chunk->r, chunk->kx, chunk->ky); 109 | STOP_PROFILING(settings.kernel_profile, __func__); 110 | } 111 | 112 | void run_calculate_2norm(Chunk *chunk, Settings &settings, double *buffer, double *norm) { 113 | START_PROFILING(settings.kernel_profile); 114 | calculate_2norm(chunk->x, chunk->y, settings.halo_depth, buffer, norm); 115 | STOP_PROFILING(settings.kernel_profile, __func__); 116 | } 117 | 118 | void run_finalise(Chunk *chunk, Settings &settings) { 119 | START_PROFILING(settings.kernel_profile); 120 | finalise(chunk->x, chunk->y, settings.halo_depth, chunk->energy, chunk->density, chunk->u); 121 | STOP_PROFILING(settings.kernel_profile, __func__); 122 | } 123 | -------------------------------------------------------------------------------- /src/std-indices/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "dpl_shim.h" 3 | #include "ranged.h" 4 | #include "shared.h" 5 | #include "std_shared.h" 6 | /* 7 | * CHEBYSHEV SOLVER KERNEL 8 | */ 9 | 10 | // Calculates the new value for u. 
11 | void cheby_calc_u(const int x, // 12 | const int y, // 13 | const int halo_depth, // 14 | double *u, // 15 | const double *p) { 16 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 17 | ranged it(0, range.sizeXY()); 18 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 19 | const int index = range.restore(i, x); 20 | u[index] += p[index]; 21 | }); 22 | } 23 | 24 | // Initialises the Chebyshev solver 25 | void cheby_init(const int x, // 26 | const int y, // 27 | const int halo_depth, // 28 | const double theta, // 29 | double *u, // 30 | const double *u0, // 31 | double *p, // 32 | double *r, // 33 | double *w, // 34 | const double *kx, // 35 | const double *ky) { 36 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 37 | ranged it(0, range.sizeXY()); 38 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 39 | const int index = range.restore(i, x); 40 | const double smvp = tealeaf_SMVP(u); 41 | w[index] = smvp; 42 | r[index] = u0[index] - w[index]; 43 | p[index] = r[index] / theta; 44 | }); 45 | 46 | cheby_calc_u(x, y, halo_depth, u, p); 47 | } 48 | 49 | // The main chebyshev iteration 50 | void cheby_iterate(const int x, // 51 | const int y, // 52 | const int halo_depth, // 53 | double alpha, // 54 | double beta, // 55 | double *u, // 56 | const double *u0, // 57 | double *p, // 58 | double *r, // 59 | double *w, // 60 | const double *kx, // 61 | const double *ky) { 62 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 63 | ranged it(0, range.sizeXY()); 64 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 65 | const int index = range.restore(i, x); 66 | const double smvp = tealeaf_SMVP(u); 67 | w[index] = smvp; 68 | r[index] = u0[index] - w[index]; 69 | p[index] = alpha * p[index] + beta * r[index]; 70 | }); 71 | 72 | cheby_calc_u(x, y, halo_depth, u, p); 73 | } 74 | 75 | // Chebyshev solver kernels 76 | void run_cheby_init(Chunk *chunk, Settings 
&settings) { 77 | START_PROFILING(settings.kernel_profile); 78 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 79 | chunk->ky); 80 | STOP_PROFILING(settings.kernel_profile, __func__); 81 | } 82 | 83 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 84 | START_PROFILING(settings.kernel_profile); 85 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->u0, chunk->p, chunk->r, chunk->w, chunk->kx, 86 | chunk->ky); 87 | STOP_PROFILING(settings.kernel_profile, __func__); 88 | } 89 | -------------------------------------------------------------------------------- /src/std-indices/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | using FieldBufferType = double *; 4 | using StagingBufferType = double *; 5 | struct ChunkExtension {}; 6 | -------------------------------------------------------------------------------- /src/std-indices/dpl_shim.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef USE_ONEDPL 7 | 8 | // oneDPL C++17 PSTL 9 | 10 | #include 11 | #include 12 | #include 13 | 14 | #if ONEDPL_USE_DPCPP_BACKEND 15 | 16 | #include 17 | 18 | const static auto EXEC_POLICY = 19 | oneapi::dpl::execution::device_policy<>{oneapi::dpl::execution::make_device_policy(oneapi::dpl::execution::dpcpp_default)}; 20 | 21 | template T *alloc_raw(size_t size) { return sycl::malloc_shared(size, EXEC_POLICY.queue()); } 22 | 23 | template void dealloc_raw(T *ptr) { sycl::free(ptr, EXEC_POLICY.queue()); } 24 | 25 | #else 26 | 27 | // auto exe_policy = dpl::execution::seq; 28 | // auto exe_policy = dpl::execution::par; 29 | static constexpr auto EXEC_POLICY = dpl::execution::par_unseq; 30 | #define USE_STD_PTR_ALLOC_DEALLOC 31 | 32 | #endif 33 | 34 | #else 35 | 36 | // Normal 
C++17 PSTL 37 | 38 | #include 39 | #include 40 | #include 41 | 42 | // static auto EXEC_POLICY = std::execution::seq; 43 | // auto exe_policy = std::execution::par; 44 | static constexpr auto EXEC_POLICY = std::execution::par_unseq; 45 | #define USE_STD_PTR_ALLOC_DEALLOC 46 | 47 | #endif 48 | 49 | #ifdef USE_STD_PTR_ALLOC_DEALLOC 50 | 51 | template T *alloc_raw(size_t size) { return static_cast(std::malloc(size * sizeof(T))); } 52 | template void dealloc_raw(T *ptr) { std::free(ptr); } 53 | 54 | #endif 55 | -------------------------------------------------------------------------------- /src/std-indices/jacobi.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "dpl_shim.h" 3 | #include "ranged.h" 4 | #include "shared.h" 5 | #include "std_shared.h" 6 | #include <cmath> // for fabs in jacobi_iterate 7 | 8 | /* 9 | * JACOBI SOLVER KERNEL 10 | */ 11 | 12 | // Initialises the Jacobi solver 13 | void jacobi_init(const int x, // 14 | const int y, // 15 | const int halo_depth, // 16 | const int coefficient, // 17 | double rx, // 18 | double ry, // 19 | const double *density, // 20 | const double *energy, // 21 | double *u0, // 22 | double *u, // 23 | double *kx, // 24 | double *ky) { 25 | if (coefficient != CONDUCTIVITY && coefficient != RECIP_CONDUCTIVITY) { // was '<': only under-range values were rejected; match cg_init's check 26 | die(__LINE__, __FILE__, "Coefficient %d is not valid.\n", coefficient); 27 | } 28 | Range2d range(1, 1, x - 1, y - 1); 29 | ranged it(0, range.sizeXY()); 30 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 31 | const int index = range.restore(i, x); 32 | double temp = energy[index] * density[index]; 33 | u0[index] = temp; 34 | u[index] = temp; 35 | }); 36 | 37 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 38 | const int index = range.restore(i, x); 39 | double densityCentre = (coefficient == CONDUCTIVITY) ? density[index] : 1.0 / density[index]; 40 | double densityLeft = (coefficient == CONDUCTIVITY) ?
density[index - 1] : 1.0 / density[index - 1]; 41 | double densityDown = (coefficient == CONDUCTIVITY) ? density[index - x] : 1.0 / density[index - x]; 42 | 43 | kx[index] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 44 | ky[index] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 45 | }); 46 | } 47 | 48 | // The main Jacobi solve step 49 | void jacobi_iterate(const int x, // 50 | const int y, // 51 | const int halo_depth, // 52 | double *error, // 53 | const double *kx, // 54 | const double *ky, // 55 | const double *u0, // 56 | double *u, // 57 | double *r) { 58 | 59 | { 60 | Range2d range(0, 0, x, y); 61 | ranged it(0, range.sizeXY()); 62 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 63 | const int index = range.restore(i, x); 64 | r[index] = u[index]; 65 | }); 66 | } 67 | 68 | { 69 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 70 | ranged it(0, range.sizeXY()); 71 | *error = std::transform_reduce(EXEC_POLICY, it.begin(), it.end(), 0.0, std::plus<>(), [=](int i) { 72 | const int index = range.restore(i, x); 73 | u[index] = (u0[index] + (kx[index + 1] * r[index + 1] + kx[index] * r[index - 1]) + 74 | (ky[index + x] * r[index + x] + ky[index] * r[index - x])) / 75 | (1.0 + (kx[index] + kx[index + 1]) + (ky[index] + ky[index + x])); 76 | 77 | return fabs(u[index] - r[index]); 78 | }); 79 | } 80 | } 81 | 82 | // Jacobi solver kernels 83 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 84 | START_PROFILING(settings.kernel_profile); 85 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, chunk->density, chunk->energy, chunk->u0, chunk->u, 86 | chunk->kx, chunk->ky); 87 | STOP_PROFILING(settings.kernel_profile, __func__); 88 | } 89 | 90 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 91 | START_PROFILING(settings.kernel_profile); 92 | jacobi_iterate(chunk->x, chunk->y, 
settings.halo_depth, error, chunk->kx, chunk->ky, chunk->u0, chunk->u, chunk->r); 93 | STOP_PROFILING(settings.kernel_profile, __func__); 94 | } -------------------------------------------------------------------------------- /src/std-indices/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection" 4 | "c++") 5 | 6 | register_flag_optional(NVHPC_OFFLOAD 7 | "Enable offloading support (via the non-standard `-stdpar`) for the new NVHPC SDK. 8 | The values are Nvidia architectures in ccXY format will be passed in via `-gpu=` (e.g `cc70`) 9 | 10 | Possible values are: 11 | cc35 - Compile for compute capability 3.5 12 | cc50 - Compile for compute capability 5.0 13 | cc60 - Compile for compute capability 6.0 14 | cc62 - Compile for compute capability 6.2 15 | cc70 - Compile for compute capability 7.0 16 | cc72 - Compile for compute capability 7.2 17 | cc75 - Compile for compute capability 7.5 18 | cc80 - Compile for compute capability 8.0 19 | ccall - Compile for all supported compute capabilities" 20 | "") 21 | 22 | register_flag_optional(USE_TBB 23 | "Link against an in-tree oneTBB via FetchContent_Declare, see top level CMakeLists.txt for details." 24 | "OFF") 25 | 26 | register_flag_optional(USE_ONEDPL 27 | "Link oneDPL which implements C++17 executor policies (via execution_policy_tag) for different backends. 28 | 29 | Possible values are: 30 | OPENMP - Implements policies using OpenMP. 31 | CMake will handle any flags needed to enable OpenMP if the compiler supports it. 32 | TBB - Implements policies using TBB. 33 | TBB must be linked via USE_TBB or be available in LD_LIBRARY_PATH. 34 | DPCPP - Implements policies through SYCL2020. 35 | This requires the DPC++ compiler (other SYCL compilers are untested), required SYCL flags are added automatically." 
36 | "OFF") 37 | 38 | macro(setup) 39 | set(CMAKE_CXX_STANDARD 17) # because SYCL oneDPL is C++17, NVHPC isn't bound by this 40 | 41 | if (USE_TBB) 42 | register_link_library(TBB::tbb) 43 | endif () 44 | 45 | if (USE_ONEDPL) 46 | register_definitions(USE_ONEDPL) 47 | register_link_library(oneDPL) 48 | endif () 49 | 50 | if (NVHPC_OFFLOAD) 51 | set(NVHPC_FLAGS 52 | -stdpar 53 | -gpu=${NVHPC_OFFLOAD},fastmath,keep 54 | --restrict 55 | -Mfpapprox 56 | -Mfprelaxed 57 | -Mllvm-fast 58 | -Ktrap=none 59 | -Minfo=accel -Minfo=stdpar) 60 | # propagate flags to linker so that it links with the gpu stuff as well 61 | register_append_cxx_flags(ANY ${NVHPC_FLAGS}) 62 | register_append_link_flags(${NVHPC_FLAGS}) 63 | endif () 64 | 65 | 66 | endmacro() -------------------------------------------------------------------------------- /src/std-indices/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "dpl_shim.h" 3 | #include "ranged.h" 4 | #include "shared.h" 5 | #include "std_shared.h" 6 | /* 7 | * PPCG SOLVER KERNEL 8 | */ 9 | 10 | // Initialises the PPCG solver 11 | void ppcg_init(const int x, // 12 | const int y, // 13 | const int halo_depth, // 14 | double theta, // 15 | const double *r, // 16 | double *sd) { 17 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 18 | ranged it(0, range.sizeXY()); 19 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 20 | const int index = range.restore(i, x); 21 | sd[index] = r[index] / theta; 22 | }); 23 | } 24 | 25 | // The PPCG inner iteration 26 | void ppcg_inner_iteration(const int x, // 27 | const int y, // 28 | const int halo_depth, // 29 | double alpha, // 30 | double beta, // 31 | double *u, // 32 | double *r, // 33 | const double *kx, // 34 | const double *ky, // 35 | double *sd) { 36 | 37 | Range2d range(halo_depth, halo_depth, x - halo_depth, y - halo_depth); 38 | ranged it(0, range.sizeXY()); 39 | 40 | 
std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 41 | const int index = range.restore(i, x); 42 | const double smvp = tealeaf_SMVP(sd); 43 | r[index] -= smvp; 44 | u[index] += sd[index]; 45 | }); 46 | 47 | std::for_each(EXEC_POLICY, it.begin(), it.end(), [=](int i) { 48 | const int index = range.restore(i, x); 49 | sd[index] = alpha * sd[index] + beta * r[index]; 50 | }); 51 | } 52 | 53 | // PPCG solver kernels 54 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 55 | START_PROFILING(settings.kernel_profile); 56 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, chunk->r, chunk->sd); 57 | STOP_PROFILING(settings.kernel_profile, __func__); 58 | } 59 | 60 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 61 | START_PROFILING(settings.kernel_profile); 62 | ppcg_inner_iteration(chunk->x, chunk->y, settings.halo_depth, alpha, beta, chunk->u, chunk->r, chunk->kx, chunk->ky, chunk->sd); 63 | STOP_PROFILING(settings.kernel_profile, __func__); 64 | } 65 | -------------------------------------------------------------------------------- /src/std-indices/ranged.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | // A lightweight counting iterator which will be used by the STL algorithms 4 | // NB: C++ <= 17 doesn't have this built-in, and it's only added later in ranges-v3 (C++2a) which this 5 | // implementation doesn't target 6 | template class ranged { 7 | public: 8 | class iterator { 9 | friend class ranged; 10 | 11 | public: 12 | using difference_type = N; 13 | using value_type = N; 14 | using pointer = const N *; 15 | using reference = N; 16 | using iterator_category = std::random_access_iterator_tag; 17 | 18 | // XXX This is not part of the iterator spec, it gets picked up by oneDPL if enabled. 19 | // Without this, the DPL SYCL backend collects the iterator data on the host and copies to the device. 
20 | // This type is unused for any other STL impl. 21 | using is_passed_directly = std::true_type; 22 | 23 | reference operator*() const { return i_; } 24 | 25 | iterator &operator++() { 26 | ++i_; 27 | return *this; 28 | } 29 | 30 | iterator operator++(int) { 31 | iterator copy(*this); 32 | ++i_; 33 | return copy; 34 | } 35 | 36 | iterator &operator--() { 37 | --i_; 38 | return *this; 39 | } 40 | 41 | iterator operator--(int) { 42 | iterator copy(*this); 43 | --i_; 44 | return copy; 45 | } 46 | 47 | iterator &operator+=(N by) { 48 | i_ += by; 49 | return *this; 50 | } 51 | 52 | value_type operator[](const difference_type &i) const { return i_ + i; } 53 | 54 | difference_type operator-(const iterator &it) const { return i_ - it.i_; } 55 | 56 | iterator operator+(const value_type v) const { return iterator(i_ + v); } 57 | 58 | bool operator==(const iterator &other) const { return i_ == other.i_; } 59 | 60 | bool operator!=(const iterator &other) const { return i_ != other.i_; } 61 | 62 | bool operator<(const iterator &other) const { return i_ < other.i_; } 63 | 64 | protected: 65 | explicit iterator(N start) : i_(start) {} 66 | 67 | private: 68 | N i_; 69 | }; 70 | 71 | [[nodiscard]] iterator begin() const { return begin_; } 72 | 73 | [[nodiscard]] iterator end() const { return end_; } 74 | 75 | ranged(N begin, N end) : begin_(begin), end_(end) {} 76 | 77 | private: 78 | iterator begin_; 79 | iterator end_; 80 | }; -------------------------------------------------------------------------------- /src/std-indices/std_shared.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | template struct Range2d { 7 | const N fromX, toX; 8 | const N fromY, toY; 9 | 10 | constexpr inline Range2d(N fromX, N fromY, N toX, N toY) : fromX(fromX), toX(toX), fromY(fromY), toY(toY) { 11 | assert(fromX < toX); 12 | assert(fromY < toY); 13 | assert(sizeX() >= 0); 14 | assert(sizeY() >= 0); 15 | } 16 | 
[[nodiscard]] constexpr inline N sizeX() const { return toX - fromX; } 17 | [[nodiscard]] constexpr inline N sizeY() const { return toY - fromY; } 18 | [[nodiscard]] constexpr inline N sizeXY() const { return sizeX() * sizeY(); } 19 | 20 | constexpr inline N restore(N i, N xLimit) const { 21 | const int jj = (i / sizeX()) + fromX; 22 | const int kk = (i % sizeX()) + fromY; 23 | return kk + jj * xLimit; 24 | } 25 | 26 | friend std::ostream &operator<<(std::ostream &os, const Range2d &d) { 27 | os << "Range2d{" 28 | << " X[" << d.fromX << "->" << d.toX << " (" << d.sizeX() << ")]" 29 | << " Y[" << d.fromY << "->" << d.toY << " (" << d.sizeY() << ")]" 30 | << "}"; 31 | return os; 32 | } 33 | }; -------------------------------------------------------------------------------- /src/sycl-acc/chunk_extension.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | using namespace cl; 6 | 7 | using FieldBufferType = sycl::buffer *; 8 | using StagingBufferType = sycl::buffer *; 9 | 10 | struct ChunkExtension { 11 | sycl::queue *device_queue; 12 | }; 13 | -------------------------------------------------------------------------------- /src/sycl-acc/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" 4 | "c++") 5 | 6 | register_flag_required(SYCL_COMPILER 7 | "Compile using the specified SYCL compiler implementation 8 | Supported values are 9 | ONEAPI-ICPX - icpx as a standalone compiler 10 | ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) 11 | DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) 12 | HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) 13 | COMPUTECPP - ComputeCpp compiler 
(https://developer.codeplay.com/products/computecpp/ce/home)") 14 | 15 | register_flag_optional(SYCL_COMPILER_DIR 16 | "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: 17 | ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) 18 | ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 19 | HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" 20 | "") 21 | 22 | register_flag_optional(USE_HOSTTASK 23 | "Whether to use SYCL2020 host_task for MPI related calls or fallback to queue.wait() not all SYCL compilers support this" 24 | "OFF") 25 | 26 | 27 | register_flag_optional(OpenCL_LIBRARY 28 | "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" 29 | "${OpenCL_LIBRARY}") 30 | 31 | macro(setup) 32 | set(CMAKE_CXX_STANDARD 17) 33 | 34 | if (USE_HOSTTASK) 35 | register_definitions(USE_HOSTTASK) 36 | endif () 37 | 38 | 39 | if (${SYCL_COMPILER} STREQUAL "HIPSYCL") 40 | 41 | 42 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) 43 | 44 | if (NOT EXISTS "${hipSYCL_DIR}") 45 | message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") 46 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) 47 | endif () 48 | if (NOT EXISTS "${hipSYCL_DIR}") 49 | message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") 50 | endif () 51 | 52 | # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) 53 | find_package(hipSYCL CONFIG REQUIRED) 54 | message(STATUS "ok") 55 | 56 | elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") 57 | 58 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) 59 | set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) 60 | 61 | # don't point to the CL dir as the imports already have the CL prefix 62 | set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") 63 | 64 | 
register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) 65 | # ComputeCpp needs OpenCL 66 | find_package(ComputeCpp REQUIRED) 67 | 68 | # this must come after FindComputeCpp (!) 69 | set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) 70 | 71 | elseif (${SYCL_COMPILER} STREQUAL "DPCPP") 72 | set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) 73 | include_directories(${SYCL_COMPILER_DIR}/include/sycl) 74 | register_append_cxx_flags(ANY -fsycl) 75 | register_append_link_flags(-fsycl) 76 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") 77 | set(CMAKE_CXX_COMPILER icpx) 78 | set(CMAKE_C_COMPILER icx) 79 | register_append_cxx_flags(ANY -fsycl) 80 | register_append_link_flags(-fsycl) 81 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") 82 | set(CMAKE_CXX_COMPILER clang++) 83 | set(CMAKE_C_COMPILER clang) 84 | register_append_cxx_flags(ANY -fsycl) 85 | register_append_link_flags(-fsycl) 86 | else () 87 | message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") 88 | endif () 89 | 90 | endmacro() 91 | 92 | 93 | macro(setup_target NAME) 94 | if ( 95 | (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR 96 | (${SYCL_COMPILER} STREQUAL "HIPSYCL")) 97 | # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their 98 | # own custom integration header flags AFTER the target has been specified 99 | # hence this macro here 100 | add_sycl_to_target( 101 | TARGET ${NAME} 102 | SOURCES ${IMPL_SOURCES}) 103 | endif () 104 | endmacro() 105 | -------------------------------------------------------------------------------- /src/sycl-acc/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include "sycl_shared.hpp" 4 | 5 | using namespace cl::sycl; 6 | 7 | // Initialises Sd 8 | void ppcg_init(const int x, const int y, const int halo_depth, const double theta, SyclBuffer &sdBuff, SyclBuffer &rBuff, 9 | queue &device_queue) { 10 | 
device_queue.submit([&](handler &h) { 11 | auto sd = sdBuff.get_access(h); 12 | auto r = rBuff.get_access(h); 13 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 14 | const auto kk = idx[0] % x; 15 | const auto jj = idx[0] / x; 16 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 17 | sd[idx[0]] = r[idx[0]] / theta; 18 | } 19 | }); 20 | }); 21 | #ifdef ENABLE_PROFILING 22 | device_queue.wait_and_throw(); 23 | #endif 24 | } 25 | 26 | // Calculates U and R 27 | void ppcg_calc_ur(const int x, const int y, const int halo_depth, SyclBuffer &sdBuff, SyclBuffer &rBuff, SyclBuffer &uBuff, 28 | SyclBuffer &kxBuff, SyclBuffer &kyBuff, queue &device_queue) { 29 | device_queue.submit([&](handler &h) { 30 | auto sd = sdBuff.get_access(h); 31 | auto r = rBuff.get_access(h); 32 | auto u = uBuff.get_access(h); 33 | auto kx = kxBuff.get_access(h); 34 | auto ky = kyBuff.get_access(h); 35 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 36 | const auto kk = idx[0] % x; 37 | const auto jj = idx[0] / x; 38 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 39 | // smvp uses kx and ky and index 40 | int index = idx[0]; 41 | const double smvp = tealeaf_SMVP(sd); 42 | r[idx[0]] -= smvp; 43 | u[idx[0]] += sd[idx[0]]; 44 | } 45 | }); 46 | }); 47 | #ifdef ENABLE_PROFILING 48 | device_queue.wait_and_throw(); 49 | #endif 50 | } 51 | 52 | // Calculates Sd 53 | void ppcg_calc_sd(const int x, const int y, const int halo_depth, const double theta, const double alpha, const double beta, 54 | SyclBuffer &sdBuff, SyclBuffer &rBuff, queue &device_queue) { 55 | device_queue.submit([&](handler &h) { 56 | auto sd = sdBuff.get_access(h); 57 | auto r = rBuff.get_access(h); 58 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 59 | const auto kk = idx[0] % x; 60 | const auto jj = idx[0] / x; 61 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 62 | sd[idx[0]] = alpha * sd[idx[0]] 
+ beta * r[idx[0]]; 63 | } 64 | }); 65 | }); 66 | #ifdef ENABLE_PROFILING 67 | device_queue.wait_and_throw(); 68 | #endif 69 | } 70 | 71 | // PPCG solver kernels 72 | void run_ppcg_init(Chunk *chunk, Settings &settings) { 73 | START_PROFILING(settings.kernel_profile); 74 | 75 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, *(chunk->sd), *(chunk->r), *(chunk->ext->device_queue)); 76 | 77 | STOP_PROFILING(settings.kernel_profile, __func__); 78 | } 79 | 80 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 81 | START_PROFILING(settings.kernel_profile); 82 | 83 | ppcg_calc_ur(chunk->x, chunk->y, settings.halo_depth, *(chunk->sd), *(chunk->r), *(chunk->u), *(chunk->kx), *(chunk->ky), 84 | *(chunk->ext->device_queue)); 85 | 86 | ppcg_calc_sd(chunk->x, chunk->y, settings.halo_depth, chunk->theta, alpha, beta, *(chunk->sd), *(chunk->r), *(chunk->ext->device_queue)); 87 | 88 | STOP_PROFILING(settings.kernel_profile, __func__); 89 | } 90 | -------------------------------------------------------------------------------- /src/sycl-acc/sycl_shared.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | using namespace cl::sycl; 6 | 7 | using SyclBuffer = buffer; 8 | 9 | template inline auto reduction_shim(buffer &b, sycl::handler &h, T init, BinaryOp f) { 10 | #if defined(__HIPSYCL__) || defined(__OPENSYCL__) 11 | return sycl::reduction(b. 
template get_access(h), init, f); 12 | #else 13 | return sycl::reduction(b, h, init, f, sycl::property::reduction::initialize_to_identity()); 14 | #endif 15 | } -------------------------------------------------------------------------------- /src/sycl-usm/cheby.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include "sycl_shared.hpp" 4 | 5 | using namespace cl::sycl; 6 | 7 | // Initialises the Chebyshev solver 8 | void cheby_init(const int x, // 9 | const int y, // 10 | const int halo_depth, // 11 | const double theta, // 12 | SyclBuffer &p, // 13 | SyclBuffer &r, // 14 | SyclBuffer &u, // 15 | SyclBuffer &u0, // 16 | SyclBuffer &w, // 17 | SyclBuffer &kx, // 18 | SyclBuffer &ky, // 19 | queue &device_queue) { 20 | device_queue 21 | .submit([&](handler &h) { 22 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 23 | const auto kk = idx[0] % x; 24 | const auto jj = idx[0] / x; 25 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 26 | // smvp uses kx and ky and index 27 | int index = idx[0]; 28 | const double smvp = tealeaf_SMVP(u); 29 | w[idx[0]] = smvp; 30 | // could make w write only and then use smvp here 31 | r[idx[0]] = u0[idx[0]] - w[idx[0]]; 32 | p[idx[0]] = r[idx[0]] / theta; 33 | } 34 | }); 35 | }) 36 | .wait_and_throw(); 37 | } 38 | 39 | // Calculates U 40 | void cheby_calc_u(const int x, // 41 | const int y, // 42 | const int halo_depth, // 43 | SyclBuffer &p, // 44 | SyclBuffer &u, // 45 | queue &device_queue) { 46 | device_queue.submit([&](handler &h) { 47 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 48 | const auto kk = idx[0] % x; 49 | const auto jj = idx[0] / x; 50 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 51 | u[idx[0]] += p[idx[0]]; 52 | } 53 | }); 54 | }); 55 | #ifdef ENABLE_PROFILING 56 | device_queue.wait_and_throw(); 57 | #endif 58 | } 59 | 60 | // The main 
Cheby iteration step 61 | void cheby_iterate(const int x, // 62 | const int y, // 63 | const int halo_depth, // 64 | const double alpha, // 65 | const double beta, // 66 | SyclBuffer &p, // 67 | SyclBuffer &r, // 68 | SyclBuffer &u, // 69 | SyclBuffer &u0, // 70 | SyclBuffer &w, // 71 | SyclBuffer &kx, // 72 | SyclBuffer &ky, // 73 | queue &device_queue) { 74 | device_queue.submit([&](handler &h) { 75 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 76 | const auto kk = idx[0] % x; 77 | const auto jj = idx[0] / x; 78 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 79 | // smvp uses kx and ky and index 80 | int index = idx[0]; 81 | const double smvp = tealeaf_SMVP(u); 82 | w[index] = smvp; 83 | // could make w write only and then use smvp here 84 | r[index] = u0[index] - w[index]; 85 | p[index] = alpha * p[index] + beta * r[index]; 86 | } 87 | }); 88 | }); 89 | #ifdef ENABLE_PROFILING 90 | device_queue.wait_and_throw(); 91 | #endif 92 | } 93 | 94 | // Chebyshev solver kernels 95 | void run_cheby_init(Chunk *chunk, Settings &settings) { 96 | START_PROFILING(settings.kernel_profile); 97 | 98 | cheby_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, (chunk->p), (chunk->r), (chunk->u), (chunk->u0), (chunk->w), 99 | (chunk->kx), (chunk->ky), *(chunk->ext->device_queue)); 100 | 101 | STOP_PROFILING(settings.kernel_profile, __func__); 102 | } 103 | 104 | void run_cheby_iterate(Chunk *chunk, Settings &settings, double alpha, double beta) { 105 | START_PROFILING(settings.kernel_profile); 106 | 107 | cheby_iterate(chunk->x, chunk->y, settings.halo_depth, alpha, beta, (chunk->p), (chunk->r), (chunk->u), (chunk->u0), (chunk->w), 108 | (chunk->kx), (chunk->ky), *(chunk->ext->device_queue)); 109 | 110 | cheby_calc_u(chunk->x, chunk->y, settings.halo_depth, (chunk->p), (chunk->u), *(chunk->ext->device_queue)); 111 | 112 | STOP_PROFILING(settings.kernel_profile, __func__); 113 | } 114 | 
-------------------------------------------------------------------------------- /src/sycl-usm/chunk_extension.h: --------------------------------------------------------------------------------
#pragma once

#include <CL/sycl.hpp>

using namespace cl;

// In the USM model, fields and staging areas are plain device pointers.
using FieldBufferType = double *;
using StagingBufferType = double *;

// Payload of the field-summary reduction; addition is component-wise.
struct Summary {
  double vol = 0.0;
  double mass = 0.0;
  double ie = 0.0;
  double temp = 0.0;
  [[nodiscard]] constexpr Summary operator+(const Summary &that) const { //
    return {vol + that.vol, mass + that.mass, ie + that.ie, temp + that.temp};
  }
};

// Per-chunk device state: the SYCL queue plus USM slots that receive the
// results of the various solver reductions.
struct ChunkExtension {
  sycl::queue *device_queue;
  double *reduction_cg_rro;
  double *reduction_cg_pw;
  double *reduction_cg_rrn;
  double *reduction_jacobi_error;
  double *reduction_norm;
  Summary *reduction_field_summary;
};
-------------------------------------------------------------------------------- /src/sycl-usm/jacobi.cpp: --------------------------------------------------------------------------------
#include "chunk.h"
#include "shared.h"
#include "sycl_shared.hpp"

using namespace cl::sycl;

// Initialises the Jacobi solver: u0 = u = energy*density on inner cells, and
// the conduction coefficients kx/ky from density (or its reciprocal when the
// coefficient is not CONDUCTIVITY).
void jacobi_init(const int x,          //
                 const int y,          //
                 const int halo_depth, //
                 const int coefficient, //
                 const double rx,      //
                 const double ry,      //
                 SyclBuffer &u,        //
                 SyclBuffer &u0,       //
                 SyclBuffer &density,  //
                 SyclBuffer &energy,   //
                 SyclBuffer &kx,       //
                 SyclBuffer &ky,       //
                 queue &device_queue) {
  device_queue
      .submit([&](handler &h) {
        h.parallel_for(range<1>(x * y), [=](id<1> idx) {
          const auto kk = idx[0] % x;
          const auto jj = idx[0] / x;
          if (kk > 0 && kk < x - 1 && jj > 0 && jj < y - 1) {
            u0[idx[0]] = energy[idx[0]] * density[idx[0]];
            u[idx[0]] = u0[idx[0]];
          }
          if (jj >= halo_depth && jj < y - 1 && kk >= halo_depth && kk < x - 1) {
            double
densityCentre = (coefficient == CONDUCTIVITY) ? density[idx[0]] : 1.0 / density[idx[0]]; 32 | double densityLeft = (coefficient == CONDUCTIVITY) ? density[idx[0] - 1] : 1.0 / density[idx[0] - 1]; 33 | double densityDown = (coefficient == CONDUCTIVITY) ? density[idx[0] - x] : 1.0 / density[idx[0] - x]; 34 | 35 | kx[idx[0]] = rx * (densityLeft + densityCentre) / (2.0 * densityLeft * densityCentre); 36 | ky[idx[0]] = ry * (densityDown + densityCentre) / (2.0 * densityDown * densityCentre); 37 | } 38 | }); 39 | }) 40 | .wait_and_throw(); 41 | #ifdef ENABLE_PROFILING 42 | device_queue.wait_and_throw(); 43 | #endif 44 | } 45 | 46 | // Main Jacobi solver method. 47 | void jacobi_iterate(const int x, // 48 | const int y, // 49 | const int halo_depth, // 50 | SyclBuffer &u, // 51 | SyclBuffer &u0, // 52 | SyclBuffer &r, // 53 | SyclBuffer &kx, // 54 | SyclBuffer &ky, // 55 | SyclBuffer &error_temp, // 56 | double *error, // 57 | queue &device_queue) { 58 | auto event = device_queue.submit([&](handler &h) { 59 | h.parallel_for( // 60 | range<1>(x * y), // 61 | reduction_shim(error_temp, *error, sycl::plus()), // 62 | [=](item<1> item, auto &acc) { 63 | const auto kk = item[0] % x; 64 | const auto jj = item[0] / x; 65 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 66 | u[item[0]] = (u0[item[0]] + (kx[item[0] + 1] * r[item[0] + 1] + kx[item[0]] * r[item[0] - 1]) + 67 | (ky[item[0] + x] * r[item[0] + x] + ky[item[0]] * r[item[0] - x])) / 68 | (1.0 + (kx[item[0]] + kx[item[0] + 1]) + (ky[item[0]] + ky[item[0] + x])); 69 | acc += ::fabs((u[item[0]] - r[item[0]])); // fabs is float version of abs 70 | } 71 | }); 72 | }); 73 | device_queue.copy(error_temp, error, 1, event).wait_and_throw(); 74 | #ifdef ENABLE_PROFILING 75 | device_queue.wait_and_throw(); 76 | #endif 77 | } 78 | 79 | // Copies u into r 80 | void jacobi_copy_u(const int x, // 81 | const int y, // 82 | SyclBuffer &r, // 83 | SyclBuffer &u, // 84 | queue &device_queue) { 85 
| device_queue.submit([&](handler &h) { h.parallel_for(range<1>(x * y), [=](id<1> idx) { r[idx[0]] = u[idx[0]]; }); }); 86 | #ifdef ENABLE_PROFILING 87 | device_queue.wait_and_throw(); 88 | #endif 89 | } 90 | 91 | // Jacobi solver kernels 92 | void run_jacobi_init(Chunk *chunk, Settings &settings, double rx, double ry) { 93 | START_PROFILING(settings.kernel_profile); 94 | 95 | jacobi_init(chunk->x, chunk->y, settings.halo_depth, settings.coefficient, rx, ry, (chunk->u), (chunk->u0), (chunk->density), 96 | (chunk->energy), (chunk->kx), (chunk->ky), *(chunk->ext->device_queue)); 97 | 98 | STOP_PROFILING(settings.kernel_profile, __func__); 99 | } 100 | 101 | void run_jacobi_iterate(Chunk *chunk, Settings &settings, double *error) { 102 | START_PROFILING(settings.kernel_profile); 103 | 104 | jacobi_copy_u(chunk->x, chunk->y, (chunk->r), (chunk->u), *(chunk->ext->device_queue)); 105 | 106 | jacobi_iterate(chunk->x, chunk->y, settings.halo_depth, (chunk->u), (chunk->u0), (chunk->r), (chunk->kx), (chunk->ky), 107 | (chunk->ext->reduction_jacobi_error), error, *(chunk->ext->device_queue)); 108 | 109 | STOP_PROFILING(settings.kernel_profile, __func__); 110 | } 111 | -------------------------------------------------------------------------------- /src/sycl-usm/model.cmake: -------------------------------------------------------------------------------- 1 | 2 | register_flag_optional(CMAKE_CXX_COMPILER 3 | "Any CXX compiler that is supported by CMake detection, this is used for host compilation when required by the SYCL compiler" 4 | "c++") 5 | 6 | register_flag_required(SYCL_COMPILER 7 | "Compile using the specified SYCL compiler implementation 8 | Supported values are 9 | ONEAPI-ICPX - icpx as a standalone compiler 10 | ONEAPI-Clang - oneAPI's Clang driver (enabled via `source /opt/intel/oneapi/setvars.sh --include-intel-llvm`) 11 | DPCPP - dpc++ as a standalone compiler (https://github.com/intel/llvm) 12 | HIPSYCL - hipSYCL compiler (https://github.com/illuhad/hipSYCL) 13 
| COMPUTECPP - ComputeCpp compiler (https://developer.codeplay.com/products/computecpp/ce/home)") 14 | 15 | register_flag_optional(SYCL_COMPILER_DIR 16 | "Absolute path to the selected SYCL compiler directory, most are packaged differently so set the path according to `SYCL_COMPILER`: 17 | ONEAPI-ICPX - `icpx` must be used for OneAPI 2023 and later on releases (i.e `source /opt/intel/oneapi/setvars.sh` first) 18 | ONEAPI-Clang - set to the directory that contains the Intel clang++ binary. 19 | HIPSYCL|DPCPP|COMPUTECPP - set to the root of the binary distribution that contains at least `bin/`, `include/`, and `lib/`" 20 | "") 21 | 22 | register_flag_optional(USE_HOSTTASK 23 | "Whether to use SYCL2020 host_task for MPI related calls or fallback to queue.wait() not all SYCL compilers support this" 24 | "OFF") 25 | 26 | 27 | register_flag_optional(OpenCL_LIBRARY 28 | "[ComputeCpp only] Path to OpenCL library, usually called libOpenCL.so" 29 | "${OpenCL_LIBRARY}") 30 | 31 | macro(setup) 32 | set(CMAKE_CXX_STANDARD 17) 33 | 34 | if (USE_HOSTTASK) 35 | register_definitions(USE_HOSTTASK) 36 | endif () 37 | 38 | 39 | if (${SYCL_COMPILER} STREQUAL "HIPSYCL") 40 | 41 | 42 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake/hipSYCL) 43 | 44 | if (NOT EXISTS "${hipSYCL_DIR}") 45 | message(WARNING "Falling back to hipSYCL < 0.9.0 CMake structure") 46 | set(hipSYCL_DIR ${SYCL_COMPILER_DIR}/lib/cmake) 47 | endif () 48 | if (NOT EXISTS "${hipSYCL_DIR}") 49 | message(FATAL_ERROR "Can't find the appropriate CMake definitions for hipSYCL") 50 | endif () 51 | 52 | # register_definitions(_GLIBCXX_USE_CXX11_ABI=0) 53 | find_package(hipSYCL CONFIG REQUIRED) 54 | message(STATUS "ok") 55 | 56 | elseif (${SYCL_COMPILER} STREQUAL "COMPUTECPP") 57 | 58 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) 59 | set(ComputeCpp_DIR ${SYCL_COMPILER_DIR}) 60 | 61 | # don't point to the CL dir as the imports already have the CL prefix 62 | set(OpenCL_INCLUDE_DIR "${CMAKE_SOURCE_DIR}") 63 | 
64 | register_definitions(CL_TARGET_OPENCL_VERSION=220 _GLIBCXX_USE_CXX11_ABI=0) 65 | # ComputeCpp needs OpenCL 66 | find_package(ComputeCpp REQUIRED) 67 | 68 | # this must come after FindComputeCpp (!) 69 | set(COMPUTECPP_USER_FLAGS -O3 -no-serial-memop) 70 | 71 | elseif (${SYCL_COMPILER} STREQUAL "DPCPP") 72 | set(CMAKE_CXX_COMPILER ${SYCL_COMPILER_DIR}/bin/clang++) 73 | include_directories(${SYCL_COMPILER_DIR}/include/sycl) 74 | register_append_cxx_flags(ANY -fsycl) 75 | register_append_link_flags(-fsycl) 76 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-ICPX") 77 | set(CMAKE_CXX_COMPILER icpx) 78 | set(CMAKE_C_COMPILER icx) 79 | register_append_cxx_flags(ANY -fsycl) 80 | register_append_link_flags(-fsycl) 81 | elseif (${SYCL_COMPILER} STREQUAL "ONEAPI-Clang") 82 | set(CMAKE_CXX_COMPILER clang++) 83 | set(CMAKE_C_COMPILER clang) 84 | register_append_cxx_flags(ANY -fsycl) 85 | register_append_link_flags(-fsycl) 86 | else () 87 | message(FATAL_ERROR "SYCL_COMPILER=${SYCL_COMPILER} is unsupported") 88 | endif () 89 | 90 | endmacro() 91 | 92 | 93 | macro(setup_target NAME) 94 | if ( 95 | (${SYCL_COMPILER} STREQUAL "COMPUTECPP") OR 96 | (${SYCL_COMPILER} STREQUAL "HIPSYCL")) 97 | # so ComputeCpp and hipSYCL has this weird (and bad) CMake usage where they append their 98 | # own custom integration header flags AFTER the target has been specified 99 | # hence this macro here 100 | add_sycl_to_target( 101 | TARGET ${NAME} 102 | SOURCES ${IMPL_SOURCES}) 103 | endif () 104 | endmacro() 105 | -------------------------------------------------------------------------------- /src/sycl-usm/ppcg.cpp: -------------------------------------------------------------------------------- 1 | #include "chunk.h" 2 | #include "shared.h" 3 | #include "sycl_shared.hpp" 4 | 5 | using namespace cl::sycl; 6 | 7 | // Initialises Sd 8 | void ppcg_init(const int x, // 9 | const int y, // 10 | const int halo_depth, // 11 | const double theta, // 12 | SyclBuffer &sd, // 13 | SyclBuffer &r, // 14 | 
queue &device_queue) { 15 | device_queue.submit([&](handler &h) { 16 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 17 | const auto kk = idx[0] % x; 18 | const auto jj = idx[0] / x; 19 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 20 | sd[idx[0]] = r[idx[0]] / theta; 21 | } 22 | }); 23 | }); 24 | #ifdef ENABLE_PROFILING 25 | device_queue.wait_and_throw(); 26 | #endif 27 | } 28 | 29 | // Calculates U and R 30 | void ppcg_calc_ur(const int x, // 31 | const int y, // 32 | const int halo_depth, // 33 | SyclBuffer &sd, // 34 | SyclBuffer &r, // 35 | SyclBuffer &u, // 36 | SyclBuffer &kx, // 37 | SyclBuffer &ky, // 38 | queue &device_queue) { 39 | device_queue.submit([&](handler &h) { 40 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 41 | const auto kk = idx[0] % x; 42 | const auto jj = idx[0] / x; 43 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 44 | // smvp uses kx and ky and index 45 | int index = idx[0]; 46 | const double smvp = tealeaf_SMVP(sd); 47 | r[idx[0]] -= smvp; 48 | u[idx[0]] += sd[idx[0]]; 49 | } 50 | }); 51 | }); 52 | #ifdef ENABLE_PROFILING 53 | device_queue.wait_and_throw(); 54 | #endif 55 | } 56 | 57 | // Calculates Sd 58 | void ppcg_calc_sd(const int x, // 59 | const int y, // 60 | const int halo_depth, // 61 | const double theta, // 62 | const double alpha, // 63 | const double beta, // 64 | SyclBuffer &sd, // 65 | SyclBuffer &r, // 66 | queue &device_queue) { 67 | device_queue.submit([&](handler &h) { 68 | h.parallel_for(range<1>(x * y), [=](id<1> idx) { 69 | const auto kk = idx[0] % x; 70 | const auto jj = idx[0] / x; 71 | if (kk >= halo_depth && kk < x - halo_depth && jj >= halo_depth && jj < y - halo_depth) { 72 | sd[idx[0]] = alpha * sd[idx[0]] + beta * r[idx[0]]; 73 | } 74 | }); 75 | }); 76 | #ifdef ENABLE_PROFILING 77 | device_queue.wait_and_throw(); 78 | #endif 79 | } 80 | 81 | // PPCG solver kernels 82 | void run_ppcg_init(Chunk *chunk, 
Settings &settings) { 83 | START_PROFILING(settings.kernel_profile); 84 | 85 | ppcg_init(chunk->x, chunk->y, settings.halo_depth, chunk->theta, (chunk->sd), (chunk->r), *(chunk->ext->device_queue)); 86 | 87 | STOP_PROFILING(settings.kernel_profile, __func__); 88 | } 89 | 90 | void run_ppcg_inner_iteration(Chunk *chunk, Settings &settings, double alpha, double beta) { 91 | START_PROFILING(settings.kernel_profile); 92 | 93 | ppcg_calc_ur(chunk->x, chunk->y, settings.halo_depth, (chunk->sd), (chunk->r), (chunk->u), (chunk->kx), (chunk->ky), 94 | *(chunk->ext->device_queue)); 95 | 96 | ppcg_calc_sd(chunk->x, chunk->y, settings.halo_depth, chunk->theta, alpha, beta, (chunk->sd), (chunk->r), *(chunk->ext->device_queue)); 97 | 98 | STOP_PROFILING(settings.kernel_profile, __func__); 99 | } 100 | -------------------------------------------------------------------------------- /src/sycl-usm/sycl_shared.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | using namespace cl::sycl; 6 | 7 | using SyclBuffer = double *; 8 | 9 | template inline auto reduction_shim(T *b, T init, BinaryOp f) { 10 | #if defined(__HIPSYCL__) || defined(__OPENSYCL__) 11 | return sycl::reduction(b, init, f); 12 | #else 13 | return sycl::reduction(b, init, f, sycl::property::reduction::initialize_to_identity()); 14 | #endif 15 | } 16 | -------------------------------------------------------------------------------- /tea.in: -------------------------------------------------------------------------------- 1 | *tea 2 | state 1 density=100.0 energy=0.0001 3 | state 2 density=0.1 energy=25.0 geometry=rectangle xmin=0.0 xmax=1.0 ymin=1.0 ymax=2.0 4 | state 3 density=0.1 energy=0.1 geometry=rectangle xmin=1.0 xmax=6.0 ymin=1.0 ymax=2.0 5 | state 4 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=6.0 ymin=1.0 ymax=8.0 6 | state 5 density=0.1 energy=0.1 geometry=rectangle xmin=5.0 xmax=10.0 ymin=7.0 ymax=8.0 7 | x_cells=512 8 | 
y_cells=512 9 | xmin=0.0 10 | ymin=0.0 11 | xmax=10.0 12 | ymax=10.0 13 | initial_timestep=0.004 14 | end_step=20 15 | max_iters=10000 16 | #use_chebyshev 17 | #use_ppcg 18 | #use_jacobi 19 | use_cg 20 | eps 1.0e-15 21 | test_problem 5 22 | profiler_on 23 | use_c_kernels 24 | *endtea 25 | -------------------------------------------------------------------------------- /tea.problems: -------------------------------------------------------------------------------- 1 | 512 512 20 1.034697091898282e+02 2 | 4096 4096 1 8.789826115915487e+01 3 | 1024 1024 20 1.012100932683400e+02 4 | 64 64 1 1.084697720003111e+02 5 | 1000 1000 10 9.727733205075556e+01 6 | 2000 2000 10 9.605026999605091e+01 7 | 4000 4000 10 9.5462351582214282e+01 8 | 8000 8000 10 9.517473876862078e+01 9 | 1000 1000 4 9.348844542172745e+01 10 | 2000 2000 4 9.213313338102208e+01 11 | 4000 4000 4 9.150788514428132e+01 12 | 8000 8000 4 9.120746325516782e+01 13 | 1000 1000 2 9.161051549004094e+01 14 | 2000 2000 2 9.010618606381739e+01 15 | 4000 4000 2 8.944258537125111e+01 16 | 8000 8000 2 8.913203173864531e+01 17 | --------------------------------------------------------------------------------