├── apps
    ├── choleskey
    │   ├── CMakeLists.txt
    │   ├── matrixutil.hpp
    │   ├── choleskey_serial.cpp
    │   ├── choleskey_stdpar.cpp
    │   └── choleskey_stdpar_snd.cpp
    ├── comm-study
    │   ├── CMakeLists.txt
    │   └── comm-study-no-senders.cpp
    ├── mdspan-stdpar
    │   ├── CMakeLists.txt
    │   └── mdspan-stdpar.cpp
    ├── 1d-stencil
    │   ├── CMakeLists.txt
    │   ├── 1d-cuda.cpp
    │   ├── 1d-omp.cpp
    │   ├── 1d-serial.cpp
    │   ├── 1d-stdpar.cpp
    │   └── 1d-stdexec.cpp
    ├── heat-equation
    │   ├── CMakeLists.txt
    │   ├── heat-equation.hpp
    │   ├── heat-equation-stdpar.cpp
    │   ├── heat-equation-serial.cpp
    │   ├── heat-equation-omp.cpp
    │   ├── heat-equation-stdexec.cpp
    │   └── heat-equation-cuda.cpp
    ├── prefixSum
    │   ├── CMakeLists.txt
    │   ├── prefixSum.hpp
    │   ├── prefixSum-stdpar.cpp
    │   ├── prefixSum-serial.cpp
    │   └── prefixSum-stdexec.cpp
    ├── fft
    │   ├── CMakeLists.txt
    │   ├── fft-serial.cpp
    │   ├── fft-stdpar.cpp
    │   ├── fft-stdexec.cpp
    │   └── fft.hpp
    └── CMakeLists.txt
├── .github
    └── workflows
    │   └── format_check.yml
├── LICENSE
├── scripts
    ├── fft.nvhpc.gpu.sh
    ├── pm-localrc
    │   └── localrc
    ├── heat-run.gcc.sh
    ├── fft.nvhpc.grace.cpu.sh
    ├── fft.nvhpc.cpu.sh
    ├── heat-run.nvhpc.grace.cpu.sh
    ├── stencil.nvhpc.grace.cpu.sh
    ├── heat-run.nvhpc.cpu.sh
    ├── stencil.nvhpc.cpu.sh
    ├── benchmark.sh
    └── fft.ncu.nsys.gpu.sh
├── README.md
├── .clang-format
├── .gitignore
├── .gitlab-ci.yml
├── include
    ├── counting_iterator.hpp
    └── commons.hpp
├── CMakeLists.txt
└── .cmake-format.py


/apps/choleskey/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | project(choleskey_stdpar LANGUAGES CXX)
 2 | 
 3 | # executable built from choleskey_serial.cpp
 4 | add_executable(choleskey_serial choleskey_serial.cpp)
 5 | target_include_directories(
 6 |   choleskey_serial
 7 |   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
 8 |           ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
 9 | # link privately: an executable has no consumers to propagate to
10 | target_link_libraries(choleskey_serial PRIVATE hpcpp-core)
11 | 
12 | # executable built from choleskey_stdpar.cpp; additionally links stdexec
13 | add_executable(choleskey_stdpar choleskey_stdpar.cpp)
14 | target_link_libraries(choleskey_stdpar PRIVATE stdexec hpcpp-core)
15 | target_include_directories(
16 |   choleskey_stdpar
17 |   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
18 |           ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
19 | 
20 | # TODO: remove this example add_executable(choleskey_stdpar_snd
21 | # choleskey_stdpar_snd.cpp) target_link_libraries(choleskey_stdpar_snd stdexec)
22 | # target_include_directories( choleskey_stdpar_snd PRIVATE ${CMAKE_BINARY_DIR}
23 | # ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR}
24 | # ${MDSPAN_INCLUDE_DIR})
25 | 


--------------------------------------------------------------------------------
/apps/comm-study/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | project(comm-study LANGUAGES CXX)
 2 | 
 3 | # re-glob at build time so added or removed sources trigger a re-configure
 4 | file(GLOB CPP_SOURCES CONFIGURE_DEPENDS "*.cpp")
 5 | 
 6 | foreach(source_file ${CPP_SOURCES})
 7 |   # get the file name without an extension
 8 |   get_filename_component(exec_name ${source_file} NAME_WE)
 9 | 
10 |   # add an executable with the same name as the source file
11 |   add_executable(${exec_name} ${_EXCLUDE} ${source_file})
12 | 
13 |   # LINKER_LANGUAGE is a target property and has no effect on a source file
14 |   set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX)
15 |   target_include_directories(
16 |     ${exec_name}
17 |     PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
18 |             ${MDSPAN_INCLUDE_DIR})
19 | 
20 |   target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec hpcpp-core)
21 | 
22 |   set_target_properties(
23 |     ${exec_name}
24 |     PROPERTIES CXX_STANDARD ${CXX_STANDARD}
25 |                CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS}
26 |                INSTALL_RPATH_USE_LINK_PATH ON)
27 | 
28 |   # installation
29 |   install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR})
30 | endforeach()
31 | 


--------------------------------------------------------------------------------
/.github/workflows/format_check.yml:
--------------------------------------------------------------------------------
 1 | # Checks that C++ sources and CMake files in a pull request are formatted with
 2 | # clang-format and cmake-format; any diff after re-formatting fails the job.
 3 | name: Format Check
 4 | 
 5 | on:
 6 |   pull_request:
 7 |     paths:
 8 |       # keep in sync with the file patterns used by the `find` commands below
 9 |       - '**.cpp'
10 |       - '**.cc'
11 |       - '**.h'
12 |       - '**.hpp'
13 |       - '**/CMakeLists.txt'
14 |       - '**.cmake'
15 | 
16 | jobs:
17 |   check-format:
18 |     runs-on: ubuntu-latest
19 | 
20 |     steps:
21 |     - name: Checkout code
22 |       uses: actions/checkout@v4
23 | 
24 |     - name: Install clang-format
25 |       # -y: the runner has no terminal to answer apt's confirmation prompt
26 |       run: sudo apt-get update && sudo apt-get install -y clang-format
27 | 
28 |     - name: Install cmake-format
29 |       run: pip install cmake-format
30 | 
31 |     - name: Check C++ format
32 |       run: |
33 |         clang-format --version
34 |         find . \( -name '*.cpp' -o -name '*.h' -o -name '*.hpp' -o -name '*.cc' \) -print0 | xargs -0 -r clang-format -i
35 |         git diff --exit-code || (echo "Code was not formatted using clang-format! Please format your code." && exit 1)
36 | 
37 |     - name: Check CMake format
38 |       run: |
39 |         find . \( -name 'CMakeLists.txt' -o -name '*.cmake' \) -print0 | xargs -0 -r cmake-format -i
40 |         git diff --exit-code || (echo "CMake files were not formatted using cmake-format! Please format your files." && exit 1)
41 | 


--------------------------------------------------------------------------------
/apps/mdspan-stdpar/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | project(mdspan-stdpar LANGUAGES CXX)
 2 | 
 3 | add_executable(mdspan-stdpar ${_EXCLUDE}
 4 |                              ${CMAKE_CURRENT_LIST_DIR}/mdspan-stdpar.cpp)
 5 | 
 6 | set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/mdspan-stdpar.cpp
 7 |                             PROPERTIES LANGUAGE CXX LINKER_LANGUAGE CXX)
 8 | 
 9 | # add dependencies (not applicable yet) add_dependencies(mdspan-stdpar
10 | # magic_enum argparse)
11 | 
12 | # include core/include and generated files. DO NOT include
13 | # ${MAGICENUM_INCLUDE_DIR} as it results in an internal error at templates due
14 | # to CUDA compiler.
15 | target_include_directories(
16 |   mdspan-stdpar
17 |   PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
18 |           ${MDSPAN_INCLUDE_DIR})
19 | 
20 | target_link_libraries(mdspan-stdpar PUBLIC ${MPI_LIBS} stdexec hpcpp-core)
21 | 
22 | set_target_properties(
23 |   mdspan-stdpar
24 |   PROPERTIES CXX_STANDARD ${CXX_STANDARD}
25 |              CXX_EXTENSIONS NO
26 |              INSTALL_RPATH_USE_LINK_PATH ON)
27 | 
28 | # installation
29 | install(TARGETS mdspan-stdpar DESTINATION ${CMAKE_INSTALL_BINDIR})
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) The Regents of the University of California (Muhammad Haseeb, Weile Wei), 2023
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/apps/choleskey/matrixutil.hpp:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <iostream>
 4 | #include <vector>
 5 | 
 6 | // generate positive-definite matrix
 7 | template <typename T>
 8 | using Matrix = std::vector<std::vector<T>>;
 9 | 
10 | // Build the n x n Pascal matrix and return it flattened in row-major order.
11 | // Entries in the first row and first column are 1; every other entry is the
12 | // sum of its left and upper neighbours. A non-positive n yields an empty vector.
13 | template <typename T>
14 | std::vector<T> generate_pascal_matrix(const int n) {
15 |     const std::size_t dim = n > 0 ? static_cast<std::size_t>(n) : 0;
16 | 
17 |     // Start from all ones so row 0 and column 0 are already correct.
18 |     std::vector<T> flat(dim * dim, static_cast<T>(1));
19 | 
20 |     // Fill the interior directly in the flat buffer; no nested copy is needed.
21 |     for (std::size_t i = 1; i < dim; ++i) {
22 |         for (std::size_t j = 1; j < dim; ++j) {
23 |             flat[i * dim + j] = flat[i * dim + j - 1] + flat[(i - 1) * dim + j];
24 |         }
25 |     }
26 | 
27 |     // Return by name: std::move here would only inhibit copy elision.
28 |     return flat;
29 | }
30 | 
31 | // command-line parameters of the Cholesky apps, declared through argparse::Args
32 | struct args_params_t : public argparse::Args {
33 |     // whether to print the generated results; the default is true
34 |     bool& results = kwarg("results", "print generated results (default: true)").set_default(true);
35 |     // dimension of the input matrix; the default is 10
36 |     std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)").set_default(10);
37 |     // number of partitions; the default is 4
38 |     std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4);
39 |     // help flag, registered under the names "h, help"
40 |     bool& help = flag("h, help", "print help");
41 |     // whether to print timing information; the default is true
42 |     bool& time = kwarg("t, time", "print time").set_default(true);
43 | };
44 | 


--------------------------------------------------------------------------------
/scripts/fft.nvhpc.gpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #
 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and
 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did
 6 | # for GCC compiler script before running this.
 7 | #
 8 | 
 9 | #SBATCH -A nstaff_g
10 | #SBATCH -C gpu
11 | #SBATCH --qos=regular
12 | #SBATCH --time=8:00:00
13 | #SBATCH --nodes=1
14 | #SBATCH --gpus=4
15 | #SBATCH --ntasks-per-node=4
16 | #SBATCH --exclusive
17 | #SBATCH --gpu-bind=none
18 | #SBATCH -o fft-gpu.o%j
19 | #SBATCH -e fft-gpu.e%j
20 | #SBATCH -J FFT-GPU
21 | 
22 | set +x
23 | 
24 | mkdir -p ${HOME}/repos/nvstdpar/build-fft-gpu
25 | cd ${HOME}/repos/nvstdpar/build-fft-gpu
26 | rm -rf ./*
27 | 
28 | ml unload cudatoolkit
29 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles
30 | ml nvhpc/23.7
31 | # need this for GLIBC
32 | ml gcc/12.2.0
33 | ml cmake/3.24
34 | 
35 | cmake .. -DSTDPAR=gpu -DOMP=gpu -DCMAKE_CXX_COMPILER=$(which nvc++)
36 | 
37 | make -j fft-stdexec fft-stdpar
38 | 
39 | cd ${HOME}/repos/nvstdpar/build-fft-gpu/apps/fft
40 | 
41 | D=(536870912 1073741824)  # values passed to -N below: 2^29 and 2^30
42 | 
43 | for d in "${D[@]}"; do  # first pass: one stdexec (--sch=gpu) and one stdpar run per size
44 |     echo "stdexec:gpu for ${d}"
45 |     srun -n 1 ./fft-stdexec -N ${d} --time --sch=gpu
46 | 
47 |     echo "stdpar:gpu for ${d}"
48 |     srun -n 1  ./fft-stdpar -N ${d} --time 2>&1
49 | done
50 | 
51 | for d in "${D[@]}"; do  # second pass: stdexec again, this time with --sch=multigpu
52 |     echo "stdexec:multi_gpu for ${d}"
53 |     srun -n 1 ./fft-stdexec -N ${d} --time --sch=multigpu 2>&1
54 | done
55 | 
56 | 


--------------------------------------------------------------------------------
/scripts/pm-localrc/localrc:
--------------------------------------------------------------------------------
 1 | set LFC=-lgfortran;
 2 | set LDSO=/lib64/ld-linux-x86-64.so.2;
 3 | set GCCDIR=/opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/;
 4 | set G77DIR=/opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/;
 5 | set OEM_INFO=64-bit target on x86-64 Linux $INFOTPVAL;
 6 | set GNUATOMIC=-latomic;
 7 | set GCCINC=/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/12.0/include /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/include /opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include /usr/local/include /opt/cray/pe/gcc/12.2.0/snos/include /opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include-fixed /usr/include;
 8 | set GPPDIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/12.0/include /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/include /opt/cray/pe/gcc/12.2.0/snos/include/g++ /opt/cray/pe/gcc/12.2.0/snos/include/g++/x86_64-suse-linux /opt/cray/pe/gcc/12.2.0/snos/include/g++/backward /opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include /usr/local/include /opt/cray/pe/gcc/12.2.0/snos/include /opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include-fixed /usr/include;
 9 | set NUMALIBNAME=-lnuma;
10 | set LOCALRC=YES;
11 | set EXTENSION=__extension__=;
12 | set LC=-lgcc -lc $if(-Bstatic,-lgcc_eh, -lgcc_s);
13 | set DEFCUDAVERSION=12.0;
14 | set DEFSTDPARCOMPUTECAP=80;
15 | # GLIBC version 2.31
16 | # GCC version 12.2.0
17 | set GCCVERSION=120200;
18 | set LIBNCURSES=YES;
19 | export PGI=$COMPBASE;


--------------------------------------------------------------------------------
/apps/1d-stencil/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | project(1d-stencil LANGUAGES CXX)
 2 | 
 3 | # re-glob at build time so added or removed sources trigger a re-configure
 4 | file(GLOB CPP_SOURCES CONFIGURE_DEPENDS "*.cpp")
 5 | 
 6 | foreach(source_file ${CPP_SOURCES})
 7 |   # get the file name without an extension
 8 |   get_filename_component(exec_name ${source_file} NAME_WE)
 9 | 
10 |   # match the file name only: the checkout path may itself contain cuda/gpu
11 |   if(NOT STDPAR STREQUAL "gpu")
12 |     if("${exec_name}" MATCHES "stdpar.*gpu|gpu.*stdpar|cuda")
13 |       message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}")
14 |       continue()
15 |     endif()
16 |   endif()
17 | 
18 |   # add an executable with the same name as the source file
19 |   add_executable(${exec_name} ${_EXCLUDE} ${source_file})
20 | 
21 |   # add dependency on argparse
22 |   add_dependencies(${exec_name} argparse)
23 | 
24 |   # LINKER_LANGUAGE is a target property and has no effect on a source file
25 |   set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX)
26 |   target_include_directories(
27 |     ${exec_name}
28 |     PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
29 |             ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
30 | 
31 |   target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec hpcpp-core)
32 | 
33 |   set_target_properties(
34 |     ${exec_name}
35 |     PROPERTIES CXX_STANDARD ${CXX_STANDARD}
36 |                CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS}
37 |                INSTALL_RPATH_USE_LINK_PATH ON)
38 | 
39 |   # installation
40 |   install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR})
41 | endforeach()
42 | 


--------------------------------------------------------------------------------
/apps/heat-equation/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | project(heat-equation LANGUAGES CXX)
 2 | 
 3 | # re-glob at build time so added or removed sources trigger a re-configure
 4 | file(GLOB CPP_SOURCES CONFIGURE_DEPENDS "*.cpp")
 5 | 
 6 | foreach(source_file ${CPP_SOURCES})
 7 |   # get the file name without an extension
 8 |   get_filename_component(exec_name ${source_file} NAME_WE)
 9 | 
10 |   # match the file name only: the checkout path may itself contain cuda/gpu
11 |   if(NOT STDPAR STREQUAL "gpu")
12 |     if("${exec_name}" MATCHES "stdpar.*gpu|gpu.*stdpar|cuda")
13 |       message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}")
14 |       continue()
15 |     endif()
16 |   endif()
17 | 
18 |   # add an executable with the same name as the source file
19 |   add_executable(${exec_name} ${_EXCLUDE} ${source_file})
20 | 
21 |   # add dependency on argparse
22 |   add_dependencies(${exec_name} argparse)
23 | 
24 |   # LINKER_LANGUAGE is a target property and has no effect on a source file
25 |   set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX)
26 |   target_include_directories(
27 |     ${exec_name}
28 |     PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
29 |             ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
30 | 
31 |   target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec hpcpp-core)
32 | 
33 |   set_target_properties(
34 |     ${exec_name}
35 |     PROPERTIES CXX_STANDARD ${CXX_STANDARD}
36 |                CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS}
37 |                INSTALL_RPATH_USE_LINK_PATH ON)
38 | 
39 |   # installation
40 |   install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR})
41 | endforeach()
42 | 


--------------------------------------------------------------------------------
/scripts/heat-run.gcc.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #
 4 | # Read and do the following two steps before running this script:
 5 | #
 6 | #
 7 | # 1. In nvstdpar/CMakeLists.txt, replace the following line:
 8 | #
 9 | # set(CMAKE_CXX_FLAGS
10 | #    "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} --gcc-toolchain=/opt/cray/pe/gcc/12.2.0/bin/ -pthread"
11 | #
12 | # with
13 | #
14 | # set(CMAKE_CXX_FLAGS
15 | #    "${CMAKE_CXX_FLAGS} -fopenmp -pthread"
16 | #
17 | #
18 | # 2. In nvstdpar/apps/heat-equation/CMakeLists.txt, replace the following line:
19 | #
20 | # target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec)
21 | #
22 | # with
23 | #
24 | # target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec tbb)
25 | #
26 | #
27 | 
28 | set -x
29 | 
30 | # build directory, named BUILD_HOME as in the other scripts of this folder
31 | BUILD_HOME=${HOME}/repos/nvstdpar/build-gcc
32 | 
33 | mkdir -p "${BUILD_HOME}"
34 | cd "${BUILD_HOME}"
35 | 
36 | rm -rf ./*
37 | ml cmake/3.24 gcc/12.2 cudatoolkit/12.0
38 | ml unload cray-mpich
39 | 
40 | cmake .. -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which g++) -DCMAKE_CUDA_HOST_COMPILER=$(which g++)
41 | 
42 | # NOTE(review): no heat-equation-mdspan source is visible under apps/heat-equation; confirm this target still exists
43 | make -j heat-equation-omp heat-equation-mdspan heat-equation-stdpar
44 | 
45 | cd "${BUILD_HOME}/apps/heat-equation"
46 | 
47 | # `|&` already pipes stderr together with stdout, so no extra 2>&1 is needed
48 | ./heat-equation-mdspan -s=50 -n=30000 --time |& tee gcc-md.txt
49 | 
50 | # parallel runs
51 | T=(128 64 32 16 8 4 2 1)
52 | 
53 | for i in "${T[@]}"; do
54 |     ./heat-equation-omp -s=50 -n=30000 --time --nthreads=${i} |& tee gcc-omp-${i}.txt
55 | done
56 | 
57 | # will use 128 threads anyway, so the log name carries no thread count
58 | # (the loop variable ${i} is stale here and must not leak into the file name)
59 | ./heat-equation-stdpar -s=50 -n=30000 --time |& tee gcc-stdpar.txt
60 | 


--------------------------------------------------------------------------------
/scripts/fft.nvhpc.grace.cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #
 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and
 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did
 6 | # for GCC compiler script before running this.
 7 | #
 8 | 
 9 | #SBATCH -N 1
10 | #SBATCH -p cg4-cpu4x120gb-gpu4x80gb
11 | #SBATCH --gres=gpu:4
12 | #SBATCH --exclusive
13 | #SBATCH -o fft-cpu.o%j
14 | #SBATCH -e fft-cpu.e%j
15 | #SBATCH -J FFT-CPU
16 | 
17 | set +x
18 | 
19 | mkdir -p ${HOME}/repos/nvstdpar/build-fft-cpu
20 | cd ${HOME}/repos/nvstdpar/build-fft-cpu
21 | rm -rf ./*
22 | 
23 | module unload gcc; module load gcc/12.3.0; module load nvhpc/23.5; module load slurm
24 | export PATH=/home/wwei/install/cmake_3_27_3/bin/:$PATH
25 | 
26 | cmake .. -DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++)
27 | 
28 | make -j fft-serial fft-stdexec fft-stdpar
29 | 
30 | cd ${HOME}/repos/nvstdpar/build-fft-cpu/apps/fft
31 | 
32 | D=(536870912 1073741824)
33 | 
34 | # parallel runs
35 | T=(256 128 64 32 16 8 4 2 1)
36 | 
37 | unset OMP_NUM_THREADS
38 | 
39 | for d in "${D[@]}"; do
40 |     for i in "${T[@]}"; do
41 |         echo "stdexec:cpu for ${d}, threads=${i}"
42 |         srun -n 1 --cpu-bind=none ./fft-stdexec -N ${d} --time --sch=cpu --nthreads=${i}
43 | 
44 |         echo "stdpar:cpu for ${d}, threads=${i}"
45 |         # scoped to this command so the value cannot leak into the next stdexec run
46 |         OMP_NUM_THREADS=${i} srun -n 1 --cpu-bind=none ./fft-stdpar -N ${d} --time --nthreads=${i}
47 |     done
48 | done
49 | 
50 | for d in "${D[@]}"; do
51 |     echo "serial for ${d}"
52 |     srun -n 1 --cpu-bind=none ./fft-serial -N ${d} --time
53 | done
54 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # hpcpp
 2 | 
 3 | Standard C++26 High-Performance Computing (HPC) applications that run on CPUs and GPUs. 
 4 | 
 5 | ## Build
 6 | 
 7 | ```bash
 8 | git clone https://github.com/NERSC/hpcpp.git
 9 | cd hpcpp; mkdir build ; cd build
10 | ml nvhpc/23.7 cmake/3.24
11 | 
12 | # enable GPU support by setting -DSTDPAR=gpu (default)
13 | cmake .. -DSTDPAR=<gpu/multicore> -DOMP=<gpu/multicore>
14 | make -j 10
15 | ```
16 | 
17 | **Note**: Make sure your `localrc` file (located at `/path/to/nvhpc/bin`) is configured with the paths of a `GCC >= 11.2.0` installation.
18 | 
19 | ### NERSC Users
20 | 
21 | You can also use the pre-configured `localrc` file included in this repo. To use it, run:
22 | 
23 | ```bash
24 | export GCCLOCALRC=/path/to/hpcpp/scripts/pm-localrc/localrc
25 | ```
26 | 
27 | **Note**: Please uncomment the following line in `apps/fft/CMakeLists.txt` if you are using an `nvc++` version earlier than 23.7:
28 | 
29 | ```bash
30 |   # uncomment only if using nvc++ earlier than 23.7 to find libcublas
31 |   # target_link_directories(${exec_name} PRIVATE /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64)
32 | ```
33 | 
34 | ## Run Apps
35 | 
36 | ```bash
37 | cd hpcpp/build
38 | srun -n 1 -N 1 -G <> -A <acct> -t 30 -C <cpu/gpu> ./apps/<appname>/<appname> [ARGS]
39 | ```
40 | 
41 | Use `--help` to see help with arguments.
42 | 
43 | ## Contributors
44 | 
45 | (in alphabetical order of last name)
46 | - [Muhammad Haseeb](https://nersc.gov/muhammad-haseeb)
47 | - [Weile Wei](https://nersc.gov/weile-wei)
48 | - [Chuanqiu He](https://github.com/hcq9102)
49 | 
50 | ## License
51 | Copyright (C) The Regents of the University of California, 2023 (See [LICENSE](LICENSE) for details).
52 | 


--------------------------------------------------------------------------------
/scripts/fft.nvhpc.cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #
 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and
 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did
 6 | # for GCC compiler script before running this.
 7 | #
 8 | 
 9 | #SBATCH -A nstaff
10 | #SBATCH -C cpu
11 | #SBATCH --qos=regular
12 | #SBATCH --time=12:00:00
13 | #SBATCH --nodes=1
14 | #SBATCH --ntasks-per-node=1
15 | #SBATCH --cpus-per-task=128
16 | #SBATCH --exclusive
17 | #SBATCH -o fft-cpu.o%j
18 | #SBATCH -e fft-cpu.e%j
19 | #SBATCH -J FFT-CPU
20 | 
21 | set +x
22 | 
23 | BUILD_HOME=${HOME}/repos/nvstdpar/build-fft-cpu
24 | 
25 | mkdir -p ${BUILD_HOME}
26 | cd ${BUILD_HOME}
27 | rm -rf ./*
28 | 
29 | ml unload cudatoolkit
30 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles
31 | ml nvhpc/23.7
32 | # need this for GLIBC
33 | ml gcc/12.2.0
34 | ml cmake/3.24
35 | 
36 | cmake .. -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++)
37 | 
38 | make -j fft-serial fft-stdexec fft-stdpar
39 | 
40 | cd ${BUILD_HOME}/apps/fft
41 | 
42 | D=(536870912 1073741824)
43 | 
44 | # parallel runs
45 | T=(256 128 64 32 16 8 4 2 1)
46 | 
47 | unset OMP_NUM_THREADS
48 | 
49 | for d in "${D[@]}"; do
50 |     for i in "${T[@]}"; do
51 |         echo "stdexec:cpu for ${d}, threads=${i}"
52 |         srun -n 1 --cpu-bind=none ./fft-stdexec -N ${d} --time --sch=cpu --nthreads=${i}
53 | 
54 |         echo "stdpar:cpu for ${d}, threads=${i}"
55 |         # scoped to this command so the value cannot leak into the next stdexec run
56 |         OMP_NUM_THREADS=${i} srun -n 1 --cpu-bind=none ./fft-stdpar -N ${d} --time --nthreads=${i}
57 |     done
58 | done
59 | 
60 | for d in "${D[@]}"; do
61 |     echo "serial for ${d}"
62 |     srun -n 1 --cpu-bind=none ./fft-serial -N ${d} --time
63 | done
64 | 


--------------------------------------------------------------------------------
/scripts/heat-run.nvhpc.grace.cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #
 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and
 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did
 6 | # for GCC compiler script before running this.
 7 | #
 8 | #SBATCH -N 1
 9 | #SBATCH -p cg4-cpu4x120gb-gpu4x80gb
10 | #SBATCH --gres=gpu:4
11 | #SBATCH --exclusive
12 | #SBATCH -o heat-cpu.o%j
13 | #SBATCH -e heat-cpu.e%j
14 | #SBATCH -J HEAT-CPU
15 | 
16 | set +x
17 | 
18 | BUILD_HOME=${HOME}/repos/nvstdpar/build-heat-cpu
19 | 
20 | mkdir -p ${BUILD_HOME}
21 | cd ${BUILD_HOME}
22 | rm -rf ./*
23 | 
24 | module unload gcc; module load gcc/12.3.0; module load nvhpc/23.5; module load slurm
25 | export PATH=/home/wwei/install/cmake_3_27_3/bin/:$PATH
26 | 
27 | cmake .. -DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++)
28 | 
29 | make -j heat-equation-omp heat-equation-serial heat-equation-stdexec heat-equation-stdpar
30 | 
31 | cd ${BUILD_HOME}/apps/heat-equation
32 | 
33 | # parallel runs: sweep the thread counts listed in T, from 256 down to 1
34 | T=(256 128 64 32 16 8 4 2 1)
35 | 
36 | unset OMP_NUM_THREADS  # the omp/stdexec runs below are given --nthreads instead
37 | 
38 | for i in "${T[@]}"; do  # one omp run and one stdexec run per thread count
39 |     echo "heat:omp, threads=${i}"
40 |     srun -n 1 --cpu-bind=none ./heat-equation-omp -s=1000 -n=46000 --time --nthreads=${i}
41 | 
42 |     echo "heat:stdexec, threads=${i}"
43 |     srun -n 1 --cpu-bind=none ./heat-equation-stdexec -s=1000 -n=46000 --time --nthreads=${i}
44 | done
45 | 
46 | for i in "${T[@]}"; do  # stdpar sweep: no --nthreads flag is passed here
47 |     echo "heat:stdpar, threads=${i}"
48 |     export OMP_NUM_THREADS=${i}  # presumably how the stdpar build picks its thread count -- TODO confirm
49 |     srun -n 1 --cpu-bind=none ./heat-equation-stdpar -s=1000 -n=46000 --time
50 | done
51 | 
52 | unset OMP_NUM_THREADS  # keep the last sweep value out of the serial run
53 | 
54 | echo "heat:serial"
55 | srun -n 1 --cpu-bind=none ./heat-equation-serial -s=1000 -n=46000 --time
56 | 


--------------------------------------------------------------------------------
/scripts/stencil.nvhpc.grace.cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #SBATCH -N 1
 4 | #SBATCH -p cg4-cpu4x120gb-gpu4x80gb
 5 | #SBATCH --gres=gpu:4
 6 | #SBATCH --exclusive
 7 | #SBATCH -o 1d-cpu.o%j
 8 | #SBATCH -e 1d-cpu.e%j
 9 | #SBATCH -J 1D-CPU
10 | 
11 | set +x
12 | 
13 | BUILD_HOME=${HOME}/repos/nvstdpar/build-1d-cpu
14 | 
15 | mkdir -p ${BUILD_HOME}
16 | cd ${BUILD_HOME}
17 | rm -rf ./*
18 | 
19 | module unload gcc; module load gcc/12.3.0; module load nvhpc/23.5; module load slurm
20 | export PATH=/home/wwei/install/cmake_3_27_3/bin/:$PATH
21 | 
22 | # export OMP_PLACES="{0:16},{16:16},{32:16},{48:16},{64:16},{80:16},{96:16},{112:16}"
23 | # export OMP_PROC_BIND=close
24 | 
25 | 
26 | oneDimension_size=1000000000
27 | oneDimension_iterations=4000
28 | 
29 | cmake .. -DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++)
30 | make -j
31 | 
32 | # apps/1d-stencil/CMakeLists.txt names every executable after its source file
33 | # (1d-omp.cpp -> 1d-omp, ...); NOTE(review): confirm the build subdirectory name
34 | cd ${BUILD_HOME}/apps/1d-stencil
35 | 
36 | # parallel runs
37 | T=(256 128 64 32 16 8 4 2 1)
38 | 
39 | unset OMP_NUM_THREADS
40 | 
41 | for i in "${T[@]}"; do
42 |     echo "1d:omp, threads=${i}"
43 |     srun -n 1 --cpu-bind=none ./1d-omp --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i
44 | 
45 |     echo "1d:stdexec, threads=${i}"
46 |     srun -n 1 --cpu-bind=none ./1d-stdexec --sch cpu --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i
47 | done
48 | 
49 | for i in "${T[@]}"; do
50 |     echo "1d:stdpar, threads=${i}"
51 |     export OMP_NUM_THREADS=${i}
52 |     srun -n 1 --cpu-bind=none ./1d-stdpar --size $oneDimension_size --nt $oneDimension_iterations
53 | done
54 | 
55 | unset OMP_NUM_THREADS
56 | 
57 | echo "1d:serial"
58 | srun -n 1 --cpu-bind=none ./1d-serial --size $oneDimension_size --nt $oneDimension_iterations
59 | 


--------------------------------------------------------------------------------
/scripts/heat-run.nvhpc.cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #
 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and
 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did
 6 | # for GCC compiler script before running this.
 7 | #
 8 | 
 9 | #SBATCH -A nstaff
10 | #SBATCH -C cpu
11 | #SBATCH --qos=regular
12 | #SBATCH --time=24:00:00
13 | #SBATCH --nodes=1
14 | #SBATCH --ntasks-per-node=1
15 | #SBATCH --cpus-per-task=128
16 | #SBATCH --exclusive
17 | #SBATCH -o heat-cpu.o%j
18 | #SBATCH -e heat-cpu.e%j
19 | #SBATCH -J HEAT-CPU
20 | 
21 | set +x
22 | 
23 | BUILD_HOME=${HOME}/repos/nvstdpar/build-heat-cpu
24 | 
25 | mkdir -p ${BUILD_HOME}
26 | cd ${BUILD_HOME}
27 | rm -rf ./*
28 | 
29 | ml unload cudatoolkit
30 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles
31 | ml nvhpc/23.7
32 | # needed for GLIBC
33 | ml gcc/12.2.0
34 | ml cmake/3.24
35 | 
36 | cmake .. -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++)
37 | 
38 | make -j heat-equation-omp heat-equation-serial heat-equation-stdexec heat-equation-stdpar
39 | 
40 | cd ${BUILD_HOME}/apps/heat-equation
41 | 
42 | # parallel runs: sweep the thread counts listed in T, from 256 down to 1
43 | T=(256 128 64 32 16 8 4 2 1)
44 | 
45 | unset OMP_NUM_THREADS  # the omp/stdexec runs below are given --nthreads instead
46 | 
47 | for i in "${T[@]}"; do  # one omp run and one stdexec run per thread count
48 |     echo "heat:omp, threads=${i}"
49 |     srun -n 1 --cpu-bind=none ./heat-equation-omp -s=1000 -n=46000 --time --nthreads=${i}
50 | 
51 |     echo "heat:stdexec, threads=${i}"
52 |     srun -n 1 --cpu-bind=none ./heat-equation-stdexec -s=1000 -n=46000 --time --nthreads=${i}
53 | done
54 | 
55 | for i in "${T[@]}"; do  # stdpar sweep: no --nthreads flag is passed here
56 |     echo "heat:stdpar, threads=${i}"
57 |     export OMP_NUM_THREADS=${i}  # presumably how the stdpar build picks its thread count -- TODO confirm
58 |     srun -n 1 --cpu-bind=none ./heat-equation-stdpar -s=1000 -n=46000 --time
59 | done
60 | 
61 | unset OMP_NUM_THREADS  # keep the last sweep value out of the serial run
62 | 
63 | echo "heat:serial"
64 | srun -n 1 --cpu-bind=none ./heat-equation-serial -s=1000 -n=46000 --time
65 | 


--------------------------------------------------------------------------------
/scripts/stencil.nvhpc.cpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #SBATCH -A nstaff
 4 | #SBATCH -C cpu
 5 | #SBATCH --qos=regular
 6 | #SBATCH --time=24:00:00
 7 | #SBATCH --nodes=1
 8 | #SBATCH --ntasks-per-node=1
 9 | #SBATCH --cpus-per-task=128
10 | #SBATCH --exclusive
11 | #SBATCH -o 1d-cpu.o%j
12 | #SBATCH -e 1d-cpu.e%j
13 | #SBATCH -J 1D-CPU
14 | 
15 | set +x
16 | 
17 | BUILD_HOME=${HOME}/repos/nvstdpar/build-1d-cpu
18 | 
19 | mkdir -p ${BUILD_HOME}
20 | cd ${BUILD_HOME}
21 | rm -rf ./*
22 | 
23 | ml unload cudatoolkit
24 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles
25 | ml nvhpc/23.7
26 | # needed for GLIBC
27 | ml gcc/12.2.0
28 | ml cmake/3.24
29 | 
30 | 
31 | # export OMP_PLACES="{0:16},{16:16},{32:16},{48:16},{64:16},{80:16},{96:16},{112:16}"
32 | # export OMP_PROC_BIND=close
33 | 
34 | 
35 | oneDimension_size=1000000000
36 | oneDimension_iterations=4000
37 | 
38 | cmake .. -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++)
39 | make -j
40 | 
41 | # apps/1d-stencil/CMakeLists.txt names every executable after its source file
42 | # (1d-omp.cpp -> 1d-omp, ...); NOTE(review): confirm the build subdirectory name
43 | cd ${BUILD_HOME}/apps/1d-stencil
44 | 
45 | # parallel runs
46 | T=(256 128 64 32 16 8 4 2 1)
47 | 
48 | unset OMP_NUM_THREADS
49 | 
50 | for i in "${T[@]}"; do
51 |     echo "1d:omp, threads=${i}"
52 |     srun -n 1 --cpu-bind=none ./1d-omp --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i
53 | 
54 |     echo "1d:stdexec, threads=${i}"
55 |     srun -n 1 --cpu-bind=none ./1d-stdexec --sch cpu --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i
56 | done
57 | 
58 | for i in "${T[@]}"; do
59 |     echo "1d:stdpar, threads=${i}"
60 |     export OMP_NUM_THREADS=${i}
61 |     srun -n 1 --cpu-bind=none ./1d-stdpar --size $oneDimension_size --nt $oneDimension_iterations
62 | done
63 | 
64 | unset OMP_NUM_THREADS
65 | 
66 | echo "1d:serial"
67 | srun -n 1 --cpu-bind=none ./1d-serial --size $oneDimension_size --nt $oneDimension_iterations
68 | 


--------------------------------------------------------------------------------
/apps/prefixSum/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Builds and installs one executable per *.cpp source file found in this directory.
 2 | project(prefixSum LANGUAGES CXX)
 3 | 
 4 | # CONFIGURE_DEPENDS re-checks the glob at build time so added or removed sources
 5 | # trigger a re-configure instead of being silently missed
 6 | file(GLOB CPP_SOURCES CONFIGURE_DEPENDS "*.cpp")
 7 | 
 8 | foreach(source_file ${CPP_SOURCES})
 9 |   # decide on the file name only: matching the absolute path would also skip
10 |   # sources whenever a parent directory happens to contain "gpu" or "stdpar"
11 |   get_filename_component(source_name "${source_file}" NAME)
12 | 
13 |   # GPU-only stdpar sources are built only when STDPAR=gpu
14 |   if(NOT "${STDPAR}" STREQUAL "gpu")
15 |     if("${source_name}" MATCHES "stdpar.*gpu" OR "${source_name}" MATCHES
16 |                                                  "gpu.*stdpar")
17 |       message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}")
18 |       continue()
19 |     endif()
20 |   endif()
21 | 
22 |   # get the file name without an extension
23 |   get_filename_component(exec_name "${source_file}" NAME_WE)
24 | 
25 |   # add an executable with the same name as the source file
26 |   add_executable(${exec_name} ${_EXCLUDE} "${source_file}")
27 | 
28 |   # add dependency on argparse
29 |   add_dependencies(${exec_name} argparse)
30 | 
31 |   set_source_files_properties("${source_file}" PROPERTIES LANGUAGE CXX
32 |                                                           LINKER_LANGUAGE CXX)
33 |   target_include_directories(
34 |     ${exec_name}
35 |     PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
36 |             ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
37 | 
38 |   # uncomment only if using nvc++/23.1 - no need if nvc++/23.7
39 |   # target_link_directories(${exec_name} PRIVATE
40 |   # /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64)
41 | 
42 |   # nothing links against an executable, so its dependencies stay PRIVATE
43 |   target_link_libraries(${exec_name} PRIVATE ${MPI_LIBS} stdexec hpcpp-core)
44 | 
45 |   set_target_properties(
46 |     ${exec_name}
47 |     PROPERTIES CXX_STANDARD ${CXX_STANDARD}
48 |                CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS}
49 |                INSTALL_RPATH_USE_LINK_PATH ON)
50 | 
51 |   # installation
52 |   install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR})
53 | endforeach()
54 | 


--------------------------------------------------------------------------------
/apps/fft/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # Builds and installs one executable per *.cpp source file found in this directory.
 2 | project(fft LANGUAGES CXX)
 3 | 
 4 | # CONFIGURE_DEPENDS re-checks the glob at build time so added or removed sources
 5 | # trigger a re-configure instead of being silently missed
 6 | file(GLOB CPP_SOURCES CONFIGURE_DEPENDS "*.cpp")
 7 | 
 8 | foreach(source_file ${CPP_SOURCES})
 9 |   # decide on the file name only: matching the absolute path would also skip
10 |   # sources whenever a parent directory happens to contain "gpu" or "stdpar"
11 |   get_filename_component(source_name "${source_file}" NAME)
12 | 
13 |   # GPU-only stdpar sources are built only when STDPAR=gpu
14 |   if(NOT "${STDPAR}" STREQUAL "gpu")
15 |     if("${source_name}" MATCHES "stdpar.*gpu" OR "${source_name}" MATCHES
16 |                                                  "gpu.*stdpar")
17 |       message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}")
18 |       continue()
19 |     endif()
20 |   endif()
21 | 
22 |   # get the file name without an extension
23 |   get_filename_component(exec_name "${source_file}" NAME_WE)
24 | 
25 |   # add an executable with the same name as the source file
26 |   add_executable(${exec_name} ${_EXCLUDE} "${source_file}")
27 | 
28 |   # add dependency on argparse
29 |   add_dependencies(${exec_name} argparse)
30 | 
31 |   set_source_files_properties("${source_file}" PROPERTIES LANGUAGE CXX
32 |                                                           LINKER_LANGUAGE CXX)
33 |   target_include_directories(
34 |     ${exec_name}
35 |     PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include
36 |             ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR})
37 | 
38 |   # add -cudalib=cublas if -stdpar=gpu; attached to the target for both the
39 |   # compile and the link step instead of editing the directory-wide CMAKE_CXX_FLAGS
40 |   if("${STDPAR}" STREQUAL "gpu")
41 |     target_compile_options(${exec_name} PRIVATE -cudalib=cublas)
42 |     target_link_options(${exec_name} PRIVATE -cudalib=cublas)
43 |   endif()
44 | 
45 |   # uncomment only if using nvc++/23.1 - no need if nvc++/23.7
46 |   # target_link_directories(${exec_name} PRIVATE
47 |   # /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64)
48 | 
49 |   # nothing links against an executable, so its dependencies stay PRIVATE
50 |   target_link_libraries(${exec_name} PRIVATE ${MPI_LIBS} stdexec blas hpcpp-core)
51 | 
52 |   set_target_properties(
53 |     ${exec_name}
54 |     PROPERTIES CXX_STANDARD ${CXX_STANDARD}
55 |                CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS}
56 |                INSTALL_RPATH_USE_LINK_PATH ON)
57 | 
58 |   # installation
59 |   install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR})
60 | endforeach()
61 | 


--------------------------------------------------------------------------------
/apps/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # ----------------------------------------------------------------------------------------#
 2 | # Add every demo application, printing a status line before entering each one
 3 | # ----------------------------------------------------------------------------------------#
 4 | 
 5 | foreach(
 6 |   app_dir IN
 7 |   ITEMS comm-study # comm-study
 8 |         heat-equation # heat equation demo
 9 |         mdspan-stdpar # MDSPAN + hpcpp demo
10 |         1d-stencil
11 |         choleskey # choleskey demo
12 |         fft # fft demo
13 |         prefixSum # block segmented prefixSum
14 | )
15 |   # the choleskey status line carries an extra "example" suffix
16 |   if("${app_dir}" STREQUAL "choleskey")
17 |     message(STATUS "Adding ${app_dir} example...")
18 |   else()
19 |     message(STATUS "Adding ${app_dir}...")
20 |   endif()
21 | 
22 |   add_subdirectory(${app_dir})
23 | endforeach()
24 | 


--------------------------------------------------------------------------------
/scripts/benchmark.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #SBATCH -A nstaff
 4 | 
 5 | #SBATCH -C gpu
 6 | #SBATCH --qos=regular
 7 | #SBATCH -G 4
 8 | #SBATCH -t 6:00:00
 9 | #SBATCH --exclusive
10 | #SBATCH -N 1
11 | #SBATCH --ntasks-per-node=1
12 | 
13 | #SBATCH -o nvstdpar_stencil_final_benchmark.out
14 | #SBATCH -e nvstdpar_stencil_final_benchmark.err
15 | 
16 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles
17 | ml unload cudatoolkit
18 | ml gcc/12.2 cmake/3.24 nvhpc-hpcx/23.7
19 | # problem sizes and iteration counts for the 1D stencil and 2D heat-equation runs
20 | oneDimension_size=1000000000
21 | oneDimension_iterations=4000
22 | twoDimension_size=46000
23 | twoDimension_iterations=1000
24 | 
25 | cpu_build=build_multicore_benchmark
26 | # the build steps are commented out: both build trees are expected to exist already
27 | # mkdir -p ${HOME}/src/nvstdpar/$cpu_build
28 | # cd ${HOME}/src/nvstdpar/$cpu_build
29 | # rm -rf ./*
30 | # cmake .. -DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore
31 | # make -j
32 | 
33 | gpu_build=build_gpu_benchmark
34 | 
35 | # mkdir -p ${HOME}/src/nvstdpar/$gpu_build
36 | # cd ${HOME}/src/nvstdpar/$gpu_build
37 | # rm -rf ./*
38 | # cmake .. -DCMAKE_BUILD_TYPE=Release -DSTDPAR=gpu -DOMP=gpu
39 | # make -j
40 | # the stencil binaries are built in apps/1d-stencil as 1d-serial, 1d-stdpar, 1d-stdexec and 1d-cuda
41 | cd "${HOME}/src/nvstdpar/${cpu_build}/apps/1d-stencil"
42 | echo "1D_serial"
43 | time ./1d-serial --size "${oneDimension_size}" --nt "${oneDimension_iterations}"
44 | echo "1D_stdpar cpu"
45 | time ./1d-stdpar --size "${oneDimension_size}" --nt "${oneDimension_iterations}"
46 | 
47 | cd "${HOME}/src/nvstdpar/${gpu_build}/apps/1d-stencil"
48 | echo "1D_stdpar gpu"
49 | time ./1d-stdpar --size "${oneDimension_size}" --nt "${oneDimension_iterations}"
50 | echo "1D_stdexec gpu"
51 | time ./1d-stdexec --sch gpu --size "${oneDimension_size}" --nt "${oneDimension_iterations}"
52 | echo "1D_stdexec multigpu"
53 | time ./1d-stdexec --sch multigpu --size "${oneDimension_size}" --nt "${oneDimension_iterations}"
54 | echo "1D_cuda"
55 | time ./1d-cuda --size "${oneDimension_size}" --nt "${oneDimension_iterations}"
56 | 
57 | cd "${HOME}/src/nvstdpar/${cpu_build}/apps/heat-equation"
58 | echo "2D_serial"
59 | time ./heat-equation-serial -n="${twoDimension_size}" -s="${twoDimension_iterations}" --time
60 | echo "2D_stdpar cpu"
61 | time ./heat-equation-stdpar -n="${twoDimension_size}" -s="${twoDimension_iterations}" --time
62 | 
63 | cd "${HOME}/src/nvstdpar/${gpu_build}/apps/heat-equation"
64 | echo "2D_stdpar gpu"
65 | time ./heat-equation-stdpar -n="${twoDimension_size}" -s="${twoDimension_iterations}" --time
66 | echo "2D_stdexec gpu"
67 | time ./heat-equation-stdexec --sch gpu -n="${twoDimension_size}" -s="${twoDimension_iterations}" --time
68 | echo "2D_stdexec multigpu"
69 | time ./heat-equation-stdexec --sch multigpu -n="${twoDimension_size}" -s="${twoDimension_iterations}" --time
70 | echo "2D_cuda"
71 | time ./heat-equation-cuda -n="${twoDimension_size}" -s="${twoDimension_iterations}" --time
72 | 


--------------------------------------------------------------------------------
/.clang-format:
--------------------------------------------------------------------------------
 1 | # Google C/C++ Code Style settings
 2 | # https://clang.llvm.org/docs/ClangFormatStyleOptions.html
 3 | # Author: Kehan Xue, kehan.xue (at) gmail.com
 4 | 
 5 | Language: Cpp
 6 | BasedOnStyle: Google
 7 | AccessModifierOffset: -1
 8 | AlignAfterOpenBracket: Align
 9 | AlignConsecutiveAssignments: None
10 | AlignOperands: Align
11 | AllowAllArgumentsOnNextLine: true
12 | AllowAllConstructorInitializersOnNextLine: true
13 | AllowAllParametersOfDeclarationOnNextLine: false
14 | AllowShortBlocksOnASingleLine: Empty
15 | AllowShortCaseLabelsOnASingleLine: false
16 | AllowShortFunctionsOnASingleLine: Inline
17 | AllowShortIfStatementsOnASingleLine: Never  # To avoid conflict, set this "Never" and each "if statement" should include brace when coding
18 | AllowShortLambdasOnASingleLine: Inline
19 | AllowShortLoopsOnASingleLine: false
20 | AlwaysBreakAfterReturnType: None
21 | AlwaysBreakTemplateDeclarations: Yes
22 | BinPackArguments: true
23 | BreakBeforeBraces: Custom
24 | BraceWrapping:
25 |   AfterCaseLabel: false
26 |   AfterClass: false
27 |   AfterStruct: false
28 |   AfterControlStatement: Never
29 |   AfterEnum: false
30 |   AfterFunction: false
31 |   AfterNamespace: false
32 |   AfterUnion: false
33 |   AfterExternBlock: false
34 |   BeforeCatch: false
35 |   BeforeElse: false
36 |   BeforeLambdaBody: false
37 |   IndentBraces: false
38 |   SplitEmptyFunction: false
39 |   SplitEmptyRecord: false
40 |   SplitEmptyNamespace: false
41 | BreakBeforeBinaryOperators: None
42 | BreakBeforeTernaryOperators: true
43 | BreakConstructorInitializers: BeforeColon
44 | BreakInheritanceList: BeforeColon
45 | ColumnLimit: 120
46 | CompactNamespaces: false
47 | ContinuationIndentWidth: 4
48 | Cpp11BracedListStyle: true
49 | DerivePointerAlignment: false  # Make sure the * or & align on the left
50 | EmptyLineBeforeAccessModifier: LogicalBlock
51 | FixNamespaceComments: true
52 | IncludeBlocks: Preserve
53 | IndentCaseLabels: true
54 | IndentPPDirectives: None
55 | IndentWidth: 4
56 | KeepEmptyLinesAtTheStartOfBlocks: true
57 | MaxEmptyLinesToKeep: 1
58 | NamespaceIndentation: None
59 | ObjCSpaceAfterProperty: false
60 | ObjCSpaceBeforeProtocolList: true
61 | PointerAlignment: Left
62 | ReflowComments: false
63 | # SeparateDefinitionBlocks: Always   # Only support since clang-format 14
64 | SpaceAfterCStyleCast: false
65 | SpaceAfterLogicalNot: false
66 | SpaceAfterTemplateKeyword: true
67 | SpaceBeforeAssignmentOperators: true
68 | SpaceBeforeCpp11BracedList: false
69 | SpaceBeforeCtorInitializerColon: true
70 | SpaceBeforeInheritanceColon: true
71 | SpaceBeforeParens: ControlStatements
72 | SpaceBeforeRangeBasedForLoopColon: true
73 | SpaceBeforeSquareBrackets: false
74 | SpaceInEmptyParentheses: false
75 | SpacesBeforeTrailingComments: 2
76 | SpacesInAngles: false
77 | SpacesInCStyleCastParentheses: false
78 | SpacesInContainerLiterals: false
79 | SpacesInParentheses: false
80 | SpacesInSquareBrackets: false
81 | Standard: c++17
82 | TabWidth: 4
83 | SeparateDefinitionBlocks: Always   # Only support since clang-format 14
84 | UseTab: Never
85 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Prerequisites
  2 | *.d
  3 | 
  4 | # Compiled Object files
  5 | *.slo
  6 | *.sln
  7 | *.lo
  8 | *.o
  9 | *.obj
 10 | 
 11 | # Data, Results and param files
 12 | *params*
 13 | *.tsv
 14 | 
 15 | # Docs
 16 | *.jekyll-metadata
 17 | 
 18 | #IDE files
 19 | /.vscode
 20 | /.vs
 21 | /*.nja
 22 | settings.json
 23 | *.swp
 24 | 
 25 | # Edit files
 26 | *~
 27 | ._*
 28 | 
 29 | # Python
 30 | __pycache__
 31 | 
 32 | # misc
 33 | *.autosave
 34 | /coverage.info
 35 | 
 36 | # macOS Finder files
 37 | .DS_Store
 38 | 
 39 | *.dbprep
 40 | 
 41 | # images
 42 | /*.png
 43 | /*.tif
 44 | /*.tiff
 45 | /*.jpeg
 46 | /*.jpg
 47 | /*.gif
 48 | /run*.sh
 49 | 
 50 | # stashed source tree
 51 | /.stash
 52 | 
 53 | # Directories
 54 | /parts*
 55 | /output*
 56 | /build*
 57 | /install*
 58 | 
 59 | # Precompiled Headers
 60 | *.gch
 61 | *.pch
 62 | 
 63 | # Compiled Dynamic libraries
 64 | *.so
 65 | *.dylib
 66 | *.dll
 67 | 
 68 | # Fortran module files
 69 | *.mod
 70 | *.smod
 71 | 
 72 | # Compiled Static libraries
 73 | *.lai
 74 | *.la
 75 | *.a
 76 | *.lib
 77 | 
 78 | # Executables
 79 | *.exe
 80 | *.out
 81 | *.bin
 82 | *.app
 83 | *.map
 84 | *.pyc
 85 | 
 86 | # slurm logs
 87 | *.e[0-9]*
 88 | *.o[0-9]*
 89 | 
 90 | # nsys and ncu
 91 | *.nsys-rep
 92 | *.nsys.sqlite
 93 | *.ncu-rep
 94 | 
 95 | # preprocessed data files
 96 | *.pbin
 97 | 
 98 | # Remote Sync for Atom
 99 | .remote-sync.json
100 | 
101 | # Byte-compiled / optimized / DLL files
102 | __pycache__/
103 | *.py[cod]
104 | *$py.class
105 | 
106 | # C extensions
107 | *.so
108 | 
109 | # tmp files
110 | ~*
111 | *.~*
112 | 
113 | # Distribution / packaging
114 | .Python
115 | env/
116 | build*/
117 | develop-eggs/
118 | dist/
119 | downloads/
120 | eggs/
121 | .eggs/
122 | lib/
123 | lib64/
124 | parts/
125 | sdist/
126 | var/
127 | wheels/
128 | *.egg-info/
129 | .installed.cfg
130 | *.egg
131 | 
132 | # PyInstaller
133 | #  Usually these files are written by a python script from a template
134 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
135 | *.manifest
136 | *.spec
137 | 
138 | # Installer logs
139 | pip-log.txt
140 | pip-delete-this-directory.txt
141 | 
142 | # Unit test / coverage reports
143 | htmlcov/
144 | .tox/
145 | .coverage
146 | .coverage.*
147 | .cache
148 | nosetests.xml
149 | coverage.xml
150 | *,cover
151 | .hypothesis/
152 | 
153 | # Translations
154 | *.mo
155 | *.pot
156 | 
157 | # Django stuff:
158 | *.log
159 | local_settings.py
160 | 
161 | # Flask stuff:
162 | instance/
163 | .webassets-cache
164 | 
165 | # Scrapy stuff:
166 | .scrapy
167 | 
168 | # Sphinx documentation
169 | docs/_build/
170 | 
171 | # PyBuilder
172 | target/
173 | 
174 | # Jupyter Notebook
175 | .ipynb_checkpoints
176 | 
177 | # pyenv
178 | .python-version
179 | 
180 | # celery beat schedule file
181 | celerybeat-schedule
182 | 
183 | # dotenv
184 | .env
185 | 
186 | # virtualenv
187 | .venv/
188 | venv/
189 | ENV/
190 | 
191 | # Spyder project settings
192 | .spyderproject
193 | 
194 | # Rope project settings
195 | .ropeproject
196 | 
197 | # vscode
198 | .vscode/
199 | .vs/
200 | 
201 | # cmake
202 | CMakeLists.txt.user*
203 | CMakeCache.txt*
204 | CMakeFiles*
205 | CMakeScripts*
206 | Testing*
207 | Makefile*
208 | cmake_install.cmake*
209 | install_manifest.txt*
210 | compile_commands.json*
211 | CTestTestfile.cmake*
212 | _deps*


--------------------------------------------------------------------------------
/.gitlab-ci.yml:
--------------------------------------------------------------------------------
  1 | variables:
  2 |   BASE: ${CI_PROJECT_DIR}
  3 |   threads: 32
  4 | 
  5 | default:
  6 |   tags:
  7 |     - muller-login01
  8 |   interruptible: true
  9 |   before_script:
 10 |     - ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles
 11 |     - ml unload cudatoolkit
 12 |     - ml gcc/12.2 cmake/3.24 nvhpc/23.7
 13 | 
 14 | workflow:
 15 |   rules:
 16 |     - if: $CI_PIPELINE_SOURCE == 'merge_request_event'
 17 |       variables:
 18 |         install_prefix: ${CI_PROJECT_DIR}/merge_request_install
 19 |     - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH
 20 |       variables:
 21 |         install_prefix: ${CI_PROJECT_DIR}/default_branch_install
 22 |     - when: always
 23 |       variables:
 24 |         install_prefix: ${CI_PROJECT_DIR}/any_branch_install
 25 | 
 26 | .build_template: &build_template
 27 |   stage: build
 28 |   script:
 29 |     - cd ${BASE}
 30 |     - git clone --recursive https://github.com/NERSC/hpcpp.git hpcpp
 31 |     - cd hpcpp
 32 |     - mkdir -p build-${BUILD_TYPE}-${STDPAR_TYPE} && cd build-${BUILD_TYPE}-${STDPAR_TYPE}
 33 |     - cmake -DCMAKE_CXX_COMPILER=$(which nvc++) -DCMAKE_C_COMPILER=$(which nvc) -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DSTDPAR=${STDPAR_TYPE} -DOMP=${STDPAR_TYPE} ..
 34 |     - make -j${threads}
 35 |   artifacts:
 36 |     paths:
 37 |       - hpcpp/build-${BUILD_TYPE}-${STDPAR_TYPE}/
 38 | 
 39 | build-debug-gpu:
 40 |   <<: *build_template
 41 |   variables:
 42 |     BUILD_TYPE: Debug
 43 |     STDPAR_TYPE: gpu
 44 | 
 45 | build-debug-multicore:
 46 |   <<: *build_template
 47 |   variables:
 48 |     BUILD_TYPE: Debug
 49 |     STDPAR_TYPE: multicore
 50 | 
 51 | build-release-gpu:
 52 |   <<: *build_template
 53 |   variables:
 54 |     BUILD_TYPE: Release
 55 |     STDPAR_TYPE: gpu
 56 | 
 57 | build-release-multicore:
 58 |   <<: *build_template
 59 |   variables:
 60 |     BUILD_TYPE: Release
 61 |     STDPAR_TYPE: multicore
 62 | 
 63 | .test_template: &test_template
 64 |   stage: test
 65 |   script:
 66 |     - cd ${BASE}/hpcpp/build-${BUILD_TYPE}-${STDPAR_TYPE}/apps/1d-stencil
 67 |     - |
 68 |       if [ "${STDPAR_TYPE}" = "gpu" ]; then
 69 |         ./1d-stdexec --sch gpu --size 10 --nt 10
 70 |         ./1d-stdpar --size 10 --nt 10
 71 |         ./1d-stdexec --sch multigpu --size 10 --nt 10
 72 |         ./1d-cuda --size 10 --nt 10
 73 |       fi
 74 |     - |
 75 |       if [ "${STDPAR_TYPE}" = "multicore" ]; then
 76 |         ./1d-serial --size 10 --nt 10
 77 |         ./1d-omp --size 10 --nt 10
 78 |         ./1d-stdpar --size 10 --nt 10
 79 |         ./1d-stdexec --sch cpu --size 10 --nt 10
 80 |       fi
 81 | 
 82 | test-debug-gpu:
 83 |   <<: *test_template
 84 |   variables:
 85 |     BUILD_TYPE: Debug
 86 |     STDPAR_TYPE: gpu
 87 |   dependencies:
 88 |     - build-debug-gpu
 89 | 
 90 | test-debug-multicore:
 91 |   <<: *test_template
 92 |   variables:
 93 |     BUILD_TYPE: Debug
 94 |     STDPAR_TYPE: multicore
 95 |   dependencies:
 96 |     - build-debug-multicore
 97 | 
 98 | test-release-gpu:
 99 |   <<: *test_template
100 |   variables:
101 |     BUILD_TYPE: Release
102 |     STDPAR_TYPE: gpu
103 |   dependencies:
104 |     - build-release-gpu
105 | 
106 | test-release-multicore:
107 |   <<: *test_template
108 |   variables:
109 |     BUILD_TYPE: Release
110 |     STDPAR_TYPE: multicore
111 |   dependencies:
112 |     - build-release-multicore
113 | 


--------------------------------------------------------------------------------
/scripts/fft.ncu.nsys.gpu.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash -le
 2 | 
 3 | #
 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and
 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did
 6 | # for GCC compiler script before running this.
 7 | #
 8 | 
 9 | #SBATCH -A nstaff_g
10 | #SBATCH -C gpu
11 | #SBATCH --qos=regular
12 | #SBATCH --time=3:00:00
13 | #SBATCH --nodes=1
14 | #SBATCH --gpus=4
15 | #SBATCH --ntasks-per-node=4
16 | #SBATCH --exclusive
17 | #SBATCH --gpu-bind=none
18 | #SBATCH -o ncu-nsys-fft-gpu.o%j
19 | #SBATCH -e ncu-nsys-fft-gpu.e%j
20 | #SBATCH -J FFT-GPU-PERF
21 | 
22 | set +x
23 | 
24 | # config setting
25 | BUILD_HOME=${HOME}/repos/nvstdpar/build-fft-gpu-nsight
26 | 
27 | # build stuff
28 | mkdir -p ${BUILD_HOME}
29 | cd ${BUILD_HOME}
30 | rm -rf ./*
31 | 
32 | ml unload cudatoolkit
33 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles
34 | ml nvhpc/23.7
35 | 
36 | cmake .. -DSTDPAR=gpu -DOMP=gpu -DCMAKE_CXX_COMPILER=$(which nvc++)
37 | make -j fft-stdexec fft-stdpar
38 | 
39 | # always run NCU and Nsys from $SCRATCH to avoid errors on Perlmutter
40 | mkdir -p ${SCRATCH}/fft-gpu-nsight
41 | cd ${SCRATCH}/fft-gpu-nsight
42 | rm -rf ./*
43 | 
44 | 
45 | # pause dcgmi
46 | srun --ntasks-per-node 1 dcgmi profile --pause
47 | 
48 | # Problem size (increasing this beyond 4024000 may take long time for multigpu runs)
49 | SIZE=4024000
50 | 
51 | # Run Nsys
52 | 
53 | # stdexec-single-gpu
54 | srun nsys profile --force-overwrite true -o fft-gpu-stdexec.nsys --stats=true ${BUILD_HOME}/apps/fft/fft-stdexec --sch=gpu -N ${SIZE} |& tee nsys-fft-stdexec-gpu.log
55 | 
56 | # stdpar-gpu (not sure if more than one)
57 | srun nsys profile --force-overwrite true -o fft-gpu-stdpar.nsys --stats=true ${BUILD_HOME}/apps/fft/fft-stdpar -N ${SIZE} |& tee nsys-fft-stdpar-gpu.log
58 | 
59 | # stdexec-multigpu
60 | srun nsys profile --force-overwrite true -o fft-multigpu-stdexec.nsys --stats=true ${BUILD_HOME}/apps/fft/fft-stdexec --sch=multigpu -N ${SIZE} |& tee nsys-fft-multigpu-stdexec.log
61 | 
62 | 
63 | # Run NCU (set full)
64 | # every report name ends in .ncu; the console output of each run is captured in the matching ncu-*.log
65 | # stdexec-single-gpu (full)
66 | srun ncu -f -o fft-gpu-stdexec.ncu  --target-processes all --print-summary per-gpu --replay-mode application  --set full ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=gpu |& tee ncu-fft-stdexec-gpu.log
67 | 
68 | # stdpar-gpu (full)
69 | srun ncu -f -o fft-gpu-stdpar.ncu  --target-processes all --print-summary per-gpu --replay-mode application  --set full ${BUILD_HOME}/apps/fft/fft-stdpar -N ${SIZE} |& tee ncu-fft-stdpar-gpu.log
70 | 
71 | # stdexec-multigpu (full)
72 | srun ncu -f -o fft-multigpu-stdexec.ncu  --target-processes all --print-summary per-gpu --replay-mode application  --set full ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=multigpu |& tee ncu-fft-multigpu-stdexec.log
73 | 
74 | 
75 | # Run NCU (set roofline only)
76 | 
77 | # stdexec-single-gpu (roofline)
78 | srun ncu -f -o fft-gpu-stdexec-roofline.ncu  --target-processes all --print-summary per-gpu --replay-mode application  --set roofline ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=gpu |& tee ncu-fft-stdexec-gpu-roofline.log
79 | 
80 | # stdpar-gpu (roofline)
81 | srun ncu -f -o fft-gpu-stdpar-roofline.ncu  --target-processes all --print-summary per-gpu --replay-mode application  --set roofline ${BUILD_HOME}/apps/fft/fft-stdpar -N ${SIZE} |& tee ncu-fft-stdpar-gpu-roofline.log
82 | 
83 | # stdexec-multigpu (roofline)
84 | srun ncu -f -o fft-multigpu-stdexec-roofline.ncu  --target-processes all --print-summary per-gpu --replay-mode application  --set roofline ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=multigpu |& tee ncu-fft-multigpu-stdexec-roofline.log
85 | 
86 | # resume dcgmi
87 | srun --ntasks-per-node 1 dcgmi profile --resume
88 | 


--------------------------------------------------------------------------------
/apps/mdspan-stdpar/mdspan-stdpar.cpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2023 The Regents of the University of California,
 5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
 6 |  * required approvals from the U.S. Dept. of Energy).  All rights reserved.
 7 |  *
 8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 9 |  * of this software and associated documentation files (the "Software"), to deal
10 |  * in the Software without restriction, including without limitation the rights
11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 |  * copies of the Software, and to permit persons to whom the Software is
13 |  * furnished to do so, subject to the following conditions:
14 |  *
15 |  * The above copyright notice and this permission notice shall be included in
16 |  * all copies or substantial portions of the Software.
17 |  *
18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 |  * SOFTWARE.
25 |  */
26 | 
27 | #include "commons.hpp"
28 | 
29 | using data_type = int;
30 | // 2D view
31 | using extents_type = std::extents<int, std::dynamic_extent, std::dynamic_extent>;
32 | // 3D view (fix the first dimension to 2)
33 | using extents_type2 = std::extents<int, 2, std::dynamic_extent, std::dynamic_extent>;
34 | // Demo: fill one flat buffer through a 2D and a 3D mdspan view using parallel for_each, then print a subset.
35 | int main() {
36 |     constexpr int N = 1e9;  // number of ints in the flat buffer
37 |     std::vector<data_type> v(N);
38 | 
39 |     // View the data as a 2D array of N / 2 rows by 2 columns (layout_right)
40 |     auto ms2 = std::mdspan<data_type, extents_type, std::layout_right>(v.data(), N / 2, 2);
41 |     // View the same data as a 3D array of 2 (fixed above) x N / 4 x 2
42 |     auto ms3 = std::mdspan<data_type, extents_type2, std::layout_right>(v.data(), N / 4, 2);
43 | 
44 |     // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1);
45 |     // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 =
46 |     // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1);
47 |     // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);};
48 | 
49 |     std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) {
50 |         auto global_idx = std::distance(ms2.data_handle(), &i);  // flat offset of this element in the buffer
51 |         dim2(global_idx, ms2);  // presumably defined in commons.hpp and introduces ii, ij - TODO confirm
52 |         // auto [i1, i2] = dim2(global_idx);
53 |         ms2(ii, ij) = global_idx;
54 |     });
55 | 
56 |     fmt::print("\n");
57 | 
58 |     std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) {
59 |         auto global_idx = std::distance(ms2.data_handle(), &i);  // ms2 and ms3 both wrap v.data()
60 |         dim3(global_idx, ms3);  // presumably defined in commons.hpp and introduces ii, ij, ik - TODO confirm
61 |         // auto [i1, i2, i3] = dim3(global_idx);
62 |         ms3(ii, ij, ik) = 1000 + global_idx;
63 |     });
64 | 
65 |     // read back, check and print the first 10 indices along dimension 1 through the 3D view
66 |     for (size_t i = 0; i < ms3.extent(0); i++) {
67 |         for (size_t j = 0; j < 10; j++) {
68 |             for (size_t k = 0; k < ms3.extent(2); k++) {
69 |                 assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + j * ms3.extent(2) + k);
70 |                 fmt::print("{} ", ms3(i, j, k));
71 |             }
72 |             fmt::print("\n");
73 |         }
74 |         fmt::print("\n");
75 |     }
76 | 
77 |     fmt::print("{}\n", ms3(0, 0, 1));
78 | }


--------------------------------------------------------------------------------
/apps/prefixSum/prefixSum.hpp:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * MIT License
 3 |  *
 4 |  * Copyright (c) 2023 The Regents of the University of California,
 5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
 6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
 7 |  *
 8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 9 |  * of this software and associated documentation files (the "Software"), to deal
10 |  * in the Software without restriction, including without limitation the rights
11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 |  * copies of the Software, and to permit persons to whom the Software is
13 |  * furnished to do so, subject to the following conditions:
14 |  *
15 |  * The above copyright notice and this permission notice shall be included in
16 |  * all copies or substantial portions of the Software.
17 |  *
18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 |  * SOFTWARE.
25 |  */
26 | 
27 | /*
28 |  * commons for the prefixSum codes
29 |  */
30 | 
31 | #pragma once
32 | 
33 | #include <exec/static_thread_pool.hpp>
34 | #include <stdexec/execution.hpp>
35 | 
36 | #if defined(USE_GPU)
37 | #include <nvexec/multi_gpu_context.cuh>
38 | #include <nvexec/stream_context.cuh>
39 | using namespace nvexec;
40 | #endif  //USE_GPU
41 | 
42 | #include "argparse/argparse.hpp"
43 | 
44 | #include "commons.hpp"
45 | 
46 | using namespace std;
47 | using namespace stdexec;
48 | namespace ex = stdexec;
49 | 
50 | // data type
51 | using data_t = unsigned long long;
52 | 
53 | // input arguments: command-line options registered through argparse::Args, with defaults given by set_default
54 | struct prefixSum_params_t : public argparse::Args {
55 |     int& N = kwarg("N", "array size").set_default(1e9);
56 |     bool& print_arr = flag("p,print", "print array and prefixSum");
57 |     int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency());
58 |     // the scheduler option exists only when PSUM_STDEXEC is defined; gpu choices are listed only with USE_GPU
59 | #if defined(PSUM_STDEXEC)
60 |     std::string& sch = kwarg("sch",
61 |                              "stdexec scheduler: [options: cpu"
62 | #if defined(USE_GPU)
63 |                              ", gpu, multigpu"
64 | #endif  //USE_GPU
65 |                              "]")
66 |                            .set_default("cpu");
67 | #endif  // PSUM_STDEXEC
68 | 
69 |     bool& validate = flag("validate", "validate the results");
70 |     bool& help = flag("h, help", "print help");  // NOTE(review): key has a space after the comma, unlike "p,print" - confirm argparse accepts it
71 |     bool& print_time = flag("t,time", "print prefixSum time");
72 |     bool& debug = flag("d,debug", "print internal timers and configs (if any)");
73 | };
74 | 
75 | namespace psum {
76 | template <typename T>
77 | [[nodiscard]] bool validatePrefixSum(T* in, data_t* out, size_t N) {
78 |     // Recompute the inclusive scan of in[0, N) with the parallel STL and compare it against out[0, N).
79 |     fmt::print("Validating: \n");
80 |     T* const in_end = in + N;
81 |     data_t* const out_end = out + N;
82 | 
83 |     std::vector<data_t> reference(N);
84 |     std::inclusive_scan(std::execution::par, in, in_end, reference.begin(), std::plus<>());
85 |     return std::equal(std::execution::par, out, out_end, reference.begin());
86 | }
87 | 
88 | template <typename T>
89 | void genRandomVector(T* in, int N, T lower, T upper) {
90 |     // Fill in[0, N) sequentially with integers drawn uniformly from [lower, upper].
91 |     std::random_device seed_source;
92 |     std::mt19937 engine(seed_source());
93 |     std::uniform_int_distribution<T> pick(lower, upper);
94 | 
95 |     const auto next_value = [&engine, &pick]() { return pick(engine); };
96 |     std::generate(std::execution::seq, in, in + N, next_value);
97 | }
98 | }  // namespace psum
99 | 


--------------------------------------------------------------------------------
/apps/prefixSum/prefixSum-stdpar.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * prefixSum using the C++ standard parallel algorithms (stdpar)
 29 |  */
 30 | 
 31 | #include "prefixSum.hpp"
 32 | 
 33 | //
 34 | // serial prefixSum function
 35 | //
 36 | template <typename T>
 37 | [[nodiscard]] T* prefixSum_stdpar(const T* in, const int N) {
 38 |     T* y = new T[N];
 39 |     std::inclusive_scan(std::execution::par, in, in + N, y, std::plus<>());
 40 |     return y;
 41 | }
 42 | 
 43 | //
 44 | // simulation
 45 | //
 46 | int main(int argc, char* argv[]) {
 47 |     // parse params
 48 |     const prefixSum_params_t args = argparse::parse<prefixSum_params_t>(argc, argv);
 49 | 
 50 |     // see if help wanted
 51 |     if (args.help) {
 52 |         args.print();  // prints all variables
 53 |         return 0;
 54 |     }
 55 | 
 56 |     // simulation variables
 57 |     int N = args.N;
 58 |     bool print_arr = args.print_arr;
 59 |     bool print_time = args.print_time;
 60 |     bool validate = args.validate;
 61 | 
 62 |     if (!isPowOf2(N)) {
 63 |         N = ceilPowOf2(N);
 64 |         fmt::print("INFO: N != pow(2). Setting => N = {}\n", N);
 65 |     }
 66 | 
 67 |     // input data
 68 |     data_t* in = new data_t[N];
 69 | 
 70 |     fmt::print("Progress:0%");
 71 | 
 72 |     // random number generator
 73 |     psum::genRandomVector(in, N, (data_t)0, (data_t)10);
 74 | 
 75 |     fmt::print("..50%");
 76 | 
 77 |     // output pointer
 78 |     data_t* out = nullptr;
 79 | 
 80 |     // start the timer
 81 |     Timer timer;
 82 | 
 83 |     // serial prefixSum
 84 |     out = prefixSum_stdpar(in, N);
 85 | 
 86 |     // stop timer
 87 |     auto elapsed = timer.stop();
 88 | 
 89 |     fmt::print("..100%\n");
 90 | 
 91 |     // print the input and its prefix sum (don't if N > 100)
 92 |     if (print_arr && N < 100) {
 93 |         fmt::print("int = {}\n", fmt::join(in, in + N, " "));
 94 |         fmt::print("out = {}\n", fmt::join(out, out + N, " "));
 95 |     }
 96 | 
 97 |     // print the elapsed time
 98 |     if (print_time)
 99 |         fmt::print("Elapsed Time: {:f} s\n", elapsed);
100 | 
101 |     // validate the prefixSum
102 |     if (validate) {
103 |         bool verify = psum::validatePrefixSum(in, out, N);
104 | 
105 |         if (verify)
106 |             fmt::print("SUCCESS..");
107 |         else
108 |             fmt::print("FAILED..");
109 | 
110 |         fmt::print("\n");
111 |     }
112 | 
113 |     // delete in and out
114 |     delete[] in;
115 |     delete[] out;
116 | 
117 |     // return status
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/apps/prefixSum/prefixSum-serial.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * serial prefixSum (sequential execution policy)
 29 |  */
 30 | 
 31 | #include "prefixSum.hpp"
 32 | 
 33 | //
 34 | // serial prefixSum function
 35 | //
 36 | template <typename T>
 37 | [[nodiscard]] T* prefixSum_serial(const T* in, const int N) {
 38 |     T* y = new T[N];
 39 |     std::inclusive_scan(std::execution::seq, in, in + N, y, std::plus<>());
 40 |     return y;
 41 | }
 42 | 
 43 | //
 44 | // simulation
 45 | //
 46 | int main(int argc, char* argv[]) {
 47 |     // parse params
 48 |     const prefixSum_params_t args = argparse::parse<prefixSum_params_t>(argc, argv);
 49 | 
 50 |     // see if help wanted
 51 |     if (args.help) {
 52 |         args.print();  // prints all variables
 53 |         return 0;
 54 |     }
 55 | 
 56 |     // simulation variables
 57 |     int N = args.N;
 58 |     bool print_arr = args.print_arr;
 59 |     bool print_time = args.print_time;
 60 |     bool validate = args.validate;
 61 | 
 62 |     if (!isPowOf2(N)) {
 63 |         N = ceilPowOf2(N);
 64 |         fmt::print("INFO: N != pow(2). Setting => N = {}\n", N);
 65 |     }
 66 | 
 67 |     // input data
 68 |     data_t* in = new data_t[N];
 69 | 
 70 |     fmt::print("Progress:0%");
 71 | 
 72 |     // random number generator
 73 |     psum::genRandomVector(in, N, (data_t)0, (data_t)10);
 74 | 
 75 |     fmt::print("..50%");
 76 | 
 77 |     // output pointer
 78 |     data_t* out = nullptr;
 79 | 
 80 |     // start the timer
 81 |     Timer timer;
 82 | 
 83 |     // serial prefixSum
 84 |     out = prefixSum_serial(in, N);
 85 | 
 86 |     // stop timer
 87 |     auto elapsed = timer.stop();
 88 | 
 89 |     fmt::print("..100%\n");
 90 | 
 91 |     // print the input and its prefix sum (don't if N > 100)
 92 |     if (print_arr && N < 100) {
 93 |         fmt::print("int = {}\n", fmt::join(in, in + N, " "));
 94 |         fmt::print("out = {}\n", fmt::join(out, out + N, " "));
 95 |     }
 96 | 
 97 |     // print the elapsed time
 98 |     if (print_time)
 99 |         fmt::print("Elapsed Time: {:f} s\n", elapsed);
100 | 
101 |     // validate the prefixSum
102 |     if (validate) {
103 |         bool verify = psum::validatePrefixSum(in, out, N);
104 | 
105 |         if (verify) {
106 |             fmt::print("SUCCESS..");
107 |         } else {
108 |             fmt::print("FAILED..");
109 |         }
110 | 
111 |         fmt::print("\n");
112 |     }
113 | 
114 |     // delete in and out
115 |     delete[] in;
116 |     delete[] out;
117 | 
118 |     // return status
119 |     return 0;
120 | }
121 | 


--------------------------------------------------------------------------------
/include/counting_iterator.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).  All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | //
 28 | // counting_iterator taken from
 29 | // https://github.com/LLNL/LULESH/blob/2.0.2-dev/stdpar/src/lulesh.h#L687
 30 | //
 31 | 
 32 | #pragma once
 33 | 
 34 | #include "commons.hpp"
 35 | 
 36 | using Index_t = int32_t;
 37 | 
 38 | struct counting_iterator {
 39 |    private:
 40 |     using self = counting_iterator;
 41 | 
 42 |    public:
 43 |     using value_type = Index_t;
 44 |     using difference_type = typename std::make_signed<Index_t>::type;
 45 |     using pointer = Index_t*;
 46 |     using reference = Index_t&;
 47 |     using iterator_category = std::random_access_iterator_tag;
 48 | 
 49 |     counting_iterator() : value(0) {}
 50 | 
 51 |     explicit counting_iterator(value_type v) : value(v) {}
 52 | 
 53 |     value_type operator*() const { return value; }
 54 | 
 55 |     value_type operator[](difference_type n) const { return value + n; }
 56 | 
 57 |     self& operator++() {
 58 |         ++value;
 59 |         return *this;
 60 |     }
 61 | 
 62 |     self operator++(int) {
 63 |         self result{value};
 64 |         ++value;
 65 |         return result;
 66 |     }
 67 | 
 68 |     self& operator--() {
 69 |         --value;
 70 |         return *this;
 71 |     }
 72 | 
 73 |     self operator--(int) {
 74 |         self result{value};
 75 |         --value;
 76 |         return result;
 77 |     }
 78 | 
 79 |     self& operator+=(difference_type n) {
 80 |         value += n;
 81 |         return *this;
 82 |     }
 83 | 
 84 |     self& operator-=(difference_type n) {
 85 |         value -= n;
 86 |         return *this;
 87 |     }
 88 | 
 89 |     friend self operator+(self const& i, difference_type n) { return self(i.value + n); }
 90 | 
 91 |     friend self operator+(difference_type n, self const& i) { return self(i.value + n); }
 92 | 
 93 |     friend difference_type operator-(self const& x, self const& y) { return x.value - y.value; }
 94 | 
 95 |     friend self operator-(self const& i, difference_type n) { return self(i.value - n); }
 96 | 
 97 |     friend bool operator==(self const& x, self const& y) { return x.value == y.value; }
 98 | 
 99 |     friend bool operator!=(self const& x, self const& y) { return x.value != y.value; }
100 | 
101 |     friend bool operator<(self const& x, self const& y) { return x.value < y.value; }
102 | 
103 |     friend bool operator<=(self const& x, self const& y) { return x.value <= y.value; }
104 | 
105 |     friend bool operator>(self const& x, self const& y) { return x.value > y.value; }
106 | 
107 |     friend bool operator>=(self const& x, self const& y) { return x.value >= y.value; }
108 | 
109 |    private:
110 |     value_type value;
111 | };


--------------------------------------------------------------------------------
/apps/1d-stencil/1d-cuda.cpp:
--------------------------------------------------------------------------------
  1 | #include "argparse/argparse.hpp"
  2 | #include "commons.hpp"
  3 | 
  4 | #include <cuda_runtime.h>
  5 | #include <thrust/execution_policy.h>
  6 | #include <thrust/sequence.h>
  7 | 
  8 | // parameters
  9 | struct args_params_t : public argparse::Args {
 10 |     bool& results = kwarg("results", "print generated results (default: false)").set_default(false);
 11 |     std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
 12 |     std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
 13 |     bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
 14 |     double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
 15 |     double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
 16 |     bool& help = flag("h, help", "print help");
 17 |     bool& time = kwarg("t, time", "print time").set_default(true);
 18 | };
 19 | 
 20 | using Real_t = double;
 21 | 
 22 | using view_1d = std::extents<int, std::dynamic_extent>;
 23 | typedef std::mdspan<Real_t, view_1d, std::layout_right> space;
 24 | 
 25 | ///////////////////////////////////////////////////////////////////////////////
 26 | // Simulation constants (fixed at compile time; these are the values passed to the kernel)
 27 | constexpr Real_t k = 0.5;  // heat transfer coefficient
 28 | constexpr Real_t dt = 1.;  // time step
 29 | constexpr Real_t dx = 1.;  // grid spacing
 30 | 
 31 | // Our operator
 32 | __device__ Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k, const Real_t dt,
 33 |                        const Real_t dx) {
 34 |     return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
 35 | }
 36 | 
 37 | __global__ void heat_equation(Real_t* current, Real_t* next, std::size_t size, const Real_t k, const Real_t dt,
 38 |                               const Real_t dx) {
 39 |     std::size_t i = blockIdx.x * blockDim.x + threadIdx.x;
 40 | 
 41 |     if (i < size) {
 42 |         std::size_t left = (i == 0) ? size - 1 : i - 1;
 43 |         std::size_t right = (i == size - 1) ? 0 : i + 1;
 44 |         next[i] = heat(current[left], current[i], current[right], k, dt, dx);
 45 |     }
 46 | }
 47 | 
 48 | int benchmark(args_params_t const& args) {
 49 |     std::uint64_t size = args.size;  // Number of elements.
 50 |     std::uint64_t nt = args.nt;      // Number of steps.
 51 | 
 52 |     Real_t* h_current = nullptr;
 53 |     Real_t* h_next = nullptr;
 54 | 
 55 |     // Measure execution time.
 56 |     Timer timer;
 57 | 
 58 |     // Memory allocation
 59 |     if (args.results) {
 60 |         h_current = new Real_t[size];
 61 |         h_next = new Real_t[size];
 62 |     }
 63 | 
 64 |     Real_t* d_current;
 65 |     Real_t* d_next;
 66 |     cudaMalloc(&d_current, size * sizeof(Real_t));
 67 |     cudaMalloc(&d_next, size * sizeof(Real_t));
 68 |     thrust::sequence(thrust::device, d_current, d_current + size, 0);
 69 |     thrust::sequence(thrust::device, d_next, d_next + size, 0);
 70 | 
 71 |     // CUDA kernel execution parameters
 72 |     const int threadsPerBlock = std::min(1024, (int)size);
 73 |     const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock;
 74 | 
 75 |     // Actual time step loop
 76 |     for (std::size_t t = 0; t < nt; ++t) {
 77 |         heat_equation<<<blocks, threadsPerBlock>>>(d_current, d_next, size, k, dt, dx);
 78 |         std::swap(d_current, d_next);
 79 |     }
 80 |     cudaDeviceSynchronize();
 81 |     auto time = timer.stop();
 82 | 
 83 |     if (args.results) {
 84 |         // Copy result back to host
 85 |         cudaMemcpy(h_current, d_current, size * sizeof(Real_t), cudaMemcpyDeviceToHost);
 86 | 
 87 |         // Print results
 88 |         if (args.results) {
 89 |             auto h_current_mds = space(h_current, size);
 90 |             fmt::println("{::f}", h_current_mds);
 91 |         }
 92 |         // Cleanup
 93 |         delete[] h_current;
 94 |         delete[] h_next;
 95 |     }
 96 | 
 97 |     cudaFree(d_current);
 98 |     cudaFree(d_next);
 99 | 
100 |     if (args.time) {
101 |         fmt::print("Duration: {:f} ms\n", time);
102 |     }
103 | 
104 |     return 0;
105 | }
106 | 
107 | int main(int argc, char* argv[]) {
108 |     // parse params
109 |     args_params_t args = argparse::parse<args_params_t>(argc, argv);
110 |     // see if help wanted
111 |     if (args.help) {
112 |         args.print();  // prints all variables
113 |         return 0;
114 |     }
115 | 
116 |     benchmark(args);
117 | 
118 |     return 0;
119 | }
120 | 


--------------------------------------------------------------------------------
/apps/comm-study/comm-study-no-senders.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).  All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | #include "commons.hpp"
 28 | //
 29 | // Build and run on Perlmutter using
 30 | // ml nvhpc/23.1 ; nvc++ -stdpar=gpu -std=c++20 -o hpcpp.out
 31 | // ./hpcpp_nosenders.cpp && nsys profile --stats=true ./hpcpp.out
 32 | //
 33 | 
 34 | using T = double;
 35 | using time_point_t = std::chrono::system_clock::time_point;
 36 | 
 37 | // must take in the pointers/vectors by reference
 38 | template <typename P>
 39 | auto work(P& A, P& B, P& Y, int N) {
 40 |     // init A and B separately - will it cause an H2D copy?
 41 |     std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); });
 42 | 
 43 |     T sum = 0.0;
 44 | 
 45 |     for (int i = 0; i < N / 3; i++) {
 46 |         // read only or read-write operations
 47 |         sum += A[i] / N;
 48 | 
 49 |         // this line if commented should not result in an H2D after this but it
 50 |         // does.
 51 |         // A[i] = sin(M_PI/4);
 52 |     }
 53 | 
 54 |     fmt::print("\n");
 55 | 
 56 |     // will it cause an H2D here?
 57 |     std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); });
 58 | 
 59 |     // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B)
 60 | 
 61 |     std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2],
 62 |                    [&](T& ai, T& bi) { return ai + bi; });
 63 |     std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0],
 64 |                    [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); });
 65 | 
 66 |     // should trigger a D2H copy of N/5 elements
 67 |     for (int i = 0; i < N / 3; i++)
 68 |         sum += Y[i] / N;
 69 | 
 70 |     fmt::print("\n");
 71 | 
 72 |     // get sum(Y) - one last memcpy (not USM) D2H
 73 |     sum += std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus<T>());
 74 | 
 75 |     return sum / N;
 76 | }
 77 | 
 78 | int main(int argc, char* argv[]) {
 79 |     constexpr int N = 1e9;
 80 |     time_point_t mark = std::chrono::system_clock::now();
 81 |     auto es = std::chrono::duration<double>(std::chrono::system_clock::now() - mark).count();
 82 |     T sum = 0;
 83 | 
 84 | #if 1  // 0 if only want to run with pointers
 85 |     std::vector<T> A(N);
 86 |     std::vector<T> B(N);
 87 |     std::vector<T> Y(N);
 88 | 
 89 |     mark = std::chrono::system_clock::now();
 90 |     sum = work(A, B, Y, N);
 91 |     es = std::chrono::duration<double>(std::chrono::system_clock::now() - mark).count();
 92 |     fmt::print("Vectors: Elapsed Time: {:f}s\n", es);
 93 | 
 94 | #endif
 95 | 
 96 | #if 1  // 0 if only want to run with vectors
 97 | 
 98 |     // allocate memory - where is this allocated?
 99 |     T* a = new T[N];
100 |     T* b = new T[N];
101 |     T* y = new T[N];
102 | 
103 |     sum = 0;
104 |     mark = std::chrono::system_clock::now();
105 |     sum = work(a, b, y, N);
106 |     es = std::chrono::duration<double>(std::chrono::system_clock::now() - mark).count();
107 |     fmt::print("Pointers: Elapsed Time: {:f}s\n\n", es);
108 | #endif
109 | 
110 |     // do not use scientific notation
111 |     fmt::print("sum: {}\n", sum);
112 | 
113 |     return 0;
114 | }


--------------------------------------------------------------------------------
/apps/heat-equation/heat-equation.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * commons for the heat equation examples
 29 |  */
 30 | 
 31 | #pragma once
 32 | 
 33 | #include <exec/static_thread_pool.hpp>
 34 | #include <stdexec/execution.hpp>
 35 | 
 36 | #if defined(USE_GPU)
 37 | #include <nvexec/multi_gpu_context.cuh>
 38 | #include <nvexec/stream_context.cuh>
 39 | using namespace nvexec;
 40 | #endif  //USE_GPU
 41 | 
 42 | #include "argparse/argparse.hpp"
 43 | #include "commons.hpp"
 44 | 
 45 | using namespace std;
 46 | using namespace stdexec;
 47 | using stdexec::sync_wait;
 48 | 
 49 | namespace ex = stdexec;
 50 | 
 51 | // data type
 52 | using Real_t = double;
 53 | 
 54 | // number of dimensions
 55 | constexpr int dims = 2;
 56 | 
 57 | // total number of ghost cells = ghosts x dims
 58 | constexpr int ghost_cells = 1;
 59 | constexpr int nghosts = ghost_cells * dims;
 60 | 
 61 | // 2D view
 62 | using view_2d = std::extents<int, std::dynamic_extent, std::dynamic_extent>;
 63 | 
 64 | // 3D view
 65 | using view_3d = std::extents<int, std::dynamic_extent, std::dynamic_extent, std::dynamic_extent>;
 66 | 
 67 | // macros to get x and y positions from indices
 68 | #define pos(i, ghosts, dx) -0.5 + dx*(i - ghosts)
 69 | 
// parameters
//
// Command-line options of the heat equation examples. Every member is a
// reference bound to an option registered with argparse: kwarg() registers an
// option that carries a value, flag() registers a boolean switch.
struct heat_params_t : public argparse::Args {
    // cells per side of the domain (default 32)
    int& ncells = kwarg("n,ncells", "number of cells on each side of the domain").set_default(32);
    // number of simulation steps (default 100)
    int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100);

    // thread count: only declared when HEQ_OMP or HEQ_STDEXEC is defined;
    // defaults to std::thread::hardware_concurrency()
#if defined(HEQ_OMP) || defined(HEQ_STDEXEC)
    int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency());
#endif  // HEQ_OMP || HEQ_STDEXEC

    // scheduler option: only declared when HEQ_STDEXEC is defined. The help
    // text lists the gpu and multigpu choices only when USE_GPU is defined too.
#if defined(HEQ_STDEXEC)
    std::string& sch = kwarg("sch",
                             "stdexec scheduler: [options: cpu"
#if defined(USE_GPU)
                             ", gpu, multigpu"
#endif  //USE_GPU
                             "]")
                           .set_default("cpu");
#endif  // HEQ_STDEXEC

    // thermal diffusivity; the default is written as the float literal 0.5f
    Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f);
    // time step; the default is written as the float literal 5.0e-5f
    Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f);
    // request the option listing
    bool& help = flag("h,help", "print help");
    // request printing of the grids
    bool& print_grid = flag("p,print", "print grids at step 0 and step n");
    // request printing of the simulation time
    bool& print_time = flag("time", "print simulation time");
};
 95 | 
 96 | // template printGrid
 97 | template <typename T>
 98 | void printGrid(T* grid, int len) {
 99 |     auto view = std::mdspan<T, view_2d, std::layout_right>(grid, len, len);
100 |     fmt::print("Grid: \n");
101 |     fmt::println("{::.2f}", view);
102 | }
103 | 
104 | // fill boundary cells
105 | template <typename T>
106 | void fill2Dboundaries(T* grid, int len, int ghost_cells = 1) {
107 |     std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), len - nghosts, [=](auto i) {
108 |         grid[i] = grid[i + (ghost_cells * len)];
109 |         grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))];
110 | 
111 |         grid[i * len] = grid[(ghost_cells * len) + i];
112 |         grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)];
113 |     });
114 | }
115 | 


--------------------------------------------------------------------------------
/apps/choleskey/choleskey_serial.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 Chuanqiu He 
  5 |  * Copyright (c) 2023 Weile Wei 
  6 |  * Copyright (c) 2023 The Regents of the University of California,
  7 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  8 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  9 |  *
 10 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 11 |  * of this software and associated documentation files (the "Software"), to deal
 12 |  * in the Software without restriction, including without limitation the rights
 13 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 14 |  * copies of the Software, and to permit persons to whom the Software is
 15 |  * furnished to do so, subject to the following conditions:
 16 |  *
 17 |  * The above copyright notice and this permission notice shall be included in
 18 |  * all copies or substantial portions of the Software.
 19 |  *
 20 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 21 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 22 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 23 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 24 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 25 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 26 |  * SOFTWARE.
 27 |  */
 28 | //
 29 | // This example provides a serial(mdspan) implementation for choleskey decomposition code.
 30 | 
 31 | #include <bits/stdc++.h>
 32 | #include <vector>
 33 | #include "argparse/argparse.hpp"
 34 | #include "commons.hpp"
 35 | #include "matrixutil.hpp"
 36 | 
 37 | using namespace std;
 38 | 
 39 | struct solver {
 40 | 
 41 |     using view_2d = std::extents<int, std::dynamic_extent, std::dynamic_extent>;
 42 | 
 43 |     typedef std::mdspan<int, view_2d, std::layout_right> matrix_ms_t;
 44 | 
 45 |     template <typename T>
 46 |     matrix_ms_t Cholesky_Decomposition(std::vector<T>& vec, int n) {
 47 |         std::vector<T> lower(n * n, 0);
 48 | 
 49 |         auto matrix_ms = std::mdspan<T, view_2d, std::layout_right>(vec.data(), n, n);
 50 |         auto lower_ms = std::mdspan<T, view_2d, std::layout_right>(lower.data(), n, n);
 51 | 
 52 |         // Decomposing a matrix into Lower Triangular
 53 |         for (int i = 0; i < matrix_ms.extent(0); i++) {
 54 |             for (int j = 0; j <= i; j++) {
 55 |                 T sum = 0;
 56 | 
 57 |                 if (j == i) {
 58 |                     // summation for diagonals
 59 |                     for (int k = 0; k < j; k++)
 60 |                         sum += pow(lower_ms(j, k), 2);
 61 |                     lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum);
 62 |                 } else {
 63 |                     // Evaluating L(i, j) using L(j, j)
 64 |                     for (int k = 0; k < j; k++)
 65 |                         sum += (lower_ms(i, k) * lower_ms(j, k));
 66 |                     lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j);
 67 |                 }
 68 |             }
 69 |         }
 70 |         return lower_ms;
 71 |     }
 72 | };
 73 | 
 74 | ///////////////////////////////////////////////////////////////////////////////
 75 | int benchmark(args_params_t const& args) {
 76 | 
 77 |     std::uint64_t nd = args.nd;  // Number of matrix dimension.
 78 | 
 79 |     std::vector<int> inputMatrix = generate_pascal_matrix<int>(nd);
 80 | 
 81 |     // Create the solverobject
 82 |     solver solve;
 83 |     // Measure execution time.
 84 |     Timer timer;
 85 |     // start decomposation
 86 |     auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd);
 87 |     auto time = timer.stop();
 88 | 
 89 |     // Print the final results
 90 |     if (args.results) {
 91 |         // Displaying Lower Triangular and its Transpose
 92 |         fmt::print("{:>6} {:>30}\n", "Lower Triangular", "Transpose");
 93 |         for (int i = 0; i < nd; i++) {
 94 |             // Lower Triangular
 95 |             for (int j = 0; j < nd; j++)
 96 |                 fmt::print("{:>6}\t", res_matrix(i, j));
 97 |             fmt::print("\t");
 98 | 
 99 |             // Transpose of Lower Triangular
100 |             for (int j = 0; j < nd; j++)
101 |                 fmt::print("{:>6}\t", res_matrix(j, i));
102 |             fmt::print("\n");
103 |         }
104 |     }
105 | 
106 |     if (args.time) {
107 |         fmt::print("Duration: {:f} ms\n", time);
108 |     }
109 | 
110 |     return 0;
111 | }
112 | 
113 | // Driver Code for testing
114 | int main(int argc, char* argv[]) {
115 | 
116 |     // parse params
117 |     args_params_t args = argparse::parse<args_params_t>(argc, argv);
118 |     // see if help wanted
119 |     if (args.help) {
120 |         args.print();  // prints all variables
121 |         return 0;
122 |     }
123 | 
124 |     benchmark(args);
125 | 
126 |     return 0;
127 | }
128 | 


--------------------------------------------------------------------------------
/apps/fft/fft-serial.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 * serial radix-2 fft implementation and its driver
 29 |  */
 30 | 
 31 | #include "fft.hpp"
 32 | 
 33 | //
 34 | // serial fft function
 35 | //
 36 | [[nodiscard]] std::vector<data_t> fft_serial(const data_t* x, const int N, bool debug = false) {
 37 |     std::vector<data_t> x_r(N);
 38 | 
 39 |     // bit shift
 40 |     int shift = 32 - ilog2(N);
 41 | 
 42 |     // twiddle data in x[n]
 43 |     for (int k = 0; k < N; k++) {
 44 |         auto new_idx = reverse_bits32(k) >> shift;
 45 |         x_r[k] = x[new_idx];
 46 |     }
 47 | 
 48 |     // niterations
 49 |     int niters = ilog2(N);
 50 |     // local merge partition size
 51 |     int lN = 2;
 52 | 
 53 |     fmt::print("FFT progress: ");
 54 | 
 55 |     for (int k = 0; k < niters; k++, lN *= 2) {
 56 |         fmt::print("{:f}%..", (100.0 * k) / niters);
 57 | 
 58 |         static Timer dtimer;
 59 | 
 60 |         // number of partitions
 61 |         int nparts = N / lN;
 62 |         int tpp = lN / 2;
 63 | 
 64 |         if (debug)
 65 |             dtimer.start();
 66 | 
 67 |         // merge
 68 |         for (int k = 0; k < N / 2; k++) {
 69 |             // compute indices
 70 |             int e = (k / tpp) * lN + (k % tpp);
 71 |             auto o = e + tpp;
 72 |             auto i = (k % tpp);
 73 |             auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts);
 74 |             x_r[o] = x_r[e] - x_r[o] * WNk(N, i * nparts);
 75 |             x_r[e] = tmp;
 76 |         }
 77 | 
 78 |         if (debug) {
 79 |             fmt::print("This iter time: {:f} ms\n", dtimer.stop());
 80 |         }
 81 |     }
 82 | 
 83 |     fmt::print("100%\n");
 84 |     return x_r;
 85 | }
 86 | 
 87 | //
 88 | // simulation
 89 | //
 90 | int main(int argc, char* argv[]) {
 91 |     // parse params
 92 |     const fft_params_t args = argparse::parse<fft_params_t>(argc, argv);
 93 | 
 94 |     // see if help wanted
 95 |     if (args.help) {
 96 |         args.print();  // prints all variables
 97 |         return 0;
 98 |     }
 99 | 
100 |     // simulation variables
101 |     int N = args.N;
102 |     sig_type_t sig_type = getSignal(args.sig);
103 |     //int freq = args.freq;
104 |     bool print_sig = args.print_sig;
105 |     bool print_time = args.print_time;
106 |     bool validate = args.validate;
107 | 
108 |     // x[n] signal
109 |     sig_t x_n(N, sig_type);
110 | 
111 |     if (!isPowOf2(N)) {
112 |         N = ceilPowOf2(N);
113 |         fmt::print("INFO: N is not a power of 2. Padding zeros => N = {}\n", N);
114 | 
115 |         x_n.resize(N);
116 |     }
117 | 
118 |     if (print_sig) {
119 |         fmt::print("x[n] = ");
120 |         x_n.printSignal();
121 |     }
122 | 
123 |     // niterations
124 |     int niters = ilog2(N);
125 | 
126 |     // start the timer
127 |     Timer timer;
128 | 
129 |     // fft radix-2 algorithm
130 |     // y[n] = fft(x[n]);
131 |     sig_t y_n(std::move(fft_serial(x_n.data(), N, args.debug)));
132 | 
133 |     // stop timer
134 |     auto elapsed = timer.stop();
135 | 
136 |     // print the fft(x)
137 |     if (print_sig) {
138 |         fmt::print("X(k) = ");
139 |         y_n.printSignal();
140 |     }
141 | 
142 |     // print the computation time
143 |     if (print_time) {
144 |         fmt::print("Duration: {:f} ms\n", elapsed);
145 |     }
146 | 
147 |     // validate the recursively computed fft
148 |     if (validate) {
149 |         if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler())) {
150 |             fmt::print("SUCCESS: y[n] == fft(x[n])\n");
151 |         } else {
152 |             fmt::print("FAILED: y[n] != fft(x[n])\n");
153 |         }
154 |     }
155 | 
156 |     return 0;
157 | }
158 | 


--------------------------------------------------------------------------------
/apps/choleskey/choleskey_stdpar.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 Chuanqiu He 
  5 |  * Copyright (c) 2023 Weile Wei 
  6 |  * Copyright (c) 2023 The Regents of the University of California,
  7 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  8 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  9 |  *
 10 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 11 |  * of this software and associated documentation files (the "Software"), to deal
 12 |  * in the Software without restriction, including without limitation the rights
 13 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 14 |  * copies of the Software, and to permit persons to whom the Software is
 15 |  * furnished to do so, subject to the following conditions:
 16 |  *
 17 |  * The above copyright notice and this permission notice shall be included in
 18 |  * all copies or substantial portions of the Software.
 19 |  *
 20 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 21 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 22 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 23 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 24 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 25 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 26 |  * SOFTWARE.
 27 |  */
 28 | //
 29 | // This example provides a stdpar implementation for choleskey decomposition code.
 30 | 
 31 | #include <algorithm>
 32 | #include <execution>
 33 | #include <iostream>
 34 | #include <numeric>
 35 | #include <vector>
 36 | #include "argparse/argparse.hpp"
 37 | #include "commons.hpp"
 38 | #include "matrixutil.hpp"
 39 | 
 40 | using namespace std;
 41 | 
 42 | struct solver {
 43 | 
 44 |     using view_2d = std::extents<int, std::dynamic_extent, std::dynamic_extent>;
 45 | 
 46 |     template <typename T>
 47 |     std::vector<std::vector<T>> Cholesky_Decomposition(std::vector<T>& vec, int n) {
 48 |         std::vector<std::vector<T>> lower(n, std::vector<T>(n, 0));
 49 | 
 50 |         auto matrix_ms = std::mdspan<T, view_2d, std::layout_right>(vec.data(), n, n);
 51 | 
 52 |         auto multiplier_lambda = [=](auto a, auto b) {
 53 |             return a * b;
 54 |         };
 55 | 
 56 |         // Decomposing a matrix into Lower Triangular
 57 |         for (int i = 0; i < matrix_ms.extent(0); i++) {
 58 |             for (int j = 0; j <= i; j++) {
 59 |                 T sum = 0;
 60 | 
 61 |                 if (j == i)  // summation for diagonals
 62 |                 {
 63 |                     sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, 0,
 64 |                                                 std::plus{}, [=](int val) { return val * val; });
 65 | 
 66 |                     lower[j][j] = std::sqrt(matrix_ms(i, j) - sum);
 67 | 
 68 |                 } else {  // Evaluating L(i, j) using L(j, j)
 69 | 
 70 |                     sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j,
 71 |                                                 lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda);
 72 | 
 73 |                     lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j];
 74 |                 }
 75 |             }
 76 |         }
 77 |         return lower;
 78 |     }
 79 | };
 80 | 
 81 | ///////////////////////////////////////////////////////////////////////////////
 82 | int benchmark(args_params_t const& args) {
 83 | 
 84 |     std::uint64_t nd = args.nd;  // Number of matrix dimension.
 85 | 
 86 |     std::vector<int> inputMatrix = generate_pascal_matrix<int>(nd);
 87 | 
 88 |     // Create the solver object
 89 |     solver solve;
 90 |     // Measure execution time.
 91 |     Timer timer;
 92 | 
 93 |     // start decomposation
 94 |     auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd);
 95 |     auto time = timer.stop();
 96 | 
 97 |     // Print the final results
 98 |     if (args.results) {
 99 |         // Displaying Lower Triangular and its Transpose
100 |         fmt::print("{:>6} {:>30}\n", "Lower Triangular", "Transpose");
101 |         for (int i = 0; i < nd; i++) {
102 |             // Lower Triangular
103 |             for (int j = 0; j < nd; j++)
104 |                 fmt::print("{:>6}\t", res_matrix[i][j]);
105 |             fmt::print("\t");
106 | 
107 |             // Transpose of Lower Triangular
108 |             for (int j = 0; j < nd; j++)
109 |                 fmt::print("{:>6}\t", res_matrix[j][i]);
110 |             fmt::print("\n");
111 |         }
112 |     }
113 | 
114 |     if (args.time) {
115 |         fmt::print("Duration: {:f} ms\n", time);
116 |     }
117 | 
118 |     return 0;
119 | }
120 | 
121 | // Driver Code for testing
122 | int main(int argc, char* argv[]) {
123 | 
124 |     // parse params
125 |     args_params_t args = argparse::parse<args_params_t>(argc, argv);
126 |     // see if help wanted
127 |     if (args.help) {
128 |         args.print();  // prints all variables
129 |         return 0;
130 |     }
131 | 
132 |     benchmark(args);
133 | 
134 |     return 0;
135 | }
136 | 


--------------------------------------------------------------------------------
/apps/1d-stencil/1d-omp.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 Weile Wei 
  5 |  * Copyright (c) 2023 The Regents of the University of California,
  6 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  7 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  8 |  *
  9 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 10 |  * of this software and associated documentation files (the "Software"), to deal
 11 |  * in the Software without restriction, including without limitation the rights
 12 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 13 |  * copies of the Software, and to permit persons to whom the Software is
 14 |  * furnished to do so, subject to the following conditions:
 15 |  *
 16 |  * The above copyright notice and this permission notice shall be included in
 17 |  * all copies or substantial portions of the Software.
 18 |  *
 19 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 20 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 21 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 22 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 23 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 24 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 25 |  * SOFTWARE.
 26 |  */
 27 | #include <omp.h>
 28 | #include "argparse/argparse.hpp"
 29 | #include "commons.hpp"
 30 | 
 31 | // parameters
 32 | struct args_params_t : public argparse::Args {
 33 |     bool& results = kwarg("results", "print generated results (default: false)").set_default(false);
 34 |     std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
 35 |     std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
 36 |     int& nthreads = kwarg("nthreads", "Number of openmp threads").set_default(1);
 37 |     bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
 38 |     double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
 39 |     double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
 40 |     bool& help = flag("h, help", "print help");
 41 |     bool& time = kwarg("t, time", "print time").set_default(true);
 42 | };
 43 | 
 44 | using Real_t = double;
 45 | ///////////////////////////////////////////////////////////////////////////////
// Compile-time model constants. The stepper below uses these values (they are
// the defaults of heat() and what do_work() passes to it); the k/dt/dx
// command-line options declared in args_params_t are not read in this file.
constexpr Real_t k = 0.5;  // heat transfer coefficient
constexpr Real_t dt = 1.;  // time step
constexpr Real_t dx = 1.;  // grid spacing
 50 | 
 51 | ///////////////////////////////////////////////////////////////////////////////
 52 | //[stepper_1
 53 | struct stepper {
 54 |     // Our operator
 55 |     Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k,
 56 |                 const Real_t dt = ::dt, const Real_t dx = ::dx) {
 57 |         return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
 58 |     }
 59 | 
 60 |     // do all the work on 'size' data points for 'nt' time steps
 61 |     [[nodiscard]] std::vector<Real_t> do_work(const std::size_t size, const std::size_t nt, const int nthreads) {
 62 |         std::vector<Real_t> current(size);
 63 |         std::vector<Real_t> next(size);
 64 | 
 65 | #pragma omp parallel for num_threads(nthreads)
 66 |         for (std::size_t i = 0; i < size; ++i) {
 67 |             current[i] = Real_t(i);
 68 |         }
 69 | 
 70 |         // Actual time step loop
 71 |         for (std::size_t t = 0; t != nt; ++t) {
 72 | // OpenMP parallel for loop
 73 | #pragma omp parallel for num_threads(nthreads)
 74 |             for (std::size_t i = 0; i < size; ++i) {
 75 |                 std::size_t left = (i == 0) ? size - 1 : i - 1;
 76 |                 std::size_t right = (i == size - 1) ? 0 : i + 1;
 77 |                 next[i] = heat(current[left], current[i], current[right], k, dt, dx);
 78 |             }
 79 |             std::swap(current, next);
 80 |         }
 81 | 
 82 |         return current;
 83 |     }
 84 | };
 85 | 
 86 | ///////////////////////////////////////////////////////////////////////////////
 87 | int benchmark(args_params_t const& args) {
 88 |     std::uint64_t size = args.size;  // Number of elements.
 89 |     std::uint64_t nt = args.nt;      // Number of steps.
 90 |     int nthreads = args.nthreads;
 91 | 
 92 |     // Create the stepper object
 93 |     stepper step;
 94 | 
 95 |     // Measure execution time.
 96 |     Timer timer;
 97 | 
 98 |     auto solution = step.do_work(size, nt, nthreads);
 99 |     auto time = timer.stop();
100 | 
101 |     // Print the final solution
102 |     if (args.results) {
103 |         fmt::println("{::f}", solution);
104 |     }
105 | 
106 |     if (args.time) {
107 |         fmt::print("Duration: {:f} ms\n", time);
108 |     }
109 | 
110 |     return 0;
111 | }
112 | 
113 | int main(int argc, char* argv[]) {
114 |     // parse params
115 |     args_params_t args = argparse::parse<args_params_t>(argc, argv);
116 |     // see if help wanted
117 |     if (args.help) {
118 |         args.print();  // prints all variables
119 |         return 0;
120 |     }
121 | 
122 |     benchmark(args);
123 | 
124 |     return 0;
125 | }
126 | 


--------------------------------------------------------------------------------
/apps/1d-stencil/1d-serial.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 Weile Wei
  5 |  * Copyright (c) 2023 The Regents of the University of California,
  6 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  7 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  8 |  *
  9 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 10 |  * of this software and associated documentation files (the "Software"), to deal
 11 |  * in the Software without restriction, including without limitation the rights
 12 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 13 |  * copies of the Software, and to permit persons to whom the Software is
 14 |  * furnished to do so, subject to the following conditions:
 15 |  *
 16 |  * The above copyright notice and this permission notice shall be included in
 17 |  * all copies or substantial portions of the Software.
 18 |  *
 19 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 20 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 21 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 22 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 23 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 24 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 25 |  * SOFTWARE.
 26 |  */
 27 | #include "argparse/argparse.hpp"
 28 | #include "commons.hpp"
 29 | 
 30 | // parameters
 31 | struct args_params_t : public argparse::Args {
 32 |     bool& results = kwarg("results", "print generated results (default: false)").set_default(false);
 33 |     std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
 34 |     std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
 35 |     bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
 36 |     double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
 37 |     double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
 38 |     bool& help = flag("h, help", "print help");
 39 |     bool& time = kwarg("t, time", "print time").set_default(true);
 40 | };
 41 | 
 42 | using Real_t = double;
 43 | ///////////////////////////////////////////////////////////////////////////////
// Compile-time model constants. The stepper below uses these values (they are
// the defaults of heat() and what do_work() passes to it); the k/dt/dx
// command-line options declared in args_params_t are not read in this file.
constexpr Real_t k = 0.5;  // heat transfer coefficient
constexpr Real_t dt = 1.;  // time step
constexpr Real_t dx = 1.;  // grid spacing
 48 | 
 49 | ///////////////////////////////////////////////////////////////////////////////
 50 | //[stepper_1
 51 | struct stepper {
 52 |     using view_1d = std::extents<int, std::dynamic_extent>;
 53 |     typedef std::mdspan<Real_t, view_1d, std::layout_right> space;
 54 | 
 55 |     void init_value(auto& data, const std::size_t size) {
 56 |         for (std::size_t i = 0; i != size; ++i) {
 57 |             data[i] = Real_t(i);
 58 |         }
 59 |     }
 60 | 
 61 |     // Our operator
 62 |     Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k,
 63 |                 const Real_t dt = ::dt, const Real_t dx = ::dx) {
 64 |         return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
 65 |     }
 66 | 
 67 |     // do all the work on 'size' data points for 'nt' time steps
 68 |     [[nodiscard]] space do_work(const std::size_t size, const std::size_t nt) {
 69 |         Real_t* current_ptr = new Real_t[size];
 70 |         Real_t* next_ptr = new Real_t[size];
 71 |         auto current = space(current_ptr, size);
 72 |         auto next = space(next_ptr, size);
 73 | 
 74 |         init_value(current, size);
 75 | 
 76 |         // Actual time step loop
 77 |         for (std::size_t t = 0; t != nt; ++t) {
 78 |             for (std::size_t i = 0; i < size; ++i) {
 79 |                 std::size_t left = (i == 0) ? size - 1 : i - 1;
 80 |                 std::size_t right = (i == size - 1) ? 0 : i + 1;
 81 |                 next[i] = heat(current[left], current[i], current[right], k, dt, dx);
 82 |             }
 83 |             std::swap(current, next);
 84 |         }
 85 | 
 86 |         return current;
 87 |     }
 88 | };
 89 | 
 90 | ///////////////////////////////////////////////////////////////////////////////
 91 | int benchmark(args_params_t const& args) {
 92 |     std::uint64_t size = args.size;  // Number of elements.
 93 |     std::uint64_t nt = args.nt;      // Number of steps.
 94 | 
 95 |     // Create the stepper object
 96 |     stepper step;
 97 | 
 98 |     // Measure execution time.
 99 |     Timer timer;
100 | 
101 |     auto solution = step.do_work(size, nt);
102 |     auto time = timer.stop();
103 | 
104 |     // Print the final solution
105 |     if (args.results) {
106 |         fmt::println("{::f}", solution);
107 |     }
108 | 
109 |     if (args.time) {
110 |         fmt::print("Duration: {:f} ms\n", time);
111 |     }
112 | 
113 |     return 0;
114 | }
115 | 
116 | int main(int argc, char* argv[]) {
117 |     // parse params
118 |     args_params_t args = argparse::parse<args_params_t>(argc, argv);
119 |     // see if help wanted
120 |     if (args.help) {
121 |         args.print();  // prints all variables
122 |         return 0;
123 |     }
124 | 
125 |     benchmark(args);
126 | 
127 |     return 0;
128 | }
129 | 


--------------------------------------------------------------------------------
/apps/heat-equation/heat-equation-stdpar.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * Simplified 2d heat equation example derived from amrex
 29 |  */
 30 | 
 31 | #include "heat-equation.hpp"
 32 | 
 33 | //
 34 | // simulation
 35 | //
 36 | int main(int argc, char* argv[]) {
 37 |     // parse params
 38 |     const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 39 | 
 40 |     // see if help wanted
 41 |     if (args.help) {
 42 |         args.print();  // prints all variables
 43 |         return 0;
 44 |     }
 45 | 
 46 |     // simulation variables
 47 |     int ncells = args.ncells;
 48 |     int nsteps = args.nsteps;
 49 |     Real_t dt = args.dt;
 50 |     Real_t alpha = args.alpha;
 51 |     // future if needed to split in multiple grids
 52 |     // int max_grid_size = args.max_grid_size;
 53 | 
 54 |     // initialize dx, dy, dz
 55 |     auto* dx = new Real_t[dims];
 56 |     for (int i = 0; i < dims; ++i)
 57 |         dx[i] = 1.0 / (ncells - 1);
 58 | 
 59 |     // simulation setup (2D)
 60 |     Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)];
 61 |     Real_t* grid_new = new Real_t[(ncells) * (ncells)];
 62 | 
 63 |     auto phi_old = std::mdspan<Real_t, view_2d, std::layout_right>(grid_old, ncells + nghosts, ncells + nghosts);
 64 |     auto phi_new = std::mdspan<Real_t, view_2d, std::layout_right>(grid_new, ncells, ncells);
 65 | 
 66 |     // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0]
 67 | 
 68 |     Timer timer;
 69 | 
 70 |     std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) {
 71 |         int i = 1 + (ind / ncells);
 72 |         int j = 1 + (ind % ncells);
 73 | 
 74 |         Real_t x = pos(i, ghost_cells, dx[0]);
 75 |         Real_t y = pos(j, ghost_cells, dx[1]);
 76 | 
 77 |         // L2 distance (r2 from origin)
 78 |         Real_t r2 = (x * x + y * y) / (0.01);
 79 | 
 80 |         // phi(x,y) = 1 + exp(-r^2)
 81 |         phi_old(i, j) = 1 + exp(-r2);
 82 |     });
 83 | 
 84 |     if (args.print_grid)
 85 |         // print the initial grid
 86 |         printGrid(grid_old, ncells + nghosts);
 87 | 
 88 |     // init simulation time
 89 |     Real_t time = 0.0;
 90 | 
 91 |     // evolve the system
 92 |     for (auto step = 0; step < nsteps; step++) {
 93 |         // fill boundary cells in old_phi
 94 |         fill2Dboundaries(grid_old, ncells + nghosts, ghost_cells);
 95 | 
 96 |         // update phi_new with stencil
 97 |         std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) {
 98 |             int i = 1 + (ind / ncells);
 99 |             int j = 1 + (ind % ncells);
100 | 
101 |             // Jacobi iteration
102 |             phi_new(i - 1, j - 1) =
103 |                 phi_old(i, j) + alpha * dt *
104 |                                     ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) +
105 |                                      (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1]));
106 |         });
107 | 
108 |         // update the simulation time
109 |         time += dt;
110 | 
111 |         // parallel copy phi_new to phi_old
112 |         std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) {
113 |             int i = 1 + (ind / ncells);
114 |             int j = 1 + (ind % ncells);
115 | 
116 |             // copy phi_new to phi_old
117 |             phi_old(i, j) = phi_new(i - 1, j - 1);
118 |         });
119 |     }
120 | 
121 |     auto elapsed = timer.stop();
122 | 
123 |     // print timing
124 |     if (args.print_time) {
125 |         fmt::print("Duration: {:f} ms\n", elapsed);
126 |     }
127 | 
128 |     if (args.print_grid)
129 |         // print the final grid
130 |         printGrid(grid_new, ncells);
131 | 
132 |     // delete all memory
133 |     delete[] grid_old;
134 |     delete[] grid_new;
135 | 
136 |     grid_old = nullptr;
137 |     grid_new = nullptr;
138 | 
139 |     return 0;
140 | }
141 | 


--------------------------------------------------------------------------------
/apps/1d-stencil/1d-stdpar.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 Weile Wei 
  5 |  * Copyright (c) 2023 The Regents of the University of California,
  6 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  7 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  8 |  *
  9 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 10 |  * of this software and associated documentation files (the "Software"), to deal
 11 |  * in the Software without restriction, including without limitation the rights
 12 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 13 |  * copies of the Software, and to permit persons to whom the Software is
 14 |  * furnished to do so, subject to the following conditions:
 15 |  *
 16 |  * The above copyright notice and this permission notice shall be included in
 17 |  * all copies or substantial portions of the Software.
 18 |  *
 19 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 20 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 21 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 22 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 23 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 24 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 25 |  * SOFTWARE.
 26 |  */
 27 | //
 28 | // This example provides a stdpar implementation for the 1D stencil code.
 29 | #include "argparse/argparse.hpp"
 30 | #include "commons.hpp"
 31 | 
 32 | // parameters
 33 | struct args_params_t : public argparse::Args {
 34 |     bool& results = kwarg("results", "print generated results (default: false)").set_default(false);
 35 |     std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
 36 |     std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
 37 |     bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);
 38 |     double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
 39 |     double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
 40 |     bool& help = flag("h, help", "print help");
 41 |     bool& time = kwarg("t, time", "print time").set_default(true);
 42 | };
 43 | 
 44 | using Real_t = double;
 45 | ///////////////////////////////////////////////////////////////////////////////
// Compile-time model constants. The stepper below uses these values: they are
// the defaults of heat() and what do_work() captures for its update lambda.
constexpr Real_t k = 0.5;  // heat transfer coefficient
constexpr Real_t dt = 1.;  // time step
constexpr Real_t dx = 1.;  // grid spacing
 50 | 
 51 | ///////////////////////////////////////////////////////////////////////////////
 52 | //[stepper_1
 53 | struct stepper {
 54 |     using view_1d = std::extents<int, std::dynamic_extent>;
 55 |     typedef std::mdspan<Real_t, view_1d, std::layout_right> space;
 56 | 
 57 |     // Our operator
 58 |     [[nodiscard]] Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k,
 59 |                               const Real_t dt = ::dt, const Real_t dx = ::dx) {
 60 |         return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right);
 61 |     }
 62 | 
 63 |     // do all the work on 'size' data points for 'nt' time steps
 64 |     [[nodiscard]] space do_work(const std::size_t size, const std::size_t nt) {
 65 |         Real_t* current_ptr = new Real_t[size];
 66 |         Real_t* next_ptr = new Real_t[size];
 67 | 
 68 |         auto current = space(current_ptr, size);
 69 |         auto next = space(next_ptr, size);
 70 | 
 71 |         // parallel init
 72 |         std::for_each_n(std::execution::par, counting_iterator(0), size,
 73 |                         [=](std::size_t i) { current[i] = (Real_t)i; });
 74 | 
 75 |         // Actual time step loop
 76 |         for (std::size_t t = 0; t != nt; ++t) {
 77 |             std::for_each_n(std::execution::par, counting_iterator(0), size, [=, k = k, dt = dt, dx = dx](int32_t i) {
 78 |                 std::size_t left = (i == 0) ? size - 1 : i - 1;
 79 |                 std::size_t right = (i == size - 1) ? 0 : i + 1;
 80 |                 next[i] = heat(current[left], current[i], current[right], k, dt, dx);
 81 |             });
 82 |             std::swap(current, next);
 83 |         }
 84 | 
 85 |         return current;
 86 |     }
 87 | };
 88 | 
 89 | ///////////////////////////////////////////////////////////////////////////////
 90 | int benchmark(args_params_t const& args) {
 91 |     std::uint64_t size = args.size;  // Number of elements.
 92 |     std::uint64_t nt = args.nt;      // Number of steps.
 93 | 
 94 |     // Create the stepper object
 95 |     stepper step;
 96 | 
 97 |     // Measure execution time.
 98 |     Timer timer;
 99 | 
100 |     // Execute nt time steps on nx grid points.
101 |     auto solution = step.do_work(size, nt);
102 |     auto time = timer.stop();
103 | 
104 |     // Print the final solution
105 |     if (args.results) {
106 |         fmt::println("{::f}", solution);
107 |     }
108 | 
109 |     if (args.time) {
110 |         fmt::print("Duration: {:f} ms\n", time);
111 |     }
112 | 
113 |     return 0;
114 | }
115 | 
116 | int main(int argc, char* argv[]) {
117 |     // parse params
118 |     args_params_t args = argparse::parse<args_params_t>(argc, argv);
119 |     // see if help wanted
120 |     if (args.help) {
121 |         args.print();  // prints all variables
122 |         return 0;
123 |     }
124 | 
125 |     benchmark(args);
126 | 
127 |     return 0;
128 | }
129 | 


--------------------------------------------------------------------------------
/apps/fft/fft-stdpar.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * commons for the fft codes
 29 |  */
 30 | 
 31 | #include "fft.hpp"
 32 | 
 33 | //
 34 | // fft algorithm
 35 | //
 36 | [[nodiscard]] std::vector<data_t> fft(const data_t* x, const int N, bool debug = false) {
 37 |     std::vector<data_t> x_rev(N);
 38 | 
 39 |     // create mdspans
 40 |     auto x_r = std::mdspan<data_t, view_1d, std::layout_right>(x_rev.data(), N);
 41 | 
 42 |     // compute shift factor
 43 |     int shift = 32 - ilog2(N);
 44 | 
 45 |     // twiddle bits for fft
 46 |     std::for_each_n(std::execution::par_unseq, counting_iterator(0), N, [=](auto k) {
 47 |         auto new_idx = reverse_bits32(k) >> shift;
 48 |         x_r(k) = x[new_idx];
 49 |     });
 50 | 
 51 |     // niterations
 52 |     int niters = ilog2(N);
 53 | 
 54 |     // local merge partition size
 55 |     int lN = 2;
 56 | 
 57 |     // set cout precision
 58 |     fmt::print("FFT progress: ");
 59 | 
 60 |     // iterate until niters - lN*=2 after each iteration
 61 |     for (int it = 0; it < niters; it++, lN *= 2) {
 62 |         // print progress
 63 |         fmt::print("{:.1f}%..", (100.0 * it) / niters);
 64 | 
 65 |         // debugging timer
 66 |         static Timer dtimer;
 67 | 
 68 |         // number of partitions
 69 |         int nparts = N / lN;
 70 |         int tpp = lN / 2;
 71 | 
 72 |         // display info only if debugging
 73 |         if (debug) {
 74 |             dtimer.start();
 75 |             fmt::print("lN = {}, npartitions = {}, partition size = {}\n", lN, nparts, tpp);
 76 |         }
 77 | 
 78 |         // parallel compute lN-pt FFT
 79 |         std::for_each_n(std::execution::par_unseq, counting_iterator(0), N / 2, [=](auto k) {
 80 |             // compute indices
 81 |             int e = (k / tpp) * lN + (k % tpp);
 82 |             auto o = e + tpp;
 83 |             auto i = (k % tpp);
 84 | 
 85 |             // compute 2-pt DFT
 86 |             auto tmp = x_r(e) + x_r(o) * WNk(N, i * nparts);
 87 |             x_r(o) = x_r(e) - x_r(o) * WNk(N, i * nparts);
 88 |             x_r(e) = tmp;
 89 |         });
 90 | 
 91 |         // print only if debugging
 92 |         if (debug)
 93 |             fmt::print("This iter time: {} ms\n", dtimer.stop());
 94 |     }
 95 | 
 96 |     // print final progress mark
 97 |     fmt::print("100%\n");
 98 | 
 99 |     // return x_rev = fft(x_r)
100 |     return x_rev;
101 | }
102 | 
103 | //
104 | // simulation
105 | //
106 | int main(int argc, char* argv[]) {
107 |     // parse params
108 |     const fft_params_t args = argparse::parse<fft_params_t>(argc, argv);
109 | 
110 |     // see if help wanted
111 |     if (args.help) {
112 |         args.print();  // prints all variables
113 |         return 0;
114 |     }
115 | 
116 |     // simulation variables
117 |     int N = args.N;
118 |     sig_type_t sig_type = sig_type_t::box;
119 |     //int freq = args.freq;
120 |     bool print_sig = args.print_sig;
121 |     bool print_time = args.print_time;
122 |     bool validate = args.validate;
123 | 
124 |     // x[n] signal
125 |     sig_t x_n(N, sig_type);
126 | 
127 |     if (!isPowOf2(N)) {
128 |         N = ceilPowOf2(N);
129 |         fmt::print("INFO: N is not a power of 2. Padding zeros => N = {}\n", N);
130 | 
131 |         x_n.resize(N);
132 |     }
133 | 
134 |     if (print_sig) {
135 |         fmt::print("\nx[n] = ");
136 |         x_n.printSignal();
137 |     }
138 | 
139 |     // start the timer here
140 |     Timer timer;
141 | 
142 |     // y[n] = fft(x[n])
143 |     sig_t y_n(std::move(fft(x_n.data(), N, args.debug)));
144 | 
145 |     // stop timer
146 |     auto elapsed = timer.stop();
147 | 
148 |     // print the fft(x)
149 |     if (print_sig) {
150 |         fmt::print("X(k) = ");
151 |         y_n.printSignal();
152 |     }
153 | 
154 |     // print the computation time
155 |     if (print_time) {
156 |         fmt::print("Elapsed Time: {} ms\n", elapsed);
157 |     }
158 | 
159 |     // validate the recursively computed fft
160 |     if (validate) {
161 |         if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler())) {
162 |             fmt::print("SUCCESS: y[n] == fft(x[n])\n");
163 |         } else {
164 |             fmt::print("FAILED: y[n] != fft(x[n])\n");
165 |         }
166 |     }
167 | 
168 |     return 0;
169 | }
170 | 


--------------------------------------------------------------------------------
/include/commons.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).  All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | #pragma once
 28 | 
 29 | #include <math.h>
 30 | #include <stdint.h>
 31 | #include <stdlib.h>
 32 | 
 33 | #include <algorithm>
 34 | #include <bit>
 35 | #include <cassert>
 36 | #include <chrono>
 37 | #include <complex>
 38 | #include <cstddef>
 39 | #include <cstdint>
 40 | #include <execution>
 41 | #include <functional>
 42 | #include <iostream>
 43 | #include <iterator>
 44 | #include <map>
 45 | #include <memory>
 46 | #include <numeric>
 47 | #include <random>
 48 | #include <span>
 49 | #include <thread>
 50 | #include <type_traits>
 51 | #include <typeinfo>
 52 | #include <vector>
 53 | 
 54 | #include <fmt/core.h>
 55 | #include <fmt/ranges.h>
 56 | 
 57 | #include <mdspan_fmt_formatter.hpp>
 58 | 
 59 | #include "counting_iterator.hpp"
 60 | 
 61 | template <typename T>
 62 | requires std::floating_point<T> struct fmt::formatter<std::complex<T>> {
 63 |     template <typename ParseContext>
 64 |     constexpr auto parse(ParseContext& ctx) {
 65 |         return ctx.begin();
 66 |     }
 67 | 
 68 |     template <typename FormatContext>
 69 |     auto format(const std::complex<T>& c, FormatContext& ctx) const {
 70 |         return format_to(ctx.out(), "({:.2f} + {:.2f}i)", c.real(), c.imag());
 71 |     }
 72 | };
 73 | 
 74 | // get mdpsan 2d indices from 1d index
 75 | #define dim2(x, ms)            \
 76 |     int ii = x / ms.extent(1); \
 77 |     int ij = x % ms.extent(1);
 78 | // get mdspan 3d indices from 1d index
 79 | #define dim3(x, ms)                              \
 80 |     int ii = x / (ms3.extent(1) * ms.extent(2)); \
 81 |     int ij = (x / ms.extent(2)) % ms.extent(1);  \
 82 |     int ik = x % ms.extent(2)
 83 | 
 84 | class Timer {
 85 |    public:
 86 |     Timer() { start(); }
 87 | 
 88 |     ~Timer() { stop(); }
 89 | 
 90 |     void start() { start_time_point = std::chrono::high_resolution_clock::now(); }
 91 | 
 92 |     double stop() {
 93 |         end_time_point = std::chrono::high_resolution_clock::now();
 94 |         return duration();
 95 |     }
 96 | 
 97 |     double duration() {
 98 |         auto start =
 99 |             std::chrono::time_point_cast<std::chrono::microseconds>(start_time_point).time_since_epoch().count();
100 |         auto end = std::chrono::time_point_cast<std::chrono::microseconds>(end_time_point).time_since_epoch().count();
101 |         auto duration = end - start;
102 |         double ms = duration * 1e-6;
103 |         return ms;
104 |     }
105 | 
106 |    private:
107 |     std::chrono::time_point<std::chrono::high_resolution_clock> start_time_point;
108 |     std::chrono::time_point<std::chrono::high_resolution_clock> end_time_point;
109 | };
110 | 
// Scheduler kinds selectable by name.
enum class sch_t { CPU, GPU, MULTIGPU };

// Maps a scheduler name to its sch_t value.
// "cpu" is always accepted; "gpu" and "multigpu" only when USE_GPU is defined.
// Throws std::invalid_argument for any other name.
// Declared inline because it is defined in a header.
[[nodiscard]] inline sch_t get_sch_enum(std::string_view str) {
    static const std::map<std::string_view, sch_t> schmap = {
        {"cpu", sch_t::CPU},
#if defined(USE_GPU)
        {"gpu", sch_t::GPU},
        {"multigpu", sch_t::MULTIGPU}
#endif  // USE_GPU
    };

    // single lookup instead of contains() followed by at()
    if (const auto it = schmap.find(str); it != schmap.end()) {
        return it->second;
    }

    throw std::invalid_argument("FATAL: " + std::string(str) +
                                " is not a stdexec scheduler.\n"
                                "Available schedulers: cpu"
#if defined(USE_GPU)
                                ", gpu, multigpu"
#endif
                                "\n"
                                "Exiting...\n");
}
135 | 
// True when x is non-zero and has no set bit in common with x - 1.
inline bool isPowOf2(long long int x) {
    if (x == 0) {
        return false;
    }
    return (x & (x - 1)) == 0;
}
139 | 
140 | inline int ceilPowOf2(unsigned int v) {
141 |     return static_cast<int>(std::bit_ceil(v));
142 | }
143 | 
144 | inline int ilog2(uint32_t x) {
145 |     return static_cast<int>(log2(x));
146 | }
147 | 
// Component-wise closeness test: true when both the real and the imaginary
// parts of a and b differ by strictly less than `error`.
template <typename T>
bool complex_compare(T a, T b, double error = 0.0101) {
    const bool real_close = fabs(a.real() - b.real()) < error;
    const bool imag_close = fabs(a.imag() - b.imag()) < error;
    return real_close && imag_close;
}
153 | 
// Reverses the bit order of a 32-bit word by swapping progressively larger
// groups: single bits, pairs, nibbles, bytes, and finally the two half-words.
// Declared inline, like the other helpers in this header, so that including
// the header from several translation units does not produce duplicate
// definitions.
inline uint32_t reverse_bits32(uint32_t x) {
    x = ((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1);
    x = ((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2);
    x = ((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4);
    x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8);
    return (x >> 16) | (x << 16);
}
161 | 
// integer type used for status variables
typedef int status_t;
164 | 


--------------------------------------------------------------------------------
/apps/heat-equation/heat-equation-serial.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * Simplified 2d heat equation example derived from amrex
 29 |  */
 30 | 
 31 | #include "heat-equation.hpp"
 32 | 
 33 | // fill boundary cells
 34 | template <typename T>
 35 | void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) {
 36 |     auto row_view = std::mdspan<T, view_2d, std::layout_right>(grid, len, len);
 37 | 
 38 |     for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) {
 39 |         row_view(0, j) = row_view(ghost_cells, j);
 40 |         row_view(row_view.extent(0) - ghost_cells, j) = row_view(row_view.extent(0) - ghost_cells - 1, j);
 41 |     }
 42 | 
 43 |     auto col_view = std::mdspan<Real_t, view_2d, std::layout_left>(grid, len, len);
 44 | 
 45 |     for (auto i = 1; i < col_view.extent(1) - 1; ++i) {
 46 |         col_view(0, i) = col_view(ghost_cells, i);
 47 |         col_view(col_view.extent(0) - 1, i) = col_view(col_view.extent(0) - ghost_cells - 1, i);
 48 |     }
 49 | }
 50 | 
 51 | //
 52 | // simulation
 53 | //
 54 | int main(int argc, char* argv[]) {
 55 |     // parse params
 56 |     const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 57 | 
 58 |     // see if help wanted
 59 |     if (args.help) {
 60 |         args.print();  // prints all variables
 61 |         return 0;
 62 |     }
 63 | 
 64 |     // simulation variables
 65 |     int ncells = args.ncells;
 66 |     int nsteps = args.nsteps;
 67 |     Real_t dt = args.dt;
 68 |     Real_t alpha = args.alpha;
 69 |     // future if needed to split in multiple grids
 70 |     // int max_grid_size = args.max_grid_size;
 71 | 
 72 |     // initialize dx, dy, dz
 73 |     auto* dx = new Real_t[dims];
 74 |     for (int i = 0; i < dims; ++i)
 75 |         dx[i] = 1.0 / (ncells - 1);
 76 | 
 77 |     // simulation setup (2D)
 78 |     Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)];
 79 |     Real_t* grid_new = new Real_t[(ncells) * (ncells)];
 80 | 
 81 |     auto phi_old = std::mdspan<Real_t, view_2d, std::layout_right>(grid_old, ncells + nghosts, ncells + nghosts);
 82 |     auto phi_new = std::mdspan<Real_t, view_2d, std::layout_right>(grid_new, ncells, ncells);
 83 | 
 84 |     Timer timer;
 85 | 
 86 |     // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0]
 87 |     for (int i = 1; i < phi_old.extent(0) - 1; ++i) {
 88 |         for (int j = 1; j < phi_old.extent(1) - 1; ++j) {
 89 |             Real_t x = pos(i, ghost_cells, dx[0]);
 90 |             Real_t y = pos(j, ghost_cells, dx[1]);
 91 | 
 92 |             // L2 distance (r2 from origin)
 93 |             Real_t r2 = (x * x + y * y) / (0.01);
 94 | 
 95 |             // phi(x,y) = 1 + exp(-r^2)
 96 |             phi_old(i, j) = 1 + exp(-r2);
 97 |         }
 98 |     }
 99 | 
100 |     if (args.print_grid)
101 |         // print the initial grid
102 |         printGrid(grid_old, ncells + nghosts);
103 | 
104 |     // init simulation time
105 |     Real_t time = 0.0;
106 | 
107 |     // evolve the system
108 |     for (auto step = 0; step < nsteps; step++) {
109 |         // fill boundary cells in old_phi
110 |         fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells);
111 | 
112 |         // update phi_new
113 |         for (auto i = 1; i < phi_old.extent(0) - 1; i++) {
114 |             for (auto j = 1; j < phi_old.extent(1) - 1; j++) {
115 |                 // Jacobi iteration
116 |                 phi_new(i - 1, j - 1) =
117 |                     phi_old(i, j) +
118 |                     alpha * dt *
119 |                         ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) +
120 |                          (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1]));
121 |             }
122 |         }
123 | 
124 |         // update the simulation time
125 |         time += dt;
126 | 
127 |         // parallel copy phi_new to phi_old
128 |         for (auto i = 1; i < phi_old.extent(0) - 1; i++)
129 |             for (auto j = 1; j < phi_old.extent(1) - 1; j++)
130 |                 // copy phi_new to phi_old
131 |                 phi_old(i, j) = phi_new(i - 1, j - 1);
132 |     }
133 | 
134 |     auto elapsed = timer.stop();
135 | 
136 |     // print timing
137 |     if (args.print_time) {
138 |         fmt::print("Duration: {:f} ms\n", elapsed);
139 |     }
140 | 
141 |     if (args.print_grid)
142 |         // print the final grid
143 |         printGrid(grid_new, ncells);
144 | 
145 |     // delete all memory
146 |     delete[] grid_old;
147 |     delete[] grid_new;
148 | 
149 |     grid_old = nullptr;
150 |     grid_new = nullptr;
151 | 
152 |     return 0;
153 | }
154 | 


--------------------------------------------------------------------------------
/apps/heat-equation/heat-equation-omp.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * Simplified 2d heat equation example derived from amrex
 29 |  */
 30 | 
 31 | #define HEQ_OMP
 32 | #include "heat-equation.hpp"
 33 | 
 34 | // fill boundary cells OpenMP
 35 | template <typename T>
 36 | void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, int ghost_cells = 1) {
 37 | #pragma omp parallel for num_threads(nthreads)
 38 |     for (int i = ghost_cells; i < len - ghost_cells; i++) {
 39 |         grid[i] = grid[i + (ghost_cells * len)];
 40 |         grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))];
 41 | 
 42 |         grid[i * len] = grid[(ghost_cells * len) + i];
 43 |         grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)];
 44 |     }
 45 | }
 46 | 
 47 | //
 48 | // simulation
 49 | //
 50 | int main(int argc, char* argv[]) {
 51 |     // parse params
 52 |     const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
 53 | 
 54 |     // see if help wanted
 55 |     if (args.help) {
 56 |         args.print();  // prints all variables
 57 |         return 0;
 58 |     }
 59 | 
 60 |     // simulation variables
 61 |     int ncells = args.ncells;
 62 |     int nsteps = args.nsteps;
 63 |     int nthreads = args.nthreads;
 64 |     Real_t dt = args.dt;
 65 |     Real_t alpha = args.alpha;
 66 |     // future if needed to split in multiple grids
 67 |     // int max_grid_size = args.max_grid_size;
 68 | 
 69 |     // initialize dx, dy, dz
 70 |     auto* dx = new Real_t[dims];
 71 |     for (int i = 0; i < dims; ++i)
 72 |         dx[i] = 1.0 / (ncells - 1);
 73 | 
 74 |     // simulation setup (2D)
 75 |     Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)];
 76 |     Real_t* grid_new = new Real_t[(ncells) * (ncells)];
 77 | 
 78 |     auto phi_old = std::mdspan<Real_t, view_2d, std::layout_right>(grid_old, ncells + nghosts, ncells + nghosts);
 79 |     auto phi_new = std::mdspan<Real_t, view_2d, std::layout_right>(grid_new, ncells, ncells);
 80 | 
 81 |     int gsize = ncells * ncells;
 82 | 
 83 |     Timer timer;
 84 | 
 85 |     // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0]
 86 | #pragma omp parallel for num_threads(nthreads)
 87 |     for (int pos = 0; pos < gsize; pos++) {
 88 |         int i = 1 + (pos / ncells);
 89 |         int j = 1 + (pos % ncells);
 90 | 
 91 |         Real_t x = pos(i, ghost_cells, dx[0]);
 92 |         Real_t y = pos(j, ghost_cells, dx[1]);
 93 | 
 94 |         // L2 distance (r2 from origin)
 95 |         Real_t r2 = (x * x + y * y) / (0.01);
 96 | 
 97 |         // phi(x,y) = 1 + exp(-r^2)
 98 |         phi_old(i, j) = 1 + exp(-r2);
 99 |     }
100 | 
101 |     if (args.print_grid)
102 |         // print the initial grid
103 |         printGrid(grid_old, ncells + nghosts);
104 | 
105 |     // init simulation time
106 |     Real_t time = 0.0;
107 | 
108 |     // evolve the system
109 |     for (auto step = 0; step < nsteps; step++) {
110 |         // fill boundary cells in old_phi
111 |         fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads);
112 | 
113 | #pragma omp parallel for num_threads(nthreads)
114 |         for (int pos = 0; pos < gsize; pos++) {
115 |             int i = 1 + (pos / ncells);
116 |             int j = 1 + (pos % ncells);
117 | 
118 |             // Jacobi iteration
119 |             phi_new(i - 1, j - 1) =
120 |                 phi_old(i, j) + alpha * dt *
121 |                                     ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) +
122 |                                      (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1]));
123 |         }
124 | 
125 |         // update the simulation time
126 |         time += dt;
127 | 
128 |         // parallel copy phi_new to phi_old
129 | #pragma omp parallel for num_threads(nthreads)
130 |         for (int pos = 0; pos < gsize; pos++) {
131 |             int i = 1 + (pos / ncells);
132 |             int j = 1 + (pos % ncells);
133 | 
134 |             // copy phi_new to phi_old
135 |             phi_old(i, j) = phi_new(i - 1, j - 1);
136 |         }
137 |     }
138 | 
139 |     auto elapsed = timer.stop();
140 | 
141 |     // print timing
142 |     if (args.print_time) {
143 |         fmt::print("Duration: {:f} ms\n", elapsed);
144 |     }
145 | 
146 |     if (args.print_grid)
147 |         // print the final grid
148 |         printGrid(grid_new, ncells);
149 | 
150 |     // delete all memory
151 |     delete[] grid_old;
152 |     delete[] grid_new;
153 | 
154 |     grid_old = nullptr;
155 |     grid_new = nullptr;
156 | 
157 |     return 0;
158 | }
159 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # ##############################################################################
  2 | # Add colors
  3 | # ##############################################################################
  4 | if(NOT WIN32)
  5 |   string(ASCII 27 Esc)
  6 |   set(ColourReset "${Esc}[m")
  7 |   set(ColourBold "${Esc}[1m")
  8 |   set(Red "${Esc}[31m")
  9 |   set(Green "${Esc}[32m")
 10 |   set(Yellow "${Esc}[33m")
 11 |   set(Blue "${Esc}[34m")
 12 |   set(Magenta "${Esc}[35m")
 13 |   set(Cyan "${Esc}[36m")
 14 |   set(White "${Esc}[37m")
 15 |   set(BoldRed "${Esc}[1;31m")
 16 |   set(BoldGreen "${Esc}[1;32m")
 17 |   set(BoldYellow "${Esc}[1;33m")
 18 |   set(BoldBlue "${Esc}[1;34m")
 19 |   set(BoldMagenta "${Esc}[1;35m")
 20 |   set(BoldCyan "${Esc}[1;36m")
 21 |   set(BoldWhite "${Esc}[1;37m")
 22 | endif()
 23 | 
 24 | # ##############################################################################
 25 | # CMake settings
 26 | # ##############################################################################
 27 | 
 28 | # cmake min required
 29 | cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR)
 30 | project(hpcpp LANGUAGES CXX CUDA)
 31 | 
 32 | # in source build warning
 33 | if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
 34 |   set(MSG "")
 35 |   message(
 36 |     STATUS "Warning! Building from the source directory is not recommended")
 37 |   message(
 38 |     STATUS "If unintented, please remove 'CMakeCache.txt' and 'CMakeFiles'")
 39 |   message(STATUS "and build from a separate directory")
 40 |   message(WARNING "In-source build")
 41 | endif()
 42 | 
 43 | # set cmake module path
 44 | set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/external/")
 45 | 
 46 | # Set a default build type if none was specified
 47 | set(HPCPP_BUILD_TYPE "RelWithDebInfo")
 48 | 
 49 | # set the build type
 50 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
 51 |   message(
 52 |     STATUS
 53 |       "${BoldCyan}Setting build type to '${HPCPP_BUILD_TYPE}' as none was specified.${ColourReset}"
 54 |   )
 55 |   set(CMAKE_BUILD_TYPE
 56 |       "${HPCPP_BUILD_TYPE}"
 57 |       CACHE STRING "Choose the type of build." FORCE)
 58 |   # Set the possible values of build type for cmake-gui
 59 |   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
 60 |                                                "MinSizeRel" "RelWithDebInfo")
 61 | endif()
 62 | 
 63 | # ##############################################################################
 64 | # GCC version check
 65 | # ##############################################################################
 66 | set(GCC_EXPECTED_VERSION 11.2)
 67 | 
 68 | if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS GCC_EXPECTED_VERSION)
 69 |   message(
 70 |     FATAL_ERROR
 71 |       "GCC: hpcpp requires GCC v${GCC_EXPECTED_VERSION} or higher to build but found v${CMAKE_CXX_COMPILER_VERSION}"
 72 |   )
 73 | endif()
 74 | 
 75 | # ##############################################################################
 76 | # CXX standard
 77 | # ##############################################################################
 78 | set(CXX_STANDARD_REQUIRED ON)
 79 | 
 80 | # required minimum CXX standard
 81 | set(CMAKE_CXX_STANDARD_REQUIRED 23)
 82 | set(CMAKE_GNU_EXTENSIONS ON)
 83 | 
 84 | if(NOT CXX_STANDARD OR (CXX_STANDARD LESS ${CMAKE_CXX_STANDARD_REQUIRED}))
 85 |   set(CXX_STANDARD ${CMAKE_CXX_STANDARD_REQUIRED})
 86 |   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++${CXX_STANDARD}")
 87 |   message(STATUS "Setting CXX_STANDARD to ${CMAKE_CXX_STANDARD_REQUIRED}")
 88 | endif()
 89 | 
 90 | # ##############################################################################
 91 | # Setup STDEXEC
 92 | # ##############################################################################
 93 | 
 94 | # NOTE: this manual CPM.cmake bootstrap is a hack; ideally the dependencies
 95 | # would be detected automatically from CMAKE_PREFIX_PATH instead.
 96 | set(CPM_DOWNLOAD_VERSION 0.35.6)
 97 | if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
 98 |   cmake_policy(SET CMP0135 NEW)
 99 | endif()
100 | if(CPM_SOURCE_CACHE) # script location: cache variable, then environment, then build tree
101 |   set(CPM_DOWNLOAD_LOCATION
102 |       "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
103 | elseif(DEFINED ENV{CPM_SOURCE_CACHE})
104 |   set(CPM_DOWNLOAD_LOCATION
105 |       "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
106 | else()
107 |   set(CPM_DOWNLOAD_LOCATION
108 |       "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
109 | endif()
110 | if(NOT EXISTS "${CPM_DOWNLOAD_LOCATION}") # quoted so paths containing spaces stay one argument
111 |   message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
112 |   file(DOWNLOAD
113 |        https://github.com/TheLartians/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
114 |        "${CPM_DOWNLOAD_LOCATION}" STATUS cpm_download_status)
115 |   list(GET cpm_download_status 0 cpm_download_code)
116 |   if(NOT cpm_download_code EQUAL 0) # failed: remove the partial file so the next configure retries
117 |     file(REMOVE "${CPM_DOWNLOAD_LOCATION}")
118 |     message(FATAL_ERROR "Could not download CPM.cmake: ${cpm_download_status}")
119 |   endif()
120 | endif()
121 | include("${CPM_DOWNLOAD_LOCATION}")
122 | 
123 | # stdexec from NVIDIA/stdexec: CUDA enabled, examples and tests switched off.
124 | cpmaddpackage(
125 |   NAME stdexec
126 |   GITHUB_REPOSITORY NVIDIA/stdexec
127 |   GIT_TAG main
128 |   OPTIONS "STDEXEC_ENABLE_CUDA ON"
129 |           "STDEXEC_BUILD_EXAMPLES OFF"
130 |           "STDEXEC_BUILD_TESTS OFF"
131 |           "STDEXEC_ENABLE_IO_URING_TESTS OFF"
132 |           "BUILD_TESTING OFF")
133 | 
134 | # Remaining libraries; each GIT_TAG names a ref in the corresponding repository.
135 | cpmaddpackage(NAME mdspan GIT_TAG stable GITHUB_REPOSITORY kokkos/mdspan)
136 | cpmaddpackage(NAME fmt GIT_TAG master GITHUB_REPOSITORY fmtlib/fmt)
137 | cpmaddpackage(NAME mdspan_formatter GIT_TAG main
138 |               GITHUB_REPOSITORY weilewei/mdspan_formatter)
139 | cpmaddpackage(NAME argparse GIT_TAG master
140 |               GITHUB_REPOSITORY mhaseeb123/argparse)
141 | 
142 | # Interface target that carries the external libraries as usage requirements,
143 | # so that linking hpcpp-core also links them.
144 | add_library(hpcpp-core INTERFACE)
145 | target_link_libraries(
146 |   hpcpp-core
147 |   INTERFACE mdspan
148 |             fmt
149 |             mdspan_formatter
150 |             argparse)
151 | 
152 | # Default targets for the -stdpar= and -mp= compiler options appended below.
153 | set(STDPAR_TYPE "gpu")
154 | set(OMP_TYPE "multicore")
155 | 
156 | # Fall back to the default stdpar target when none was chosen.
157 | if(NOT STDPAR)
158 |   set(STDPAR
159 |       "${STDPAR_TYPE}"
160 |       CACHE STRING "Choose the stdpar accelerator." FORCE)
161 |   # Offer the accepted stdpar values as choices in cmake-gui.
162 |   set_property(CACHE STDPAR PROPERTY STRINGS "gpu" "multicore")
163 |   message(
164 |     STATUS
165 |       "${BoldCyan}Setting -stdpar=${STDPAR_TYPE} as none was specified.${ColourReset}"
166 |   )
167 | endif()
168 | 
169 | # Fall back to the default OpenMP target when none was chosen.
170 | if(NOT OMP)
171 |   set(OMP
172 |       "${OMP_TYPE}"
173 |       CACHE STRING "Choose the OpenMP accelerator." FORCE)
174 |   # Offer the accepted OpenMP values as choices in cmake-gui.
175 |   set_property(CACHE OMP PROPERTY STRINGS "multicore" "gpu")
176 |   message(
177 |     STATUS
178 |       "${BoldCyan}Setting -mp=${OMP_TYPE} as none was specified.${ColourReset}")
179 | endif()
180 | 
181 | # Append both selections to the global C++ flags (needed for stdexec).
182 | string(APPEND CMAKE_CXX_FLAGS " -stdpar=${STDPAR} -mp=${OMP}")
183 | 
184 | # Define USE_GPU for -stdpar=gpu builds and explicitly undefine it otherwise.
185 | if(NOT STDPAR STREQUAL "gpu")
186 |   string(APPEND CMAKE_CXX_FLAGS " -UUSE_GPU")
187 | else()
188 |   string(APPEND CMAKE_CXX_FLAGS " -DUSE_GPU")
189 | endif()
190 | 
191 | # ##############################################################################
192 | # Add sub-directories
193 | # ##############################################################################
194 | 
195 | # ----------------------------------------------------------------------------------------#
196 | # apps
197 | # ----------------------------------------------------------------------------------------#
198 | 
199 | message(STATUS "Adding hpcpp apps...")
200 | add_subdirectory(apps) # processes apps/CMakeLists.txt
201 | 


--------------------------------------------------------------------------------
/apps/prefixSum/prefixSum-stdexec.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * stdexec implementation of the prefixSum benchmark
 29 |  */
 30 | 
 31 | #define PSUM_STDEXEC
 32 | #include "prefixSum.hpp"
 33 | #include "repeat_n/repeat_n.cuh"
 34 | 
 35 | //
 36 | // stdexec prefixSum: Blelloch up-sweep/down-sweep scan of `in` executed on `sch`.
 37 | // Returns a new[]-allocated array of N + 1 elements that the caller must
 38 | // delete[]: y[0..N-1] is the exclusive scan of `in` and y[N] the total sum.
 39 | // The stride arithmetic assumes N is a power of two (main() rounds N up).
 40 | //
 41 | template <typename T>
 42 | [[nodiscard]] T* prefixSum(scheduler auto&& sch, const T* in, const int N) {
 43 |     // allocate N+1 elements: the extra slot y[N] receives the total sum
 44 |     T* y = new T[N + 1];
 45 | 
 46 |     // number of iterations
 47 |     int niters = ilog2(N);
 48 | 
 49 |     // memcpy to output vector to start computation.
 50 |     ex::sync_wait(ex::schedule(sch) | ex::bulk(N, [=](int k) { y[k] = in[k]; }));
 51 | 
 52 |     // GE Blelloch (1990) algorithm from pseudocode at:
 53 |     // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
 54 | 
 55 |     // upsweep
 56 |     for (int d = 0; d < niters; d++) {
 57 |         int bsize = N / (1 << (d + 1));
 58 | 
 59 |         ex::sender auto uSweep = schedule(sch) | ex::bulk(bsize, [=](int k) {
 60 |                                      // stride1 = 2^(d+1)
 61 |                                      int st1 = 1 << (d + 1);
 62 |                                      // stride2 = 2^d
 63 |                                      int st2 = 1 << d;
 64 |                                      // only the threads at indices (k+1) * 2^(d+1) -1 will compute
 65 |                                      int myIdx = (k + 1) * st1 - 1;
 66 | 
 67 |                                      // update y[myIdx]
 68 |                                      y[myIdx] += y[myIdx - st2];
 69 |                                  });
 70 |         // wait for upsweep
 71 |         ex::sync_wait(uSweep);
 72 |     }
 73 | 
 74 |     // write sum to y[N] and reset vars
 75 |     ex::sync_wait(schedule(sch) | ex::then([=]() {
 76 |                       y[N] = y[N - 1];
 77 |                       y[N - 1] = 0;
 78 |                   }));
 79 | 
 80 |     // downsweep
 81 |     for (int d = niters - 1; d >= 0; d--) {
 82 |         int bsize = N / (1 << (d + 1));
 83 | 
 84 |         ex::sender auto dSweep = schedule(sch) | ex::bulk(bsize, [=](int k) {
 85 |                                      // stride1 = 2^(d+1)
 86 |                                      int st1 = 1 << (d + 1);
 87 |                                      // stride2 = 2^d
 88 |                                      int st2 = 1 << d;
 89 |                                      // only the threads at indices (k+1) * 2^(d+1) -1 will compute
 90 |                                      int myIdx = (k + 1) * st1 - 1;
 91 | 
 92 |                                      // update y[myIdx] and y[myIdx-stride2]
 93 |                                      auto tmp = y[myIdx];
 94 |                                      y[myIdx] += y[myIdx - st2];
 95 |                                      y[myIdx - st2] = tmp;
 96 |                                  });
 97 | 
 98 |         // wait for downsweep
 99 |         ex::sync_wait(dSweep);
100 |     }
101 | 
102 |     // return the computed results.
103 |     return y;
104 | }
105 | 
106 | //
107 | // simulation
108 | //
109 | int main(int argc, char* argv[]) {
110 |     // parse params
111 |     const prefixSum_params_t args = argparse::parse<prefixSum_params_t>(argc, argv);
112 | 
113 |     // see if help wanted
114 |     if (args.help) {
115 |         args.print();  // prints all variables
116 |         return 0;
117 |     }
118 | 
119 |     // simulation variables
120 |     int N = args.N;
121 |     bool print_arr = args.print_arr;
122 |     bool print_time = args.print_time;
123 |     bool validate = args.validate;
124 |     std::string sched = args.sch;
125 |     int nthreads = args.nthreads;
126 | 
127 |     // the scan expects a power-of-two length: round N up when it is not one
128 |     if (!isPowOf2(N)) {
129 |         N = ceilPowOf2(N);
130 |         fmt::print("INFO: N != pow(2). Setting => N = {}\n", N);
131 |     }
132 | 
133 |     // input data (released at the end of main)
134 |     data_t* in = new data_t[N];
135 | 
136 |     fmt::print("Progress:0%");
137 | 
138 |     // random number generator
139 |     psum::genRandomVector(in, N, (data_t)0, (data_t)10);
140 | 
141 |     fmt::print("..50%");
142 | 
143 |     // output pointer: receives the N + 1 element array allocated by prefixSum
144 |     data_t* out = nullptr;
145 | 
146 |     // start the timer
147 |     Timer timer;
148 | 
149 |     // initialize stdexec scheduler
150 |     sch_t scheduler = get_sch_enum(sched);
151 | 
152 |     // launch with appropriate stdexec scheduler
153 |     switch (scheduler) {
154 |         case sch_t::CPU:
155 |             out = prefixSum(exec::static_thread_pool(nthreads).get_scheduler(), in, N);
156 |             break;
157 | #if defined(USE_GPU)
158 |         case sch_t::GPU:
159 |             out = prefixSum(nvexec::stream_context().get_scheduler(), in, N);
160 |             break;
161 |         case sch_t::MULTIGPU:
162 |             out = prefixSum(nvexec::multi_gpu_stream_context().get_scheduler(), in, N);
163 |             break;
164 | #endif  // USE_GPU
165 |         default:
166 |             throw std::runtime_error("Run: `prefixSum-stdexec --help` to see the list of available schedulers");
167 |     }
168 | 
169 |     // stop timer
170 |     auto elapsed = timer.stop();
171 | 
172 |     fmt::print("..100%\n");
173 | 
174 |     // print the input and its prefix sum (don't if N > 100)
175 |     if (print_arr && N < 100) {
176 |         fmt::print("int = {}\n", fmt::join(in, in + N, " "));
177 |         fmt::print("out = {}\n", fmt::join(out + 1, out + 1 + N, " "));
178 |     }
179 | 
180 |     // print the elapsed time
181 |     if (print_time)
182 |         fmt::print("Elapsed Time: {:f} s\n", elapsed);
183 | 
184 |     // validate the prefixSum
185 |     if (validate) {
186 |         bool verify = psum::validatePrefixSum(in, out + 1, N);
187 | 
188 |         if (verify)
189 |             fmt::print("SUCCESS..");
190 |         else
191 |             fmt::print("FAILED..");
192 | 
193 |         fmt::print("\n");
194 |     }
195 |     delete[] in;  // release the input and the scan result (both allocated with new[])
196 |     delete[] out;
197 |     return 0;
198 | }
198 | 


--------------------------------------------------------------------------------
/apps/heat-equation/heat-equation-stdexec.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * Simplified 2d heat equation example derived from amrex
 29 |  */
 30 | 
 31 | #define HEQ_STDEXEC
 32 | #include "heat-equation.hpp"
 33 | #include "repeat_n/repeat_n.cuh"
 34 | 
 35 | // 2D Jacobi pipeline: initialise phi_old, then run nsteps of fill-boundary -> jacobi -> copy-back on scheduler `sch`
 36 | void heat_equation(scheduler auto sch, Real_t* phi_old, Real_t* phi_new, Real_t* dx, Real_t dt, Real_t alpha,
 37 |                    int nsteps, int ncells, bool print = false) {
 38 |     // init simulation time
 39 |     Real_t time = 0.0;
 40 |     auto phi_old_extent = ncells + nghosts;  // row stride used for every phi_old index below
 41 |     int gsize = ncells * ncells;
 42 | 
 43 |     // initialize dx on CPU
 44 |     for (int i = 0; i < dims; ++i)
 45 |         dx[i] = 1.0 / (ncells - 1);
 46 | 
 47 |     // print the progress prefix (completed by the "100%" print at the end)
 48 |     fmt::print("HEQ progress: ");
 49 | 
 50 |     ex::sender auto begin = schedule(sch);
 51 | 
 52 |     auto heat_eq_init = ex::bulk(begin, gsize, [=](int pos) {
 53 |         int i = 1 + (pos / ncells);
 54 |         int j = 1 + (pos % ncells);
 55 | 
 56 |         Real_t x = pos(i, ghost_cells, dx[0]);  // NOTE(review): `pos(...)` is presumably a function-like macro from heat-equation.hpp, since `pos` is also the int parameter - confirm
 57 |         Real_t y = pos(j, ghost_cells, dx[1]);
 58 | 
 59 |         // L2 distance (r2 from origin)
 60 |         Real_t r2 = (x * x + y * y) / (0.01);
 61 | 
 62 |         // phi(x,y) = 1 + exp(-r^2)
 63 |         phi_old[(i)*phi_old_extent + j] = 1 + exp(-r2);
 64 |     });
 65 | 
 66 |     ex::sync_wait(std::move(heat_eq_init));
 67 | 
 68 |     if (print)
 69 |         printGrid(phi_old, ncells + nghosts);
 70 | 
 71 |     auto fillBoundary = [=](int pos) {
 72 |         int i = pos + ghost_cells;
 73 |         int len = phi_old_extent;
 74 |         // fill boundary cells in old_phi
 75 |         phi_old[i] = phi_old[i + (ghost_cells * len)];
 76 |         phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))];
 77 |         phi_old[i * len] = phi_old[(ghost_cells * len) + i];
 78 |         phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)];
 79 |     };
 80 | 
 81 |     auto jacobi = [=](int pos) {
 82 |         int i = 1 + (pos / ncells);
 83 |         int j = 1 + (pos % ncells);
 84 | 
 85 |         // Jacobi iteration: phi_new is indexed without the ghost offset, phi_old with it
 86 |         phi_new[(i - 1) * ncells + j - 1] =
 87 |             phi_old[(i)*phi_old_extent + j] +
 88 |             alpha * dt *
 89 |                 ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] +
 90 |                   phi_old[(i - 1) * phi_old_extent + j]) /
 91 |                      (dx[0] * dx[0]) +
 92 |                  (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] +
 93 |                   phi_old[(i)*phi_old_extent + j - 1]) /
 94 |                      (dx[1] * dx[1]));
 95 |     };
 96 | 
 97 |     auto parallelCopy = [=](int pos) {
 98 |         int i = 1 + (pos / ncells);
 99 |         int j = 1 + (pos % ncells);
100 |         phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)];
101 |     };
102 | 
103 |     // evolve the system: without USE_GPU each step is its own sync_wait inside the for loop; with USE_GPU one sender repeats the pipeline nsteps times via repeat_n
104 | #if !defined(USE_GPU)
105 |     for (auto iter = 0; iter < nsteps; iter++)
106 | #endif
107 |         stdexec::sync_wait(
108 | #if defined(USE_GPU)
109 |             ex::just() | exec::on(sch, repeat_n(nsteps,
110 | #else
111 |   stdexec::schedule(sch) |
112 | #endif  // USE_GPU
113 |                                                 ex::bulk(phi_old_extent - nghosts, [=](int k) { fillBoundary(k); }) |
114 |                                                     ex::bulk(gsize, [=](int k) { jacobi(k); }) |
115 |                                                     ex::bulk(gsize, [=](int k) { parallelCopy(k); })
116 | #if defined(USE_GPU)
117 |                                                     ))
118 | #endif  // USE_GPU
119 |         );
120 | 
121 |     // update the simulation time (local only; the value is not read again in this function)
122 |     time += nsteps * dt;
123 | 
124 |     // print final progress mark
125 |     fmt::print("100% \n");
126 | 
127 |     return;
128 | }
129 | 
130 | //
131 | // simulation
132 | //
133 | int main(int argc, char* argv[]) {
134 |     // command-line options
135 |     const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
136 | 
137 |     // help requested: print the option values and exit
138 |     if (args.help) {
139 |         args.print();
140 |         return 0;
141 |     }
142 | 
143 |     // run configuration copied out of the parsed options
144 |     std::string sch_name = args.sch;
145 |     Real_t alpha = args.alpha;
146 |     Real_t dt = args.dt;
147 |     int nthreads = args.nthreads;
148 |     int nsteps = args.nsteps;
149 |     int ncells = args.ncells;
150 | 
151 |     // host storage: one spacing per dimension, the old grid sized with
152 |     // (ncells + nghosts) per side, and the ncells x ncells new grid
153 |     std::vector<Real_t> spacing(dims);
154 |     std::vector<Real_t> old_grid((ncells + nghosts) * (ncells + nghosts));
155 |     std::vector<Real_t> new_grid(ncells * ncells);
156 | 
157 |     // raw pointers handed to heat_equation
158 |     Real_t* phi_new = new_grid.data();
159 |     Real_t* phi_old = old_grid.data();
160 |     Real_t* dx = spacing.data();
161 | 
162 |     // map the scheduler name to its enum before the timer is created
163 |     sch_t sch_kind = get_sch_enum(sch_name);
164 | 
165 |     // the timer is created here and stopped right after the run
166 |     Timer timer;
167 | 
168 |     // dispatch on the requested stdexec scheduler
169 |     switch (sch_kind) {
170 |         case sch_t::CPU:
171 |             heat_equation(exec::static_thread_pool{nthreads}.get_scheduler(), phi_old, phi_new, dx, dt, alpha, nsteps,
172 |                           ncells, args.print_grid);
173 |             break;
174 | #if defined(USE_GPU)
175 |         case sch_t::GPU:
176 |             heat_equation(nvexec::stream_context().get_scheduler(), phi_old, phi_new, dx, dt, alpha, nsteps, ncells,
177 |                           args.print_grid);
178 |             break;
179 |         case sch_t::MULTIGPU:
180 |             heat_equation(nvexec::multi_gpu_stream_context().get_scheduler(), phi_old, phi_new, dx, dt, alpha, nsteps,
181 |                           ncells, args.print_grid);
182 |             break;
183 | #endif  // USE_GPU
184 |         default:
185 |             throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers");
186 |     }
187 | 
188 |     // stop the timer and report the duration when asked to
189 |     auto elapsed = timer.stop();
190 |     if (args.print_time) {
191 |         fmt::print("Duration: {:f} ms\n", elapsed);
192 |     }
193 | 
194 |     // show the final grid when requested
195 |     if (args.print_grid) {
196 |         printGrid(phi_new, ncells);
197 |     }
198 | 
199 |     return 0;
200 | }


--------------------------------------------------------------------------------
/apps/1d-stencil/1d-stdexec.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 Weile Wei
  5 |  * Copyright (c) 2023 The Regents of the University of California,
  6 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  7 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  8 |  *
  9 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 10 |  * of this software and associated documentation files (the "Software"), to deal
 11 |  * in the Software without restriction, including without limitation the rights
 12 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 13 |  * copies of the Software, and to permit persons to whom the Software is
 14 |  * furnished to do so, subject to the following conditions:
 15 |  *
 16 |  * The above copyright notice and this permission notice shall be included in
 17 |  * all copies or substantial portions of the Software.
 18 |  *
 19 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 20 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 21 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 22 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 23 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 24 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 25 |  * SOFTWARE.
 26 |  */
 27 | //
 28 | // This example provides a stdexec implementation for the 1D stencil code.
 29 | #include <exec/static_thread_pool.hpp>
 30 | #if defined(USE_GPU)
 31 | #include <nvexec/multi_gpu_context.cuh>
 32 | #include <nvexec/stream_context.cuh>
 33 | #endif
 34 | #include <stdexec/execution.hpp>
 35 | 
 36 | #include "argparse/argparse.hpp"
 37 | #include "commons.hpp"
 38 | #include "repeat_n/repeat_n.cuh"
 39 | 
 40 | // command-line parameters of the 1D stencil benchmark
 41 | struct args_params_t : public argparse::Args {
 42 |     bool& results = kwarg("results", "print generated results (default: false)").set_default(false);
 43 |     std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45);
 44 |     std::uint64_t& size = kwarg("size", "Number of elements").set_default(10);
 45 |     double& k = kwarg("k", "Heat transfer coefficient").set_default(0.5);  // real-valued like dt and dx; a bool cannot hold 0.5
 46 |     double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0);
 47 |     double& dx = kwarg("dx", "Local x dimension").set_default(1.0);
 48 |     bool& help = flag("h, help", "print help");
 49 |     bool& time = kwarg("t, time", "print time").set_default(true);
 50 |     std::string& sch = kwarg("sch",
 51 |                              "stdexec scheduler: [options: cpu"
 52 | #if defined(USE_GPU)
 53 |                              ", gpu, multigpu"
 54 | #endif  //USE_GPU
 55 |                              "]")
 56 |                            .set_default("cpu");
 57 | 
 58 |     int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency());
 59 | };
 60 | 
 61 | using Real_t = double;
 62 | ///////////////////////////////////////////////////////////////////////////////
 63 | // Compile-time constants: stepper::do_work reads these, not the parsed k/dt/dx arguments
 64 | constexpr Real_t k = 0.5;  // heat transfer coefficient
 65 | constexpr Real_t dt = 1.;  // time step
 66 | constexpr Real_t dx = 1.;  // grid spacing
 67 | 
 68 | ///////////////////////////////////////////////////////////////////////////////
 69 | //[stepper_1
 70 | struct stepper {
 71 |     // Advance `size` data points through `nt` time steps on scheduler `sch` and
 72 |     // return the final state; the initial state is current[i] = i.
 73 |     [[nodiscard]] std::vector<Real_t> do_work(const auto& sch, std::size_t size, std::size_t nt) {
 74 |         std::vector<Real_t> current(size);
 75 |         std::vector<Real_t> next(size);
 76 | 
 77 |         Real_t** next_ptr_ptr = new Real_t*(next.data());  // heap boxes: the swap done inside the pipeline must persist across steps
 78 |         Real_t** current_ptr_ptr = new Real_t*(current.data());
 79 | 
 80 |         stdexec::sender auto init = stdexec::bulk(stdexec::schedule(sch), size, [=](int i) {
 81 |             auto current_ptr = *current_ptr_ptr;
 82 |             current_ptr[i] = (Real_t)i;
 83 |         });
 84 |         stdexec::sync_wait(std::move(init));
 85 | #if !defined(USE_GPU)
 86 |         for (std::size_t iter = 0; iter < nt; iter++)
 87 | #endif
 88 |             // evolve the system
 89 |             stdexec::sync_wait(
 90 | #if defined(USE_GPU)
 91 |                 ex::just() |
 92 |                 exec::on(sch, repeat_n(nt,
 93 | #else
 94 |             stdexec::schedule(sch) |
 95 | #endif
 96 |                                        stdexec::bulk(size,
 97 |                                                      [=](int i) {
 98 |                                                          auto current_ptr = *current_ptr_ptr;
 99 |                                                          auto next_ptr = *next_ptr_ptr;
100 | 
101 |                                                          std::size_t left = (i == 0) ? size - 1 : i - 1;
102 |                                                          std::size_t right = (i == size - 1) ? 0 : i + 1;
103 |                                                          next_ptr[i] = current_ptr[i] +
104 |                                                                        (k * dt / (dx * dx)) *
105 |                                                                            (current_ptr[left] - 2 * current_ptr[i] +
106 |                                                                             current_ptr[right]);
107 |                                                      }) |
108 |                                            stdexec::then([=]() { std::swap(*next_ptr_ptr, *current_ptr_ptr); })
109 | #if defined(USE_GPU)
110 |                                            ))
111 | #endif  // USE_GPU
112 |             );
113 | 
114 |         delete next_ptr_ptr;  // every sync_wait above has returned, so the boxes are no longer referenced
115 |         delete current_ptr_ptr;
116 |         if (nt % 2 == 0) {  // an even number of swaps leaves the latest state in `current`
117 |             return current;
118 |         }
119 |         return next;
120 |     }
121 | };
122 | 
123 | ///////////////////////////////////////////////////////////////////////////////
124 | int benchmark(args_params_t const& args) {
125 |     // run configuration taken from the parsed options
126 |     const std::uint64_t n_points = args.size;
127 |     const std::uint64_t n_steps = args.nt;
128 |     std::string sch_name = args.sch;
129 |     const int nthreads = args.nthreads;  // only used by the cpu scheduler
130 | 
131 |     // object that performs the time stepping
132 |     stepper solver;
133 | 
134 |     // constructed before the run and stopped right after it
135 |     Timer timer;
136 | 
137 |     // final state produced by the selected scheduler
138 |     std::vector<Real_t> result;
139 |     try {
140 |         // a std::invalid_argument raised in this block is reported below
141 |         switch (get_sch_enum(sch_name)) {
142 |             case sch_t::CPU:
143 |                 result = solver.do_work(exec::static_thread_pool(nthreads).get_scheduler(), n_points, n_steps);
144 |                 break;
145 | #if defined(USE_GPU)
146 |             case sch_t::GPU:
147 |                 result = solver.do_work(nvexec::stream_context().get_scheduler(), n_points, n_steps);
148 |                 break;
149 |             case sch_t::MULTIGPU:
150 |                 result = solver.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), n_points, n_steps);
151 |                 break;
152 | #endif  // USE_GPU
153 |             default:
154 |                 std::cerr << "Unknown scheduler type encountered." << std::endl;
155 |                 break;
156 |         }
157 |     } catch (const std::invalid_argument& e) {
158 |         std::cerr << e.what() << std::endl;
159 |         exit(1);
160 |     }
161 | 
162 |     const auto elapsed = timer.stop();
163 | 
164 |     // optional output: the solution first, then the duration
165 |     if (args.results) {
166 |         fmt::println("{::f}", result);
167 |     }
168 | 
169 |     if (args.time) {
170 |         fmt::print("Duration: {:f} ms\n", elapsed);
171 |     }
172 | 
173 |     // the status is 0 on every path that returns
174 |     return 0;
175 | }
176 | 
177 | int main(int argc, char* argv[]) {
178 |     // command-line options (see args_params_t)
179 |     args_params_t cli = argparse::parse<args_params_t>(argc, argv);
180 | 
181 |     // run the benchmark unless help was requested; in that case only the
182 |     // option values are printed
183 |     if (!cli.help) {
184 |         benchmark(cli);
185 |     } else {
186 |         cli.print();  // prints all variables
187 |     }
188 |     return 0;
189 | }
190 | 


--------------------------------------------------------------------------------
/apps/fft/fft-stdexec.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * stdexec implementation of the fft benchmark
 29 |  */
 30 | 
 31 | #define FFT_STDEXEC
 32 | #include "fft.hpp"
 33 | #include "repeat_n/repeat_n.cuh"
 34 | 
 35 | // fft algorithm: bit-reversal permutation of x followed by ilog2(N) stages of
 36 | // 2-point DFTs, all run on scheduler `sch`. Returns the result; x is only read.
 37 | // `max_threads` and `debug` are currently unused in this body.
 38 | [[nodiscard]] std::vector<data_t> fft(const data_t* x, scheduler auto sch, const int N, const int max_threads,
 39 |                                       bool debug = false) {
 40 |     std::vector<data_t> x_rev(N);
 41 |     data_t* x_r = x_rev.data();
 42 | 
 43 |     // compute shift factor
 44 |     int shift = 32 - ilog2(N);
 45 | 
 46 |     // print the progress prefix
 47 |     fmt::print("FFT progress: ");
 48 | 
 49 |     // twiddle bits for fft
 50 |     ex::sender auto twiddle = ex::bulk(schedule(sch), N, [=](int k) {
 51 |         auto new_idx = reverse_bits32(k) >> shift;
 52 |         x_r[k] = x[new_idx];
 53 |     });
 54 |     ex::sync_wait(std::move(twiddle));
 55 | 
 56 |     // mark progress of the twiddle stage
 57 |     fmt::print("50%..");
 58 | 
 59 |     // niterations
 60 |     int niters = ilog2(N);
 61 | 
 62 |     // pointer to local partition size (must be dynamic mem to be copied to GPU)
 63 |     int* lN_ptr = new int(1);
 64 | 
 65 | #if !defined(USE_GPU)
 66 |     for (auto iter = 0; iter < niters; iter++)
 67 | #endif
 68 |         // evolve the system
 69 |         stdexec::sync_wait(
 70 | #if defined(USE_GPU)
 71 |             // iterate until niters - lN*=2 after each iteration
 72 |             ex::just() | exec::on(sch, repeat_n(niters,
 73 | #else
 74 |             stdexec::schedule(sch) |
 75 | #endif  // USE_GPU
 76 |                                                 ex::then([=]() { *lN_ptr *= 2; }) |
 77 |                                                     ex::bulk(N / 2,
 78 |                                                              [=](int k) {
 79 |                                                                  // extract lN from pointer
 80 |                                                                  int lN = *lN_ptr;
 81 | 
 82 |                                                                  // number of partitions
 83 |                                                                  int nparts = N / lN;
 84 |                                                                  int tpp = lN / 2;
 85 | 
 86 |                                                                  // compute indices
 87 |                                                                  int e = (k / tpp) * lN + (k % tpp);
 88 |                                                                  auto o = e + tpp;
 89 |                                                                  auto i = (k % tpp);
 90 | 
 91 |                                                                  // compute 2-pt DFT
 92 |                                                                  auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts);
 93 |                                                                  x_r[o] = x_r[e] - x_r[o] * WNk(N, i * nparts);
 94 |                                                                  x_r[e] = tmp;
 95 |                                                              })
 96 | #if defined(USE_GPU)
 97 |                                                     ))
 98 | #endif  // USE_GPU
 99 |         );
100 | 
101 |     // all stages have completed: release the partition size, then print the final progress mark
102 |     delete lN_ptr;
103 |     fmt::print("100%\n");
104 | 
105 |     // return x_rev = fft(x_r)
106 |     return x_rev;
107 | }
108 | 
109 | //
110 | // simulation
111 | //
112 | int main(int argc, char* argv[]) {
113 |     // read the command line into the parameter struct
114 |     const fft_params_t args = argparse::parse<fft_params_t>(argc, argv);
115 |
116 |     // help requested: dump every parameter and leave
117 |     if (args.help) {
118 |         args.print();
119 |         return 0;
120 |     }
121 |
122 |     // working copies of the run parameters
123 |     int N = args.N;
124 |     int max_threads = args.max_threads;
125 |     sig_type_t shape = sig_type_t::box;
126 |     std::string sched_name = args.sch;
127 |     const bool show_signal = args.print_sig;
128 |     const bool show_time = args.print_time;
129 |     const bool run_check = args.validate;
130 |
131 |     // input signal x[n]
132 |     sig_t x_n(N, shape);
133 |
134 |     // when N is not a power of two, replace it with ceilPowOf2(N) and resize the signal to match
135 |     if (!isPowOf2(N)) {
136 |         N = ceilPowOf2(N);
137 |         fmt::print("INFO: N is not a power of 2. Padding zeros => N = {}\n", N);
138 |
139 |         x_n.resize(N);
140 |     }
141 |
142 |     if (show_signal) {
143 |         fmt::print("\nx[n] = ");
144 |         x_n.printSignal();
145 |     }
146 |
147 |     // output buffer, overwritten by the result of fft() below
148 |     std::vector<data_t> y(N);
149 |
150 |     // the timed region covers scheduler selection, the transform and wrapping the result
151 |     Timer timer;
152 |
153 |     // translate the scheduler name into its enum value
154 |     const sch_t which = get_sch_enum(sched_name);
155 |
156 |     // run the transform on the selected scheduler
157 |     if (which == sch_t::CPU) {
158 |         y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug);
159 |     }
160 | #if defined(USE_GPU)
161 |     else if (which == sch_t::GPU) {
162 |         y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024 * 108, args.debug);
163 |     } else if (which == sch_t::MULTIGPU) {
164 |         y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4 * 1024 * 108, args.debug);
165 |     }
166 | #endif  // USE_GPU
167 |     else {
168 |         throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers");
169 |     }
170 |
171 |     // wrap the result as a signal object
172 |     sig_t y_n(y);
173 |
174 |     // stop timer
175 |     const auto elapsed = timer.stop();
176 |
177 |     // show the transformed signal when asked
178 |     if (show_signal) {
179 |         fmt::print("X(k) = ");
180 |         y_n.printSignal();
181 |     }
182 |
183 |     // report the measured time when asked
184 |     if (show_time) {
185 |         fmt::print("Elapsed Time: {:f} ms\n", elapsed);
186 |     }
187 |
188 |     // nothing more to do unless validation was requested
189 |     if (!run_check) {
190 |         return 0;
191 |     }
192 |
193 |     // check y_n against x_n through isFFT, again dispatching on the scheduler kind
194 |     bool verify = true;
195 |     if (which == sch_t::CPU) {
196 |         verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler());
197 |     }
198 | #if defined(USE_GPU)
199 |     else if (which == sch_t::GPU) {
200 |         verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler());
201 |     } else if (which == sch_t::MULTIGPU) {
202 |         // the multi-GPU run is validated on a single stream context
203 |         verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler());
204 |     }
205 | #endif  // USE_GPU
206 |     else {
207 |         throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers");
208 |     }
209 |
210 |     if (verify) {
211 |         fmt::print("SUCCESS: y[n] == fft(x[n])\n");
212 |     } else {
213 |         fmt::print("FAILED: y[n] != fft(x[n])\n");
214 |     }
215 |
216 |     return 0;
217 | }
218 | 


--------------------------------------------------------------------------------
/apps/choleskey/choleskey_stdpar_snd.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 Chuanqiu He
  5 |  * Copyright (c) 2023 Weile Wei
  6 |  * Copyright (c) 2023 The Regents of the University of California,
  7 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  8 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  9 |  *
 10 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
 11 |  * of this software and associated documentation files (the "Software"), to deal
 12 |  * in the Software without restriction, including without limitation the rights
 13 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 14 |  * copies of the Software, and to permit persons to whom the Software is
 15 |  * furnished to do so, subject to the following conditions:
 16 |  *
 17 |  * The above copyright notice and this permission notice shall be included in
 18 |  * all copies or substantial portions of the Software.
 19 |  *
 20 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 21 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 22 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 23 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 24 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 25 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 26 |  * SOFTWARE.
 27 |  */
 28 | //
 29 | // This example provides a stdexec (senders/receivers) implementation of the Cholesky decomposition.
 30 | #include <algorithm>
 31 | #include <iostream>
 32 | #include <numeric>
 33 | #include <stdexec/execution.hpp>
 34 | #include <vector>
 35 | #include "argparse/argparse.hpp"
 36 | #include "commons.hpp"
 37 | #include "exec/static_thread_pool.hpp"
 38 | 
 39 | #include "matrixutil.hpp"
 40 | 
 41 | using namespace std;
 42 | 
 43 | // Cholesky factorization whose inner summations run as stdexec bulk work on a static thread pool.
 44 | struct solver {
 45 |
 46 |     using view_2d = std::extents<int, std::dynamic_extent, std::dynamic_extent>;
 47 |
 48 |     // Returns the sum over k in [0, count) of row_a[k] * row_b[k].
 49 |     // The range is cut into `parts` contiguous pieces, one per bulk index; each piece is reduced
 50 |     // with std::transform_reduce and the per-piece results are combined with std::reduce.
 51 |     // Expects 1 <= parts <= count.
 52 |     template <typename T, typename Sender>
 53 |     static T partial_dot(Sender begin, const std::vector<T>& row_a, const std::vector<T>& row_b, int count,
 54 |                          int parts) {
 55 |         std::vector<T> sum_vec(parts);  // one partial result per piece
 56 |
 57 |         // raw pointers are captured by value so the inner lambda never copies the rows themselves
 58 |         const T* a = row_a.data();
 59 |         const T* b = row_b.data();
 60 |
 61 |         stdexec::sender auto work =
 62 |             stdexec::bulk(std::move(begin), parts,
 63 |                           [&sum_vec, a, b, count, parts](int piece) {
 64 |                               // piece p owns [p*count/parts, (p+1)*count/parts): the pieces tile
 65 |                               // [0, count) exactly, with no gap and no overrun on the last one
 66 |                               int start = piece * count / parts;
 67 |                               int stop = (piece + 1) * count / parts;
 68 |
 69 |                               // T(0) keeps the accumulator in T (a literal 0 would make it int)
 70 |                               sum_vec[piece] = std::transform_reduce(
 71 |                                   std::execution::par, counting_iterator(start), counting_iterator(stop), T(0),
 72 |                                   std::plus{}, [=](int k) { return a[k] * b[k]; });
 73 |                           }) |
 74 |             stdexec::then(
 75 |                 [&sum_vec]() { return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); });
 76 |
 77 |         auto [sum] = stdexec::sync_wait(std::move(work)).value();
 78 |         return sum;
 79 |     }
 80 |
 81 |     // Computes the lower-triangular factor of the n x n matrix stored row-major in `vec` and
 82 |     // returns it as n rows of n entries (entries above the diagonal stay 0).
 83 |     //   vec : matrix entries, read through an mdspan view; not modified here
 84 |     //   n   : matrix dimension
 85 |     //   np  : thread-pool size and upper bound on the number of pieces per summation
 86 |     template <typename T>
 87 |     std::vector<std::vector<T>> Cholesky_Decomposition(std::vector<T>& vec, int n, int np) {
 88 |
 89 |         // scheduler from a thread pool; every summation starts from this sender
 90 |         exec::static_thread_pool pool(np);
 91 |         stdexec::scheduler auto sch = pool.get_scheduler();
 92 |         stdexec::sender auto begin = stdexec::schedule(sch);
 93 |
 94 |         std::vector<std::vector<T>> lower(n, std::vector<T>(n, 0));
 95 |
 96 |         auto matrix_ms = std::mdspan<T, view_2d, std::layout_right>(vec.data(), n, n);
 97 |
 98 |         for (int i = 0; i < matrix_ms.extent(0); i++) {
 99 |             for (int j = 0; j <= i; j++) {
100 |                 // pieces for this entry: never more than the j terms being summed; np itself is
101 |                 // left untouched so later entries can still use all of it
102 |                 const int parts = std::max(1, std::min(np, j));
103 |
104 |                 // sum over k < j of L(i,k) * L(j,k); an empty sum when j == 0
105 |                 const T sum = (j == 0) ? T(0) : partial_dot(begin, lower[i], lower[j], j, parts);
106 |
107 |                 if (j == i) {
108 |                     // diagonal: L(j,j) = sqrt(A(j,j) - sum)
109 |                     lower[j][j] = std::sqrt(matrix_ms(i, j) - sum);
110 |                 } else {
111 |                     // below the diagonal: L(i,j) = (A(i,j) - sum) / L(j,j)
112 |                     lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j];
113 |                 }
114 |             }
115 |         }
116 |         return lower;
117 |     }
118 | };
141 | 
142 | ///////////////////////////////////////////////////////////////////////////////
143 | int benchmark(args_params_t const& args) {
144 |
145 |     // matrix dimension and requested number of parallel partitions
146 |     std::uint64_t nd = args.nd;
147 |     std::uint64_t np = args.np;
148 |
149 |     // input matrix, produced by generate_pascal_matrix
150 |     std::vector<int> inputMatrix = generate_pascal_matrix<int>(nd);
151 |
152 |     // object that provides Cholesky_Decomposition
153 |     solver solve;
154 |
155 |     // time only the decomposition call
156 |     Timer timer;
157 |     auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np);
158 |     auto time = timer.stop();
159 |
160 |     // optional dump: each output line holds row i of the result, a tab, then column i of the result
161 |     if (args.results) {
162 |         const auto put = [](int cell) { fmt::print("{:>6}\t", cell); };
163 |
164 |         fmt::print("{:>6} {:>30}\n", "Lower Triangular", "Transpose");
165 |         for (std::uint64_t row = 0; row < nd; ++row) {
166 |             for (std::uint64_t col = 0; col < nd; ++col)
167 |                 put(res_matrix[row][col]);
168 |             fmt::print("\t");
169 |
170 |             for (std::uint64_t col = 0; col < nd; ++col)
171 |                 put(res_matrix[col][row]);
172 |             fmt::print("\n");
173 |         }
174 |     }
175 |
176 |     // optional timing report
177 |     if (args.time) {
178 |         fmt::print("Duration: {:f} ms\n", time);
179 |     }
180 |
181 |     return 0;
182 | }
183 | 
184 | // Driver Code for testing
185 | int main(int argc, char* argv[]) {
186 |
187 |     // read the command-line options
188 |     args_params_t args = argparse::parse<args_params_t>(argc, argv);
189 |
190 |     // help requested: print every option; otherwise run the benchmark
191 |     if (!args.help) {
192 |         benchmark(args);
193 |     } else {
194 |         args.print();
195 |     }
196 |
197 |     return 0;
198 | }
199 | 


--------------------------------------------------------------------------------
/apps/heat-equation/heat-equation-cuda.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * Simplified 2d heat equation example derived from amrex
 29 |  */
 30 | 
 31 | #include <cuda_runtime.h>
 32 | 
 33 | #include "heat-equation.hpp"
 34 | 
 35 | using namespace std;
 36 | 
 37 | // grid spacing per dimension, kept in device constant memory (filled from h_dx in main)
 38 | __constant__ Real_t dx[2];
 39 | 
 40 | #define cudaErrorCheck(ans) check((ans), __FILE__, __LINE__)
 41 | 
 42 | // error checking function
 43 | template <typename T>
 44 | static inline void check(T result, const char* const file, const int line, bool is_fatal = true) {
 45 |     // success: nothing to report
 46 |     if (result == cudaSuccess)
 47 |         return;
 48 |
 49 |     // say where the failing call sits, then what the runtime calls the error
 50 |     std::cerr << "CUDA error at " << file << ":" << line << std::endl << cudaGetErrorString(result) << std::endl;
 51 |     if (is_fatal) exit(result);  // the error code doubles as the process exit status
 52 | }
 53 | 
 54 | //
 55 | // initialize grid kernel
 56 | //
 57 | template <typename T>
 58 | __global__ void initialize(T* phi, int ncells, int ghost_cells) {
 59 |     const int ghosts = nghosts;
 60 |     const int row_len = ncells + ghosts;  // row length of phi
 61 |     const int total = ncells * ncells;    // number of cells this kernel writes
 62 |
 63 |     // each thread starts at its global index and advances by the total number of launched threads
 64 |     int cell = blockIdx.x * blockDim.x + threadIdx.x;
 65 |     while (cell < total) {
 66 |         // 2D indices of this cell, offset by one in each direction
 67 |         const int i = 1 + (cell / ncells);
 68 |         const int j = 1 + (cell % ncells);
 69 |         // coordinates from pos(), then x^2 + y^2 divided by 0.01
 70 |         const Real_t x = pos(i, ghost_cells, dx[0]);
 71 |         const Real_t y = pos(j, ghost_cells, dx[1]);
 72 |         const Real_t r2 = (x * x + y * y) / (0.01);
 73 |
 74 |         // phi(x,y) = 1 + exp(-r^2)
 75 |         phi[i * row_len + j] = 1 + exp(-r2);
 76 |         cell += blockDim.x * gridDim.x;
 77 |     }
 78 | }
 78 | 
 79 | //
 80 | // fill boundary kernel
 81 | //
 82 | template <typename T>
 83 | __global__ void fillBoundary(T* phi_old, int ncells, int ghost_cells) {
 84 |     const int ghosts = nghosts;
 85 |     const int len = ncells + ghosts;  // row length of phi_old
 86 |     const int count = len - nghosts;  // positions handled along each edge
 87 |
 88 |     int t = blockIdx.x * blockDim.x + threadIdx.x;
 89 |     while (t < count) {
 90 |         const int i = t + ghost_cells;
 91 |
 92 |         // row 0 <- row ghost_cells; row (len - ghost_cells) <- the row just before it
 93 |         phi_old[i] = phi_old[i + (ghost_cells * len)];
 94 |         phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))];
 95 |
 96 |         // column 0 of row i <- element i of row ghost_cells
 97 |         // NOTE(review): the source is (row ghost_cells, column i), not (row i, column ghost_cells) - confirm intended
 98 |         phi_old[i * len] = phi_old[(ghost_cells * len) + i];
 99 |         // column (len - ghost_cells) of row i <- its left neighbour
100 |         phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)];
101 |         t += blockDim.x * gridDim.x;
102 |     }
103 | }
102 | 
103 | //
104 | // jacobi 2d stencil kernel
105 | //
106 | template <typename T>
107 | __global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t dt) {
108 |     const int ghosts = nghosts;
109 |     const int ext = ncells + ghosts;    // row length of phi_old
110 |     const int total = ncells * ncells;  // number of cells written to phi_new
111 |
112 |     int cell = blockIdx.x * blockDim.x + threadIdx.x;
113 |     while (cell < total) {
114 |         // (i, j) index phi_old; phi_new uses the same cell shifted back by one in each direction
115 |         const int i = 1 + (cell / ncells);
116 |         const int j = 1 + (cell % ncells);
117 |         const int c = i * ext + j;  // centre of the 5-point stencil in phi_old
118 |
119 |         // second differences along each axis, divided by the squared spacing
120 |         const auto d2_i = (phi_old[c + ext] - 2.0 * phi_old[c] + phi_old[c - ext]) / (dx[0] * dx[0]);
121 |         const auto d2_j = (phi_old[c + 1] - 2.0 * phi_old[c] + phi_old[c - 1]) / (dx[1] * dx[1]);
122 |
123 |         // Jacobi iteration: new = old + alpha * dt * (d2_i + d2_j)
124 |         phi_new[(i - 1) * ncells + j - 1] = phi_old[c] + alpha * dt * (d2_i + d2_j);
125 |
126 |         cell += blockDim.x * gridDim.x;
127 |     }
128 | }
131 | 
132 | //
133 | // parallelCopy kernel
134 | //
135 | template <typename T>
136 | __global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) {
137 |     const int ghosts = nghosts;
138 |     const int ext = ncells + ghosts;  // row length of phi_old
139 |     const int total = ncells * ncells;
140 |
141 |     // copy cell (r, c) of phi_new into position (r + 1, c + 1) of phi_old
142 |     for (int cell = blockIdx.x * blockDim.x + threadIdx.x; cell < total; cell += blockDim.x * gridDim.x) {
143 |         const int r = cell / ncells;
144 |         const int c = cell % ncells;
145 |         phi_old[(r + 1) * ext + (c + 1)] = phi_new[r * ncells + c];
146 |     }
147 | }
148 | 
149 | //
150 | // main simulation
151 | //
152 | int main(int argc, char* argv[]) {
153 |     // parse params
154 |     const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);
155 |
156 |     // see if help wanted
157 |     if (args.help) {
158 |         args.print();  // prints all variables
159 |         return 0;
160 |     }
161 |
162 |     // simulation variables
163 |     int ncells = args.ncells;
164 |     int nsteps = args.nsteps;
165 |     Real_t dt = args.dt;
166 |     Real_t alpha = args.alpha;
167 |
168 |     // dx = 1 / (ncells - 1) and the launch configuration below both need ncells >= 2
169 |     if (ncells < 2) {
170 |         std::cerr << "ERROR: ncells must be at least 2 (got " << ncells << ")" << std::endl;
171 |         return 1;
172 |     }
173 |
174 |     // grid spacing per dimension, mirrored into the device-side dx[] constant
175 |     Real_t h_dx[dims];
176 |     for (int i = 0; i < dims; ++i)
177 |         h_dx[i] = 1.0 / (ncells - 1);
178 |
179 |     cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims));
180 |
181 |     // interior cell count and padded (interior + ghost) cell count
182 |     int gsize = ncells * ncells;
183 |     int padded = (ncells + nghosts) * (ncells + nghosts);
184 |
185 |     // host memory for printing
186 |     Real_t* h_phi = nullptr;
187 |
188 |     // simulation setup (2D)
189 |     Real_t* phi_old = nullptr;
190 |     Real_t* phi_new = nullptr;
191 |
192 |     cudaErrorCheck(cudaMalloc(&phi_old, sizeof(Real_t) * padded));
193 |     cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * gsize));
194 |     // initialize() only writes interior cells; zero the rest so the first print reads defined values
195 |     cudaErrorCheck(cudaMemset(phi_old, 0, sizeof(Real_t) * padded));
196 |
197 |     // launch configuration: at most 1024 threads per block
198 |     int blockSize = std::min(1024, gsize);
199 |     int nBlocks = (gsize + blockSize - 1) / blockSize;
200 |     int fBblock = std::min(1024, ncells);
201 |     int fBnBlocks = (ncells + fBblock - 1) / fBblock;  // fillBoundary blocks
202 |
203 |     Timer timer;
204 |
205 |     // initialize grid
206 |     initialize<<<nBlocks, blockSize>>>(phi_old, ncells, ghost_cells);
207 |     cudaErrorCheck(cudaGetLastError());  // surface a failed launch right where it was issued
208 |     cudaErrorCheck(cudaDeviceSynchronize());
209 |
210 |     // print initial grid if needed
211 |     if (args.print_grid) {
212 |         // copy initial grid to host
213 |         h_phi = new Real_t[padded];
214 |         cudaErrorCheck(cudaMemcpy(h_phi, phi_old, sizeof(Real_t) * padded, cudaMemcpyDeviceToHost));
215 |
216 |         printGrid(h_phi, ncells + nghosts);
217 |     }
218 |
219 |     // evolve the system
220 |     for (auto step = 0; step < nsteps; step++) {
221 |         // ghost fill -> stencil update -> copy back; each launch is checked as it is issued
222 |         fillBoundary<<<fBnBlocks, fBblock>>>(phi_old, ncells, ghost_cells);
223 |         cudaErrorCheck(cudaGetLastError());
224 |         jacobi<<<nBlocks, blockSize>>>(phi_old, phi_new, ncells, alpha, dt);
225 |         cudaErrorCheck(cudaGetLastError());
226 |         parallelCopy<<<nBlocks, blockSize>>>(phi_old, phi_new, ncells);
227 |         cudaErrorCheck(cudaGetLastError());
228 |
229 |         cudaErrorCheck(cudaDeviceSynchronize());
230 |     }
231 |
232 |     auto elapsed = timer.stop();
233 |
234 |     // print timing
235 |     if (args.print_time) {
236 |         fmt::print("Duration: {:f} ms\n", elapsed);
237 |     }
238 |
239 |     // print final grid if needed
240 |     if (args.print_grid) {
241 |         cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, cudaMemcpyDeviceToHost));
242 |         printGrid(h_phi, ncells);
243 |
244 |         // free host memory
245 |         delete[] h_phi;
246 |         h_phi = nullptr;
247 |     }
248 |
249 |     // free device memory
250 |     cudaErrorCheck(cudaFree(phi_old));
251 |     cudaErrorCheck(cudaFree(phi_new));
252 |
253 |     return 0;
254 | }
255 | 


--------------------------------------------------------------------------------
/.cmake-format.py:
--------------------------------------------------------------------------------
  1 | # ----------------------------------
  2 | # Options affecting listfile parsing
  3 | # ----------------------------------
  4 | with section("parse"):
  5 | 
  6 |   # Specify structure for custom cmake functions
  7 |   additional_commands = { 'foo': { 'flags': ['BAR', 'BAZ'],
  8 |              'kwargs': {'DEPENDS': '*', 'HEADERS': '*', 'SOURCES': '*'}}}
  9 | 
 10 |   # Override configurations per-command where available
 11 |   override_spec = {}
 12 | 
 13 |   # Specify variable tags.
 14 |   vartags = []
 15 | 
 16 |   # Specify property tags.
 17 |   proptags = []
 18 | 
 19 | # -----------------------------
 20 | # Options affecting formatting.
 21 | # -----------------------------
 22 | with section("format"):
 23 | 
 24 |   # Disable formatting entirely, making cmake-format a no-op
 25 |   disable = False
 26 | 
 27 |   # How wide to allow formatted cmake files
 28 |   line_width = 80
 29 | 
 30 |   # How many spaces to tab for indent
 31 |   tab_size = 2
 32 | 
 33 |   # If true, lines are indented using tab characters (utf-8 0x09) instead of
 34 |   # <tab_size> space characters (utf-8 0x20). In cases where the layout would
 35 |   # require a fractional tab character, the behavior of the  fractional
 36 |   # indentation is governed by <fractional_tab_policy>
 37 |   use_tabchars = False
 38 | 
 39 |   # If <use_tabchars> is True, then the value of this variable indicates how
 40 |   # fractional indentions are handled during whitespace replacement. If set to
 41 |   # 'use-space', fractional indentation is left as spaces (utf-8 0x20). If set
 42 |   # to `round-up` fractional indentation is replaced with a single tab character
 43 |   # (utf-8 0x09) effectively shifting the column to the next tabstop
 44 |   fractional_tab_policy = 'use-space'
 45 | 
 46 |   # If an argument group contains more than this many sub-groups (parg or kwarg
 47 |   # groups) then force it to a vertical layout.
 48 |   max_subgroups_hwrap = 2
 49 | 
 50 |   # If a positional argument group contains more than this many arguments, then
 51 |   # force it to a vertical layout.
 52 |   max_pargs_hwrap = 6
 53 | 
 54 |   # If a cmdline positional group consumes more than this many lines without
 55 |   # nesting, then invalidate the layout (and nest)
 56 |   max_rows_cmdline = 2
 57 | 
 58 |   # If true, separate flow control names from their parentheses with a space
 59 |   separate_ctrl_name_with_space = False
 60 | 
 61 |   # If true, separate function names from parentheses with a space
 62 |   separate_fn_name_with_space = False
 63 | 
 64 |   # If a statement is wrapped to more than one line, then dangle the closing
 65 |   # parenthesis on its own line.
 66 |   dangle_parens = False
 67 | 
 68 |   # If the trailing parenthesis must be 'dangled' on its own line, then align it
 69 |   # to this reference: `prefix`: the start of the statement,  `prefix-indent`:
 70 |   # the start of the statement, plus one indentation  level, `child`: align to
 71 |   # the column of the arguments
 72 |   dangle_align = 'prefix'
 73 | 
 74 |   # If the statement spelling length (including space and parenthesis) is
 75 |   # smaller than this amount, then force reject nested layouts.
 76 |   min_prefix_chars = 4
 77 | 
 78 |   # If the statement spelling length (including space and parenthesis) is larger
 79 |   # than the tab width by more than this amount, then force reject un-nested
 80 |   # layouts.
 81 |   max_prefix_chars = 10
 82 | 
 83 |   # If a candidate layout is wrapped horizontally but it exceeds this many
 84 |   # lines, then reject the layout.
 85 |   max_lines_hwrap = 2
 86 | 
 87 |   # What style line endings to use in the output.
 88 |   line_ending = 'unix'
 89 | 
 90 |   # Format command names consistently as 'lower' or 'upper' case
 91 |   command_case = 'canonical'
 92 | 
 93 |   # Format keywords consistently as 'lower' or 'upper' case
 94 |   keyword_case = 'unchanged'
 95 | 
 96 |   # A list of command names which should always be wrapped
 97 |   always_wrap = []
 98 | 
 99 |   # If true, the argument lists which are known to be sortable will be sorted
100 |   # lexicographically
101 |   enable_sort = True
102 | 
103 |   # If true, the parsers may infer whether or not an argument list is sortable
104 |   # (without annotation).
105 |   autosort = False
106 | 
107 |   # By default, if cmake-format cannot successfully fit everything into the
108 |   # desired linewidth it will apply the last, most aggressive attempt that it
109 |   # made. If this flag is True, however, cmake-format will print error, exit
110 |   # with non-zero status code, and write-out nothing
111 |   require_valid_layout = False
112 | 
113 |   # A dictionary mapping layout nodes to a list of wrap decisions. See the
114 |   # documentation for more information.
115 |   layout_passes = {}
116 | 
117 | # ------------------------------------------------
118 | # Options affecting comment reflow and formatting.
119 | # ------------------------------------------------
120 | with section("markup"):
121 | 
122 |   # What character to use for bulleted lists
123 |   bullet_char = '*'
124 | 
125 |   # What character to use as punctuation after numerals in an enumerated list
126 |   enum_char = '.'
127 | 
128 |   # If comment markup is enabled, don't reflow the first comment block in each
129 |   # listfile. Use this to preserve formatting of your copyright/license
130 |   # statements.
131 |   first_comment_is_literal = False
132 | 
133 |   # If comment markup is enabled, don't reflow any comment block which matches
134 |   # this (regex) pattern. Default is `None` (disabled).
135 |   literal_comment_pattern = None
136 | 
137 |   # Regular expression to match preformat fences in comments default=
138 |   # ``r'^\s*([`~]{3}[`~]*)(.*)$'``
139 |   fence_pattern = '^\\s*([`~]{3}[`~]*)(.*)$'
140 | 
141 |   # Regular expression to match rulers in comments default=
142 |   # ``r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'``
143 |   ruler_pattern = '^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$'
144 | 
145 |   # If a comment line starts with this pattern then it is explicitly a
146 |   # trailing comment for the preceding argument. Default is '#<'
147 |   explicit_trailing_pattern = '#<'
148 | 
149 |   # If a comment line starts with at least this many consecutive hash
150 |   # characters, then don't lstrip() them off. This allows for lazy hash rulers
151 |   # where the first hash char is not separated by space
152 |   hashruler_min_length = 10
153 | 
154 |   # If true, then insert a space between the first hash char and remaining hash
155 |   # chars in a hash ruler, and normalize its length to fill the column
156 |   canonicalize_hashrulers = True
157 | 
158 |   # enable comment markup parsing and reflow
159 |   enable_markup = True
160 | 
161 | # ----------------------------
162 | # Options affecting the linter
163 | # ----------------------------
164 | with section("lint"):
165 | 
166 |   # a list of lint codes to disable
167 |   disabled_codes = []
168 | 
169 |   # regular expression pattern describing valid function names
170 |   function_pattern = '[0-9a-z_]+'
171 | 
172 |   # regular expression pattern describing valid macro names
173 |   macro_pattern = '[0-9A-Z_]+'
174 | 
175 |   # regular expression pattern describing valid names for variables with global
176 |   # (cache) scope
177 |   global_var_pattern = '[A-Z][0-9A-Z_]+'
178 | 
179 |   # regular expression pattern describing valid names for variables with global
180 |   # scope (but internal semantic)
181 |   internal_var_pattern = '_[A-Z][0-9A-Z_]+'
182 | 
183 |   # regular expression pattern describing valid names for variables with local
184 |   # scope
185 |   local_var_pattern = '[a-z][a-z0-9_]+'
186 | 
187 |   # regular expression pattern describing valid names for private directory
188 |   # variables
189 |   private_var_pattern = '_[0-9a-z_]+'
190 | 
191 |   # regular expression pattern describing valid names for public directory
192 |   # variables
193 |   public_var_pattern = '[A-Z][0-9A-Z_]+'
194 | 
195 |   # regular expression pattern describing valid names for function/macro
196 |   # arguments and loop variables.
197 |   argument_var_pattern = '[a-z][a-z0-9_]+'
198 | 
199 |   # regular expression pattern describing valid names for keywords used in
200 |   # functions or macros
201 |   keyword_pattern = '[A-Z][0-9A-Z_]+'
202 | 
203 |   # In the heuristic for C0201, how many conditionals to match within a loop in
204 |   # before considering the loop a parser.
205 |   max_conditionals_custom_parser = 2
206 | 
207 |   # Require at least this many newlines between statements
208 |   min_statement_spacing = 1
209 | 
210 |   # Require no more than this many newlines between statements
211 |   max_statement_spacing = 2
212 |   max_returns = 6
213 |   max_branches = 12
214 |   max_arguments = 5
215 |   max_localvars = 15
216 |   max_statements = 50
217 | 
218 | # -------------------------------
219 | # Options affecting file encoding
220 | # -------------------------------
221 | with section("encode"):
222 | 
223 |   # If true, emit the Unicode byte-order mark (BOM) at the start of the file
224 |   emit_byteorder_mark = False
225 | 
226 |   # Specify the encoding of the input file. Defaults to utf-8
227 |   input_encoding = 'utf-8'
228 | 
229 |   # Specify the encoding of the output file. Defaults to utf-8. Note that CMake
230 |   # only claims to support utf-8, so be careful when using anything else
231 |   output_encoding = 'utf-8'
232 | 
233 | # ------------------------------------
234 | # Miscellaneous configuration options.
235 | # ------------------------------------
236 | with section("misc"):
237 | 
238 |   # A dictionary containing any per-command configuration overrides. Currently
239 |   # only `command_case` is supported.
240 |   per_command = {}
241 | 
242 | 


--------------------------------------------------------------------------------
/apps/fft/fft.hpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * MIT License
  3 |  *
  4 |  * Copyright (c) 2023 The Regents of the University of California,
  5 |  * through Lawrence Berkeley National Laboratory (subject to receipt of any
  6 |  * required approvals from the U.S. Dept. of Energy).All rights reserved.
  7 |  *
  8 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  9 |  * of this software and associated documentation files (the "Software"), to deal
 10 |  * in the Software without restriction, including without limitation the rights
 11 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 12 |  * copies of the Software, and to permit persons to whom the Software is
 13 |  * furnished to do so, subject to the following conditions:
 14 |  *
 15 |  * The above copyright notice and this permission notice shall be included in
 16 |  * all copies or substantial portions of the Software.
 17 |  *
 18 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 19 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 20 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 21 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 22 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 23 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 24 |  * SOFTWARE.
 25 |  */
 26 | 
 27 | /*
 28 |  * commons for the fft codes
 29 |  */
 30 | 
 31 | #pragma once
 32 | 
 33 | #include <exec/static_thread_pool.hpp>
 34 | #include <stdexec/execution.hpp>
 35 | 
 36 | #if defined(USE_GPU)
 37 | #include <nvexec/multi_gpu_context.cuh>
 38 | #include <nvexec/stream_context.cuh>
 39 | using namespace nvexec;
 40 | #endif  //USE_GPU
 41 | 
 42 | #include <experimental/linalg>
 43 | #include "argparse/argparse.hpp"
 44 | 
 45 | #include "commons.hpp"
 46 | 
 47 | using namespace std;
 48 | using namespace stdexec;
 49 | using namespace std::complex_literals;
 50 | using stdexec::sync_wait;
 51 | 
 52 | namespace ex = stdexec;
 53 | 
 54 | // mdspan extents types (int index type, every dimension sized at run time); these describe shapes, not views
 55 | using view_2d = std::extents<int, std::dynamic_extent, std::dynamic_extent>;
 56 | using view_1d = std::extents<int, std::dynamic_extent>;
 57 | 
 58 | // scalar type and sample type: every signal sample is a double-precision complex number
 59 | using Real_t = double;
 60 | using data_t = std::complex<Real_t>;
 61 | 
 62 | // waveforms that signal::signalGenerator can produce; sigmap holds the matching name strings
 63 | enum sig_type { square, sinusoid, sawtooth, triangle, sinc, box };
 64 | 
 65 | using sig_type_t = sig_type;  // alias of sig_type; sigmap and getSignal are written in terms of it
 66 | 
 67 | // signal name -> sig_type_t, one entry per enumerator (inline: this global is defined in a header)
 68 | inline std::map<std::string, sig_type_t> sigmap{{"square", sig_type_t::square},     {"sinusoid", sig_type_t::sinusoid},
 69 |                                                 {"sawtooth", sig_type_t::sawtooth}, {"triangle", sig_type_t::triangle},
 70 |                                                 {"sinc", sig_type_t::sinc},         {"box", sig_type_t::box}};
 71 | 
 72 | // Look up the sig_type_t registered in sigmap for the name `sig`; unknown names yield (sig_type_t)(-1).
 73 | inline sig_type_t getSignal(const std::string& sig) {
 74 |     // one lookup instead of contains() + operator[]; find() also never inserts into sigmap
 75 |     if (auto it = sigmap.find(sig); it != sigmap.end()) {
 76 |         return it->second;
 77 |     }
 78 |     return static_cast<sig_type_t>(-1);  // not an enumerator: a switch over sig_type reaches its default branch
 79 | }
 80 | 
 81 | // command-line arguments of the fft codes; each member is a reference handed out by argparse::Args
 82 | struct fft_params_t : public argparse::Args {
 83 |     // signal name kept as a string (magic_enum does not support NVC++); valid names are the keys of sigmap
 84 |     std::string& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, sinc, box").set_default("box");
 85 | 
 86 |     int& freq = kwarg("f,freq", "Signal frequency").set_default(1024);
 87 |     int& N = kwarg("N", "N-point FFT").set_default(1024);
 88 |     bool& print_sig = flag("p,print", "print x[n] and X(k)");
 89 |     int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency());
 90 | 
 91 | #if defined(FFT_STDEXEC)  // the scheduler option exists only when FFT_STDEXEC is defined
 92 |     std::string& sch = kwarg("sch",
 93 |                              "stdexec scheduler: [options: cpu"
 94 | #if defined(USE_GPU)
 95 |                              ", gpu, multigpu"
 96 | #endif  //USE_GPU
 97 |                              "]")
 98 |                            .set_default("cpu");
 99 | #endif  // FFT_STDEXEC
100 | 
101 |     bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]");
102 |     bool& help = flag("h, help", "print help");
103 |     bool& print_time = flag("t,time", "print fft time");
104 |     bool& debug = flag("d,debug", "print internal timers and launch configs");
105 | };
106 | 
107 | inline std::complex<Real_t> WNk(int N, int k) {  // twiddle factor W_N^k = exp(-2*pi*i*k/N)
108 |     return std::exp(Real_t(-2 * M_PI * 1 / N * k) * 1i);  // evaluated left to right in double, so / N never truncates
109 | }
110 | 
111 | class signal {
112 |    public:
113 |     signal() = default;
114 | 
115 |     // N zero-valued samples with capacity reserved up to ceilPowOf2(N); exits if N <= 0.
116 |     signal(int N) {
117 |         if (N <= 0) {
118 |             std::cerr << "ERROR: N must be > 0. exiting.." << std::endl;
119 |             exit(1);
120 |         }
121 |         y.reserve(ceilPowOf2(N));
122 |         y.resize(N);
123 |     }
124 | 
125 |     // Copy constructor; takes a const reference so const signals and temporaries can be copied.
126 |     signal(const signal& rhs) : y(rhs.y) {}
127 | 
128 |     // Take over the storage of a temporary vector.
129 |     signal(std::vector<data_t>&& in) : y(std::move(in)) {}
130 | 
131 |     // NOTE: this overload moves from `in` as well, leaving the caller's vector moved-from.
132 |     signal(std::vector<data_t>& in) : y(std::move(in)) {}
133 | 
134 |     // N samples of waveform `type`, generated on `threads` pool threads; exits if N <= 0.
135 |     signal(int N, sig_type type, int threads = std::thread::hardware_concurrency()) : signal(N) {
136 |         signalGenerator(type, threads);
137 |     }
138 | 
139 |     // Fill all len() samples with one period of `type`, computed in parallel on a static
140 |     // thread pool of `threads` workers. An unknown `type` prints an error and calls exit(1).
141 |     void signalGenerator(sig_type type = sig_type::box, int threads = std::thread::hardware_concurrency()) {
142 |         int N = y.size();
143 | 
144 |         // every branch runs as bulk work on a scheduler backed by this thread pool
145 |         exec::static_thread_pool ctx{threads};
146 |         scheduler auto sch = ctx.get_scheduler();
147 |         sender auto start = schedule(sch);
148 | 
149 |         switch (type) {
150 |             case sig_type::square:
151 |                 sync_wait(bulk(start, N, [&](int n) { y[n] = (n < N / 4 || n >= 3 * N / 4) ? 1.0 : -1.0; }));
152 |                 break;
153 |             case sig_type::sinusoid:
154 |                 sync_wait(bulk(start, N, [&](int n) { y[n] = std::sin(2.0 * M_PI * n / N); }));
155 |                 break;
156 |             // sawtooth/triangle: 2.0 * n / N is evaluated in double; 2.0 * (n / N) would truncate n / N to 0
157 |             case sig_type::sawtooth:
158 |                 sync_wait(bulk(start, N, [&](int n) { y[n] = 2.0 * n / N - 1.0; }));
159 |                 break;
160 |             case sig_type::triangle:
161 |                 sync_wait(bulk(start, N, [&](int n) { y[n] = 2.0 * std::abs(2.0 * n / N - 1.0) - 1.0; }));
162 |                 break;
163 |             case sig_type::sinc:
164 |                 if (N == 0) break;  // empty signal: there is no y[0] to write
165 |                 y[0] = 1.0;         // sin(x)/x -> 1 as x -> 0, so sample 0 is set directly
166 |                 sync_wait(bulk(start, N - 1, [&](int n) {
167 |                     y[n + 1] = std::sin(2.0 * M_PI * (n + 1) / N) / (2.0 * M_PI * (n + 1) / N);
168 |                 }));
169 |                 break;
170 |             case sig_type::box:
171 |                 sync_wait(bulk(start, N, [&](int n) { y[n] = (n < N / 4 || n >= 3 * N / 4) ? 1.0 : 0.0; }));
172 |                 break;
173 |             default:
174 |                 std::cerr << "ERROR: Unknown input signal type. exiting.." << std::endl;
175 |                 std::cerr << "Run: <FFT_app> --help to see the list of available signals" << std::endl;
176 |                 exit(1);
177 |         }
178 |     }
179 | 
180 |     ~signal() { y.clear(); }
181 | 
182 |     data_t* data() { return y.data(); }
183 | 
184 |     int len() const { return y.size(); }
185 | 
186 |     void resize(int N) {
187 |         if (static_cast<std::size_t>(N) != y.size())
188 |             y.resize(N, 0);
189 |     }
190 | 
191 |     data_t& operator[](int n) { return y[n]; }
192 | 
193 |     data_t& operator()(int n) { return y[n]; }
194 | 
195 |     void printSignal() { fmt::print("{} \n", y); }
196 | 
197 |     // Check X (an FFT of this signal) against a brute-force DFT: build the N x N matrix
198 |     // F[i][j] = WNk(N, i * j) on `sch`, compute Y = F * y, then compare X[k] with Y[k] via complex_compare().
199 |     // Returns true if every pair matches. X must hold at least len() samples; memory is O(N^2), hence maxN.
200 |     [[nodiscard]] bool isFFT(signal& X, scheduler auto sch, int maxN = 20000) {
201 |         int N = y.size();
202 |         bool ret = true;
203 | 
204 |         if (X.len() > maxN) {
205 |             fmt::print("Input signal may be too large to compute DFT via y[n] = WNk * x[n]. Segfaults expected..\n");
206 |         }
207 | 
208 |         std::vector<data_t> Y(N);
209 |         std::vector<data_t> M(N * N);
210 | 
211 |         auto A = std::mdspan<data_t, view_2d, std::layout_right>(M.data(), N, N);
212 |         auto mdy = std::mdspan<data_t, view_2d, std::layout_right>(y.data(), N, 1);
213 |         auto mdY = std::mdspan<data_t, view_2d, std::layout_right>(Y.data(), N, 1);
214 | 
215 |         data_t* F = M.data();
216 |         data_t* X_ptr = X.data();
217 |         data_t* Y_ptr = Y.data();
218 | 
219 |         // fill the DFT matrix on `sch`: flat index k maps to row i = k / N and column j = k % N
220 |         ex::sender auto init = ex::transfer_just(sch, F) | ex::bulk(N * N, [=](int k, auto F) {
221 |                                    int i = k / N;
222 |                                    int j = k % N;
223 |                                    F[k] = WNk(N, i * j);
224 |                                });
225 |         ex::sync_wait(init);
226 | 
227 |         // compute Y[n] = dft(x[n]) = WNk * x[n]
228 |         stdex::linalg::matrix_product(std::execution::par, A, mdy, mdY);
229 | 
230 |         // compare the computed Y[n] (dft) with X[n](fft)
231 |         ex::sender auto verify = ex::transfer_just(sch, ret, X_ptr, Y_ptr) |
232 |                                  ex::bulk(N,
233 |                                           [](int k, auto& ret, auto X_ptr, auto Y_ptr) {
234 |                                               if (!complex_compare(X_ptr[k], Y_ptr[k])) {
235 |                                                   ret = false;
236 |                                               }
237 |                                           }) |
238 |                                  then([](auto ret, auto&&...) { return ret; });
239 | 
240 |         // let the pipeline run
241 |         auto [re] = ex::sync_wait(verify).value();
242 |         return re;
243 |     }
244 | 
245 |    private:
246 |     // samples y[n], n = 0 .. len() - 1
247 |     std::vector<data_t> y;
248 | };
249 | 
250 | using sig_t = signal;
251 | 


--------------------------------------------------------------------------------