├── apps ├── choleskey │ ├── CMakeLists.txt │ ├── matrixutil.hpp │ ├── choleskey_serial.cpp │ ├── choleskey_stdpar.cpp │ └── choleskey_stdpar_snd.cpp ├── comm-study │ ├── CMakeLists.txt │ └── comm-study-no-senders.cpp ├── mdspan-stdpar │ ├── CMakeLists.txt │ └── mdspan-stdpar.cpp ├── 1d-stencil │ ├── CMakeLists.txt │ ├── 1d-cuda.cpp │ ├── 1d-omp.cpp │ ├── 1d-serial.cpp │ ├── 1d-stdpar.cpp │ └── 1d-stdexec.cpp ├── heat-equation │ ├── CMakeLists.txt │ ├── heat-equation.hpp │ ├── heat-equation-stdpar.cpp │ ├── heat-equation-serial.cpp │ ├── heat-equation-omp.cpp │ ├── heat-equation-stdexec.cpp │ └── heat-equation-cuda.cpp ├── prefixSum │ ├── CMakeLists.txt │ ├── prefixSum.hpp │ ├── prefixSum-stdpar.cpp │ ├── prefixSum-serial.cpp │ └── prefixSum-stdexec.cpp ├── fft │ ├── CMakeLists.txt │ ├── fft-serial.cpp │ ├── fft-stdpar.cpp │ ├── fft-stdexec.cpp │ └── fft.hpp └── CMakeLists.txt ├── .github └── workflows │ └── format_check.yml ├── LICENSE ├── scripts ├── fft.nvhpc.gpu.sh ├── pm-localrc │ └── localrc ├── heat-run.gcc.sh ├── fft.nvhpc.grace.cpu.sh ├── fft.nvhpc.cpu.sh ├── heat-run.nvhpc.grace.cpu.sh ├── stencil.nvhpc.grace.cpu.sh ├── heat-run.nvhpc.cpu.sh ├── stencil.nvhpc.cpu.sh ├── benchmark.sh └── fft.ncu.nsys.gpu.sh ├── README.md ├── .clang-format ├── .gitignore ├── .gitlab-ci.yml ├── include ├── counting_iterator.hpp └── commons.hpp ├── CMakeLists.txt └── .cmake-format.py /apps/choleskey/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(choleskey_stdpar LANGUAGES CXX) 2 | 3 | add_executable(choleskey_serial choleskey_serial.cpp) 4 | target_include_directories( 5 | choleskey_serial 6 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 7 | ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) 8 | target_link_libraries(choleskey_serial hpcpp-core) 9 | 10 | add_executable(choleskey_stdpar choleskey_stdpar.cpp) 11 | target_link_libraries(choleskey_stdpar stdexec hpcpp-core) 12 | 
target_include_directories( 13 | choleskey_stdpar 14 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 15 | ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) 16 | 17 | # TODO: remove this example add_executable(choleskey_stdpar_snd 18 | # choleskey_stdpar_snd.cpp) target_link_libraries(choleskey_stdpar_snd stdexec) 19 | # target_include_directories( choleskey_stdpar_snd PRIVATE ${CMAKE_BINARY_DIR} 20 | # ${CMAKE_CURRENT_LIST_DIR}/../../include ${ARGPARSE_INCLUDE_DIR} 21 | # ${MDSPAN_INCLUDE_DIR}) 22 | -------------------------------------------------------------------------------- /apps/comm-study/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(comm-study LANGUAGES CXX) 2 | 3 | file(GLOB CPP_SOURCES "*.cpp") 4 | 5 | foreach(source_file ${CPP_SOURCES}) 6 | # get the file name without an extension 7 | get_filename_component(exec_name ${source_file} NAME_WE) 8 | 9 | # add an executable with the same name as the source file 10 | add_executable(${exec_name} ${_EXCLUDE} ${source_file}) 11 | set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX 12 | LINKER_LANGUAGE CXX) 13 | target_include_directories( 14 | ${exec_name} 15 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 16 | ${MDSPAN_INCLUDE_DIR}) 17 | 18 | target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec hpcpp-core) 19 | 20 | set_target_properties( 21 | ${exec_name} 22 | PROPERTIES CXX_STANDARD ${CXX_STANDARD} 23 | CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS} 24 | INSTALL_RPATH_USE_LINK_PATH ON) 25 | 26 | # installation 27 | install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR}) 28 | endforeach() 29 | -------------------------------------------------------------------------------- /.github/workflows/format_check.yml: -------------------------------------------------------------------------------- 1 | name: Format Check 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - '**.cpp' 7 | - '**.h' 8 | - 
'CMakeLists.txt' 9 | - '**.cmake' 10 | 11 | jobs: 12 | check-format: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v2 18 | 19 | - name: Install clang-format 20 | run: sudo apt-get install clang-format 21 | 22 | - name: Install cmake-format 23 | run: pip install cmake-format 24 | 25 | - name: Check C++ format 26 | run: | 27 | clang-format --version 28 | find . -name '*.cpp' -o -name '*.h' -o -name '*.hpp' -o -name '*.cc' | xargs clang-format -i 29 | git diff --exit-code || (echo "Code was not formatted using clang-format! Please format your code." && exit 1) 30 | 31 | - name: Check CMake format 32 | run: | 33 | find . -name 'CMakeLists.txt' -o -name '*.cmake' | xargs cmake-format -i 34 | git diff --exit-code || (echo "CMake files were not formatted using cmake-format! Please format your files." && exit 1) 35 | 36 | -------------------------------------------------------------------------------- /apps/mdspan-stdpar/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(mdspan-stdpar LANGUAGES CXX) 2 | 3 | add_executable(mdspan-stdpar ${_EXCLUDE} 4 | ${CMAKE_CURRENT_LIST_DIR}/mdspan-stdpar.cpp) 5 | 6 | set_source_files_properties(${CMAKE_CURRENT_LIST_DIR}/mdspan-stdpar.cpp 7 | PROPERTIES LANGUAGE CXX LINKER_LANGUAGE CXX) 8 | 9 | # add dependencies (not applicable yet) add_dependencies(mdspan-stdpar 10 | # magic_enum argparse) 11 | 12 | # include core/include and generated files DO NOT include 13 | # ${MAGICENUM_INCLUDE_DIR} as it results in an internal error at templates due 14 | # to CUDA compiler. 
15 | target_include_directories( 16 | mdspan-stdpar 17 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 18 | ${MDSPAN_INCLUDE_DIR}) 19 | 20 | target_link_libraries(mdspan-stdpar PUBLIC ${MPI_LIBS} stdexec hpcpp-core) 21 | 22 | set_target_properties( 23 | mdspan-stdpar 24 | PROPERTIES CXX_STANDARD ${CXX_STANDARD} 25 | CXX_EXTENSIONS NO 26 | INSTALL_RPATH_USE_LINK_PATH ON) 27 | 28 | # installation 29 | install(TARGETS mdspan-stdpar DESTINATION ${CMAKE_INSTALL_BINDIR}) 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) The Regents of the University of California (Muhammad Haseeb, Weile Wei), 2023 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /apps/choleskey/matrixutil.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | // generate a positive definite (Pascal) matrix, returned flattened row by row 7 | template 8 | using Matrix = std::vector>; 9 | 10 | template 11 | std::vector generate_pascal_matrix(const int n) { 12 | Matrix matrix(n, std::vector(n, static_cast(0))); 13 | 14 | for (int i = 0; i < n; ++i) { 15 | for (int j = 0; j < n; ++j) { 16 | if (i == 0 || j == 0) { 17 | matrix[i][j] = static_cast(1); 18 | } else { 19 | matrix[i][j] = matrix[i][j - 1] + matrix[i - 1][j]; 20 | } 21 | } 22 | } 23 | 24 | std::vector flattenedVector; 25 | for (const auto& row : matrix) { 26 | flattenedVector.insert(flattenedVector.end(), row.begin(), row.end()); 27 | } 28 | return flattenedVector;  // plain return: std::move here would inhibit copy elision (NRVO) 29 | } 30 | 31 | // command-line parameter definitions 32 | struct args_params_t : public argparse::Args { 33 | bool& results = kwarg("results", "print generated results (default: true)").set_default(true); 34 | std::uint64_t& nd = kwarg("nd", "Number of input(positive definition) matrix dimension(<=18)").set_default(10); 35 | std::uint64_t& np = kwarg("np", "Number of partitions").set_default(4); 36 | bool& help = flag("h, help", "print help"); 37 | bool& time = kwarg("t, time", "print time").set_default(true); 38 | }; 39 | -------------------------------------------------------------------------------- /scripts/fft.nvhpc.gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | # 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did 6 | # for GCC compiler script before running this. 
7 | # 8 | 9 | #SBATCH -A nstaff_g 10 | #SBATCH -C gpu 11 | #SBATCH --qos=regular 12 | #SBATCH --time=8:00:00 13 | #SBATCH --nodes=1 14 | #SBATCH --gpus=4 15 | #SBATCH --ntasks-per-node=4 16 | #SBATCH --exclusive 17 | #SBATCH --gpu-bind=none 18 | #SBATCH -o fft-gpu.o%j 19 | #SBATCH -e fft-gpu.e%j 20 | #SBATCH -J FFT-GPU 21 | 22 | set +x 23 | 24 | mkdir -p ${HOME}/repos/nvstdpar/build-fft-gpu 25 | cd ${HOME}/repos/nvstdpar/build-fft-gpu 26 | rm -rf ./* 27 | 28 | ml unload cudatoolkit 29 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles 30 | ml nvhpc/23.7 31 | # need this for GLIBC 32 | ml gcc/12.2.0 33 | ml cmake/3.24 34 | 35 | cmake .. -DSTDPAR=gpu -DOMP=gpu -DCMAKE_CXX_COMPILER=$(which nvc++) 36 | 37 | make -j fft-stdexec fft-stdpar 38 | 39 | cd ${HOME}/repos/nvstdpar/build-fft-gpu/apps/fft 40 | 41 | D=(536870912 1073741824) 42 | 43 | for d in "${D[@]}"; do 44 | echo "stdexec:gpu for ${d}" 45 | srun -n 1 ./fft-stdexec -N ${d} --time --sch=gpu 46 | 47 | echo "stdpar:gpu for ${d}" 48 | srun -n 1 ./fft-stdpar -N ${d} --time 2>&1 49 | done 50 | 51 | for d in "${D[@]}"; do 52 | echo "stdexec:multi_gpu for ${d}" 53 | srun -n 1 ./fft-stdexec -N ${d} --time --sch=multigpu 2>&1 54 | done 55 | 56 | -------------------------------------------------------------------------------- /scripts/pm-localrc/localrc: -------------------------------------------------------------------------------- 1 | set LFC=-lgfortran; 2 | set LDSO=/lib64/ld-linux-x86-64.so.2; 3 | set GCCDIR=/opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/; 4 | set G77DIR=/opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/; 5 | set OEM_INFO=64-bit target on x86-64 Linux $INFOTPVAL; 6 | set GNUATOMIC=-latomic; 7 | set GCCINC=/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/12.0/include /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/include /opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include /usr/local/include /opt/cray/pe/gcc/12.2.0/snos/include 
/opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include-fixed /usr/include; 8 | set GPPDIR=/opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/12.0/include /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/cuda/12.0/include /opt/cray/pe/gcc/12.2.0/snos/include/g++ /opt/cray/pe/gcc/12.2.0/snos/include/g++/x86_64-suse-linux /opt/cray/pe/gcc/12.2.0/snos/include/g++/backward /opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include /usr/local/include /opt/cray/pe/gcc/12.2.0/snos/include /opt/cray/pe/gcc/12.2.0/snos/lib/gcc/x86_64-suse-linux/12.2.0/include-fixed /usr/include; 9 | set NUMALIBNAME=-lnuma; 10 | set LOCALRC=YES; 11 | set EXTENSION=__extension__=; 12 | set LC=-lgcc -lc $if(-Bstatic,-lgcc_eh, -lgcc_s); 13 | set DEFCUDAVERSION=12.0; 14 | set DEFSTDPARCOMPUTECAP=80; 15 | # GLIBC version 2.31 16 | # GCC version 12.2.0 17 | set GCCVERSION=120200; 18 | set LIBNCURSES=YES; 19 | export PGI=$COMPBASE; -------------------------------------------------------------------------------- /apps/1d-stencil/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(1d-stencil LANGUAGES CXX) 2 | 3 | file(GLOB CPP_SOURCES "*.cpp") 4 | 5 | foreach(source_file ${CPP_SOURCES}) 6 | if(NOT STDPAR STREQUAL "gpu") 7 | if("${source_file}" MATCHES ".*stdpar.*gpu.*" 8 | OR "${source_file}" MATCHES ".*gpu.*stdpar.*" 9 | OR "${source_file}" MATCHES ".*cuda.*") 10 | message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") 11 | continue() 12 | endif() 13 | endif() 14 | 15 | # get the file name without an extension 16 | get_filename_component(exec_name ${source_file} NAME_WE) 17 | 18 | # add an executable with the same name as the source file 19 | add_executable(${exec_name} ${_EXCLUDE} ${source_file}) 20 | 21 | # add dependency on argparse 22 | add_dependencies(${exec_name} argparse) 23 | 24 | set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX 25 | LINKER_LANGUAGE CXX) 26 | target_include_directories( 27 | 
${exec_name} 28 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 29 | ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) 30 | 31 | target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec hpcpp-core) 32 | 33 | set_target_properties( 34 | ${exec_name} 35 | PROPERTIES CXX_STANDARD ${CXX_STANDARD} 36 | CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS} 37 | INSTALL_RPATH_USE_LINK_PATH ON) 38 | 39 | # installation 40 | install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR}) 41 | endforeach() 42 | -------------------------------------------------------------------------------- /apps/heat-equation/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(heat-equation LANGUAGES CXX) 2 | 3 | file(GLOB CPP_SOURCES "*.cpp") 4 | 5 | foreach(source_file ${CPP_SOURCES}) 6 | if(NOT STDPAR STREQUAL "gpu") 7 | if("${source_file}" MATCHES ".*stdpar.*gpu.*" 8 | OR "${source_file}" MATCHES ".*gpu.*stdpar.*" 9 | OR "${source_file}" MATCHES ".*cuda.*") 10 | message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") 11 | continue() 12 | endif() 13 | endif() 14 | 15 | # get the file name without an extension 16 | get_filename_component(exec_name ${source_file} NAME_WE) 17 | 18 | # add an executable with the same name as the source file 19 | add_executable(${exec_name} ${_EXCLUDE} ${source_file}) 20 | 21 | # add dependency on argparse 22 | add_dependencies(${exec_name} argparse) 23 | 24 | set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX 25 | LINKER_LANGUAGE CXX) 26 | target_include_directories( 27 | ${exec_name} 28 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 29 | ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) 30 | 31 | target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec hpcpp-core) 32 | 33 | set_target_properties( 34 | ${exec_name} 35 | PROPERTIES CXX_STANDARD ${CXX_STANDARD} 36 | CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS} 37 | INSTALL_RPATH_USE_LINK_PATH ON) 38 | 
39 | # installation 40 | install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR}) 41 | endforeach() 42 | -------------------------------------------------------------------------------- /scripts/heat-run.gcc.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | # 4 | # Read and do the following two steps before running this script: 5 | # 6 | # 7 | # 1. In nvstdpar/CMakeLists.txt, replace the following line: 8 | # 9 | # set(CMAKE_CXX_FLAGS 10 | # "${CMAKE_CXX_FLAGS} -stdpar=${STDPAR} -mp=${OMP} --gcc-toolchain=/opt/cray/pe/gcc/12.2.0/bin/ -pthread" 11 | # 12 | # with 13 | # 14 | # set(CMAKE_CXX_FLAGS 15 | # "${CMAKE_CXX_FLAGS} -fopenmp -pthread" 16 | # 17 | # 18 | # 2. In nvstdpar/apps/heat-equation/CMakeLists.txt, replace the following line: 19 | # 20 | # target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec) 21 | # 22 | # with 23 | # 24 | # target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec tbb) 25 | # 26 | # 27 | 28 | set -x 29 | 30 | mkdir -p ${HOME}/repos/nvstdpar/build-gcc 31 | cd ${HOME}/repos/nvstdpar/build-gcc 32 | 33 | rm -rf ./* 34 | ml cmake/3.24 gcc/12.2 cudatoolkit/12.0 35 | ml unload cray-mpich 36 | 37 | cmake .. 
-DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which g++) -DCMAKE_CUDA_HOST_COMPILER=$(which g++) 38 | 39 | make -j heat-equation-omp heat-equation-mdspan heat-equation-stdpar 40 | 41 | cd ${HOME}/repos/nvstdpar/build-gcc/apps/heat-equation 42 | 43 | ./heat-equation-mdspan -s=50 -n=30000 --time 2>&1 |& tee gcc-md.txt 44 | 45 | # parallel runs 46 | T=(128 64 32 16 8 4 2 1) 47 | 48 | for i in "${T[@]}"; do 49 | ./heat-equation-omp -s=50 -n=30000 --time --nthreads=${i} 2>&1 |& tee gcc-omp-${i}.txt 50 | done 51 | 52 | # will use 128 threads anyway 53 | ./heat-equation-stdpar -s=50 -n=30000 --time 2>&1 |& tee gcc-stdpar-${i}.txt 54 | -------------------------------------------------------------------------------- /scripts/fft.nvhpc.grace.cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | # 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did 6 | # for GCC compiler script before running this. 7 | # 8 | 9 | #SBATCH -N 1 10 | #SBATCH -p cg4-cpu4x120gb-gpu4x80gb 11 | #SBATCH --gres=gpu:4 12 | #SBATCH --exclusive 13 | #SBATCH -o fft-cpu.o%j 14 | #SBATCH -e fft-cpu.e%j 15 | #SBATCH -J FFT-CPU 16 | 17 | set +x 18 | 19 | mkdir -p ${HOME}/repos/nvstdpar/build-fft-cpu 20 | cd ${HOME}/repos/nvstdpar/build-fft-cpu 21 | rm -rf ./* 22 | 23 | module unload gcc; module load gcc/12.3.0; module load nvhpc/23.5; module load slurm 24 | export PATH=/home/wwei/install/cmake_3_27_3/bin/:$PATH 25 | 26 | cmake .. 
-DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++) 27 | 28 | make -j fft-serial fft-stdexec fft-stdpar 29 | 30 | cd ${HOME}/repos/nvstdpar/build-fft-cpu/apps/fft 31 | 32 | D=(536870912 1073741824) 33 | 34 | # parallel runs 35 | T=(256 128 64 32 16 8 4 2 1) 36 | 37 | for d in "${D[@]}"; do 38 | for i in "${T[@]}"; do 39 | echo "stdexec:cpu for ${d}, threads=${i}" 40 | srun -n 1 --cpu-bind=none ./fft-stdexec -N ${d} --time --sch=cpu --nthreads=${i} 41 | 42 | echo "stdpar:cpu for ${d}, threads=${i}" 43 | export OMP_NUM_THREADS=${i} 44 | srun -n 1 --cpu-bind=none ./fft-stdpar -N ${d} --time --nthreads=${i} 45 | done 46 | done 47 | 48 | unset OMP_NUM_THREADS 49 | 50 | for d in "${D[@]}"; do 51 | echo "serial for ${d}" 52 | srun -n 1 --cpu-bind=none ./fft-serial -N ${d} --time 53 | done 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hpcpp 2 | 3 | Standard C++26 High-Performance Computing (HPC) applications that run on CPUs and GPUs. 4 | 5 | ## Build 6 | 7 | ```bash 8 | git clone https://github.com/NERSC/hpcpp.git 9 | cd hpcpp; mkdir build ; cd build 10 | ml nvhpc/23.7 cmake/3.24 11 | 12 | # enable GPU support by setting -DSTDPAR=gpu (default) 13 | cmake .. -DSTDPAR= -DOMP== 11.2.0` paths. 18 | 19 | ### NERSC Users 20 | 21 | You can also use the pre-configured `localrc` file included in this repo. To use it, run: 22 | 23 | ```bash 24 | export GCCLOCALRC=/path/to/hpcpp/scripts/pm-localrc/localrc 25 | ``` 26 | 27 | **Note**: Uncomment the following `target_link_directories` line in `apps/fft/CMakeLists.txt` if you are using an `nvc++` version earlier than 23.7. 
28 | 29 | ```bash 30 | # uncomment only if using nvc++ earlier than 23.7 to find libcublas 31 | # target_link_directories(${exec_name} PRIVATE /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64) 32 | ``` 33 | 34 | ## Run Apps 35 | 36 | ```bash 37 | cd hpcpp/build 38 | srun -n 1 -N 1 -G <> -A -t 30 -C ./apps// [ARGS] 39 | ``` 40 | 41 | Use `--help` to see help with arguments. 42 | 43 | ## Contributors 44 | 45 | (in alphabetical order of last name) 46 | - [Muhammad Haseeb](https://nersc.gov/muhammad-haseeb) 47 | - [Weile Wei](https://nersc.gov/weile-wei) 48 | - [Chuanqiu He](https://github.com/hcq9102) 49 | 50 | ## License 51 | Copyright (C) The Regents of the University of California, 2023 (See [LICENSE](LICENSE) for details). 52 | -------------------------------------------------------------------------------- /scripts/fft.nvhpc.cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | # 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did 6 | # for GCC compiler script before running this. 7 | # 8 | 9 | #SBATCH -A nstaff 10 | #SBATCH -C cpu 11 | #SBATCH --qos=regular 12 | #SBATCH --time=12:00:00 13 | #SBATCH --nodes=1 14 | #SBATCH --ntasks-per-node=1 15 | #SBATCH --cpus-per-task=128 16 | #SBATCH --exclusive 17 | #SBATCH -o fft-cpu.o%j 18 | #SBATCH -e fft-cpu.e%j 19 | #SBATCH -J FFT-CPU 20 | 21 | set +x 22 | 23 | BUILD_HOME=${HOME}/repos/nvstdpar/build-fft-cpu 24 | 25 | mkdir -p ${BUILD_HOME} 26 | cd ${BUILD_HOME} 27 | rm -rf ./* 28 | 29 | ml unload cudatoolkit 30 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles 31 | ml nvhpc/23.7 32 | # need this for GLIBC 33 | ml gcc/12.2.0 34 | ml cmake/3.24 35 | 36 | cmake .. 
-DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++) 37 | 38 | make -j fft-serial fft-stdexec fft-stdpar 39 | 40 | cd ${BUILD_HOME}/apps/fft 41 | 42 | D=(536870912 1073741824) 43 | 44 | # parallel runs 45 | T=(256 128 64 32 16 8 4 2 1) 46 | 47 | for d in "${D[@]}"; do 48 | for i in "${T[@]}"; do 49 | echo "stdexec:cpu for ${d}, threads=${i}" 50 | srun -n 1 --cpu-bind=none ./fft-stdexec -N ${d} --time --sch=cpu --nthreads=${i} 51 | 52 | echo "stdpar:cpu for ${d}, threads=${i}" 53 | export OMP_NUM_THREADS=${i} 54 | srun -n 1 --cpu-bind=none ./fft-stdpar -N ${d} --time --nthreads=${i} 55 | done 56 | done 57 | 58 | unset OMP_NUM_THREADS 59 | 60 | for d in "${D[@]}"; do 61 | echo "serial for ${d}" 62 | srun -n 1 --cpu-bind=none ./fft-serial -N ${d} --time 63 | done 64 | -------------------------------------------------------------------------------- /scripts/heat-run.nvhpc.grace.cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | # 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did 6 | # for GCC compiler script before running this. 7 | # 8 | #SBATCH -N 1 9 | #SBATCH -p cg4-cpu4x120gb-gpu4x80gb 10 | #SBATCH --gres=gpu:4 11 | #SBATCH --exclusive 12 | #SBATCH -o heat-cpu.o%j 13 | #SBATCH -e heat-cpu.e%j 14 | #SBATCH -J HEAT-CPU 15 | 16 | set +x 17 | 18 | BUILD_HOME=${HOME}/repos/nvstdpar/build-heat-cpu 19 | 20 | mkdir -p ${BUILD_HOME} 21 | cd ${BUILD_HOME} 22 | rm -rf ./* 23 | 24 | module unload gcc; module load gcc/12.3.0; module load nvhpc/23.5; module load slurm 25 | export PATH=/home/wwei/install/cmake_3_27_3/bin/:$PATH 26 | 27 | cmake .. 
-DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++) 28 | 29 | make -j heat-equation-omp heat-equation-serial heat-equation-stdexec heat-equation-stdpar 30 | 31 | cd ${BUILD_HOME}/apps/heat-equation 32 | 33 | # parallel runs 34 | T=(256 128 64 32 16 8 4 2 1) 35 | 36 | unset OMP_NUM_THREADS 37 | 38 | for i in "${T[@]}"; do 39 | echo "heat:omp, threads=${i}" 40 | srun -n 1 --cpu-bind=none ./heat-equation-omp -s=1000 -n=46000 --time --nthreads=${i} 41 | 42 | echo "heat:stdexec, threads=${i}" 43 | srun -n 1 --cpu-bind=none ./heat-equation-stdexec -s=1000 -n=46000 --time --nthreads=${i} 44 | done 45 | 46 | for i in "${T[@]}"; do 47 | echo "heat:stdpar, threads=${i}" 48 | export OMP_NUM_THREADS=${i} 49 | srun -n 1 --cpu-bind=none ./heat-equation-stdpar -s=1000 -n=46000 --time 50 | done 51 | 52 | unset OMP_NUM_THREADS 53 | 54 | echo "heat:serial" 55 | srun -n 1 --cpu-bind=none ./heat-equation-serial -s=1000 -n=46000 --time 56 | -------------------------------------------------------------------------------- /scripts/stencil.nvhpc.grace.cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | #SBATCH -N 1 4 | #SBATCH -p cg4-cpu4x120gb-gpu4x80gb 5 | #SBATCH --gres=gpu:4 6 | #SBATCH --exclusive 7 | #SBATCH -o 1d-cpu.o%j 8 | #SBATCH -e 1d-cpu.e%j 9 | #SBATCH -J 1D-CPU 10 | 11 | set +x 12 | 13 | BUILD_HOME=${HOME}/repos/nvstdpar/build-1d-cpu 14 | 15 | mkdir -p ${BUILD_HOME} 16 | cd ${BUILD_HOME} 17 | rm -rf ./* 18 | 19 | module unload gcc; module load gcc/12.3.0; module load nvhpc/23.5; module load slurm 20 | export PATH=/home/wwei/install/cmake_3_27_3/bin/:$PATH 21 | 22 | # export OMP_PLACES="{0:16},{16:16},{32:16},{48:16},{64:16},{80:16},{96:16},{112:16}" 23 | # export OMP_PROC_BIND=close 24 | 25 | 26 | oneDimension_size=1000000000 27 | oneDimension_iterations=4000 28 | 29 | cmake .. 
-DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++) 30 | make -j 1d-omp 1d-serial 1d-stdexec 1d-stdpar 31 | 32 | cd ${BUILD_HOME}/apps/1d-stencil 33 | 34 | # parallel runs (executables are named after the 1d-*.cpp sources in apps/1d-stencil) 35 | T=(256 128 64 32 16 8 4 2 1) 36 | 37 | unset OMP_NUM_THREADS 38 | 39 | for i in "${T[@]}"; do 40 | echo "1d:omp, threads=${i}" 41 | srun -n 1 --cpu-bind=none ./1d-omp --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i 42 | 43 | 44 | echo "1d:stdexec, threads=${i}" 45 | srun -n 1 --cpu-bind=none ./1d-stdexec --sch cpu --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i 46 | done 47 | 48 | for i in "${T[@]}"; do 49 | echo "1d:stdpar, threads=${i}" 50 | export OMP_NUM_THREADS=${i} 51 | srun -n 1 --cpu-bind=none ./1d-stdpar --size $oneDimension_size --nt $oneDimension_iterations 52 | done 53 | 54 | unset OMP_NUM_THREADS 55 | 56 | echo "1d:serial" 57 | srun -n 1 --cpu-bind=none ./1d-serial --size $oneDimension_size --nt $oneDimension_iterations 58 | 59 | -------------------------------------------------------------------------------- /scripts/heat-run.nvhpc.cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | # 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did 6 | # for GCC compiler script before running this. 
7 | # 8 | 9 | #SBATCH -A nstaff 10 | #SBATCH -C cpu 11 | #SBATCH --qos=regular 12 | #SBATCH --time=24:00:00 13 | #SBATCH --nodes=1 14 | #SBATCH --ntasks-per-node=1 15 | #SBATCH --cpus-per-task=128 16 | #SBATCH --exclusive 17 | #SBATCH -o heat-cpu.o%j 18 | #SBATCH -e heat-cpu.e%j 19 | #SBATCH -J HEAT-CPU 20 | 21 | set +x 22 | 23 | BUILD_HOME=${HOME}/repos/nvstdpar/build-heat-cpu 24 | 25 | mkdir -p ${BUILD_HOME} 26 | cd ${BUILD_HOME} 27 | rm -rf ./* 28 | 29 | ml unload cudatoolkit 30 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles 31 | ml nvhpc/23.7 32 | # needed for GLIBC 33 | ml gcc/12.2.0 34 | ml cmake/3.24 35 | 36 | cmake .. -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++) 37 | 38 | make -j heat-equation-omp heat-equation-serial heat-equation-stdexec heat-equation-stdpar 39 | 40 | cd ${BUILD_HOME}/apps/heat-equation 41 | 42 | # parallel runs 43 | T=(256 128 64 32 16 8 4 2 1) 44 | 45 | unset OMP_NUM_THREADS 46 | 47 | for i in "${T[@]}"; do 48 | echo "heat:omp, threads=${i}" 49 | srun -n 1 --cpu-bind=none ./heat-equation-omp -s=1000 -n=46000 --time --nthreads=${i} 50 | 51 | echo "heat:stdexec, threads=${i}" 52 | srun -n 1 --cpu-bind=none ./heat-equation-stdexec -s=1000 -n=46000 --time --nthreads=${i} 53 | done 54 | 55 | for i in "${T[@]}"; do 56 | echo "heat:stdpar, threads=${i}" 57 | export OMP_NUM_THREADS=${i} 58 | srun -n 1 --cpu-bind=none ./heat-equation-stdpar -s=1000 -n=46000 --time 59 | done 60 | 61 | unset OMP_NUM_THREADS 62 | 63 | echo "heat:serial" 64 | srun -n 1 --cpu-bind=none ./heat-equation-serial -s=1000 -n=46000 --time 65 | -------------------------------------------------------------------------------- /scripts/stencil.nvhpc.cpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | #SBATCH -A nstaff 4 | #SBATCH -C cpu 5 | #SBATCH --qos=regular 6 | #SBATCH --time=24:00:00 7 | #SBATCH --nodes=1 8 | #SBATCH --ntasks-per-node=1 9 | #SBATCH --cpus-per-task=128 
10 | #SBATCH --exclusive 11 | #SBATCH -o 1d-cpu.o%j 12 | #SBATCH -e 1d-cpu.e%j 13 | #SBATCH -J 1D-CPU 14 | 15 | set +x 16 | 17 | BUILD_HOME=${HOME}/repos/nvstdpar/build-1d-cpu 18 | 19 | mkdir -p ${BUILD_HOME} 20 | cd ${BUILD_HOME} 21 | rm -rf ./* 22 | 23 | ml unload cudatoolkit 24 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles 25 | ml nvhpc/23.7 26 | # needed for GLIBC 27 | ml gcc/12.2.0 28 | ml cmake/3.24 29 | 30 | 31 | # export OMP_PLACES="{0:16},{16:16},{32:16},{48:16},{64:16},{80:16},{96:16},{112:16}" 32 | # export OMP_PROC_BIND=close 33 | 34 | 35 | oneDimension_size=1000000000 36 | oneDimension_iterations=4000 37 | 38 | cmake .. -DSTDPAR=multicore -DOMP=multicore -DCMAKE_CXX_COMPILER=$(which nvc++) 39 | make -j 1d-omp 1d-serial 1d-stdexec 1d-stdpar 40 | 41 | cd ${BUILD_HOME}/apps/1d-stencil 42 | 43 | # parallel runs (executables are named after the 1d-*.cpp sources in apps/1d-stencil) 44 | T=(256 128 64 32 16 8 4 2 1) 45 | 46 | unset OMP_NUM_THREADS 47 | 48 | for i in "${T[@]}"; do 49 | echo "1d:omp, threads=${i}" 50 | srun -n 1 --cpu-bind=none ./1d-omp --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i 51 | 52 | 53 | echo "1d:stdexec, threads=${i}" 54 | srun -n 1 --cpu-bind=none ./1d-stdexec --sch cpu --size $oneDimension_size --nt $oneDimension_iterations --nthreads=$i 55 | done 56 | 57 | for i in "${T[@]}"; do 58 | echo "1d:stdpar, threads=${i}" 59 | export OMP_NUM_THREADS=${i} 60 | srun -n 1 --cpu-bind=none ./1d-stdpar --size $oneDimension_size --nt $oneDimension_iterations 61 | done 62 | 63 | unset OMP_NUM_THREADS 64 | 65 | echo "1d:serial" 66 | srun -n 1 --cpu-bind=none ./1d-serial --size $oneDimension_size --nt $oneDimension_iterations 67 | 68 | -------------------------------------------------------------------------------- /apps/prefixSum/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(prefixSum LANGUAGES CXX) 2 | 3 | file(GLOB CPP_SOURCES "*.cpp") 4 | 5 | # add -cudalib=cublas if -stdpar=gpu 6 | if(STDPAR STREQUAL "gpu") 7 | set(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS}") 8 | endif() 9 | 10 | foreach(source_file ${CPP_SOURCES}) 11 | if(NOT STDPAR STREQUAL "gpu") 12 | if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" MATCHES 13 | ".*gpu.*stdpar.*") 14 | message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") 15 | continue() 16 | endif() 17 | endif() 18 | 19 | # get the file name without an extension 20 | get_filename_component(exec_name ${source_file} NAME_WE) 21 | 22 | # add an executable with the same name as the source file 23 | add_executable(${exec_name} ${_EXCLUDE} ${source_file}) 24 | 25 | # add dependency on argparse 26 | add_dependencies(${exec_name} argparse) 27 | 28 | set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX 29 | LINKER_LANGUAGE CXX) 30 | target_include_directories( 31 | ${exec_name} 32 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 33 | ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) 34 | 35 | # uncomment only if using nvc++/23.1 - no need if nvc++/23.7 36 | # target_link_directories(${exec_name} PRIVATE 37 | # /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64) 38 | 39 | target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec hpcpp-core) 40 | 41 | set_target_properties( 42 | ${exec_name} 43 | PROPERTIES CXX_STANDARD ${CXX_STANDARD} 44 | CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS} 45 | INSTALL_RPATH_USE_LINK_PATH ON) 46 | 47 | # installation 48 | install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR}) 49 | endforeach() 50 | -------------------------------------------------------------------------------- /apps/fft/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(fft LANGUAGES CXX) 2 | 3 | file(GLOB CPP_SOURCES "*.cpp") 4 | 5 | # add -cudalib=cublas if -stdpar=gpu 6 | if(STDPAR STREQUAL "gpu") 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -cudalib=cublas") 8 | endif() 9 | 10 | foreach(source_file ${CPP_SOURCES}) 11 | if(NOT STDPAR STREQUAL "gpu") 12 | 
if("${source_file}" MATCHES ".*stdpar.*gpu.*" OR "${source_file}" MATCHES 13 | ".*gpu.*stdpar.*") 14 | message(STATUS "Skipping ${source_file} as stdpar=${STDPAR}") 15 | continue() 16 | endif() 17 | endif() 18 | 19 | # get the file name without an extension 20 | get_filename_component(exec_name ${source_file} NAME_WE) 21 | 22 | # add an executable with the same name as the source file 23 | add_executable(${exec_name} ${_EXCLUDE} ${source_file}) 24 | 25 | # add dependency on argparse 26 | add_dependencies(${exec_name} argparse) 27 | 28 | set_source_files_properties(${source_file} PROPERTIES LANGUAGE CXX 29 | LINKER_LANGUAGE CXX) 30 | target_include_directories( 31 | ${exec_name} 32 | PRIVATE ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_LIST_DIR}/../../include 33 | ${ARGPARSE_INCLUDE_DIR} ${MDSPAN_INCLUDE_DIR}) 34 | 35 | # uncomment only if using nvc++/23.1 - no need if nvc++/23.7 36 | # target_link_directories(${exec_name} PRIVATE 37 | # /opt/nvidia/hpc_sdk/Linux_x86_64/23.1/math_libs/lib64) 38 | 39 | target_link_libraries(${exec_name} PUBLIC ${MPI_LIBS} stdexec blas hpcpp-core) 40 | 41 | set_target_properties( 42 | ${exec_name} 43 | PROPERTIES CXX_STANDARD ${CXX_STANDARD} 44 | CXX_EXTENSIONS ${CMAKE_GNU_EXTENSIONS} 45 | INSTALL_RPATH_USE_LINK_PATH ON) 46 | 47 | # installation 48 | install(TARGETS ${exec_name} DESTINATION ${CMAKE_INSTALL_BINDIR}) 49 | endforeach() 50 | -------------------------------------------------------------------------------- /apps/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------------------# 2 | # Add comm-study 3 | # ----------------------------------------------------------------------------------------# 4 | 5 | message(STATUS "Adding comm-study...") 6 | add_subdirectory(comm-study) 7 | 8 | # ----------------------------------------------------------------------------------------# 9 | # Add heat equation demo 10 | # 
----------------------------------------------------------------------------------------# 11 | 12 | message(STATUS "Adding heat-equation...") 13 | add_subdirectory(heat-equation) 14 | 15 | # ----------------------------------------------------------------------------------------# 16 | # Add MDSPAN + hpcpp demo 17 | # ----------------------------------------------------------------------------------------# 18 | 19 | message(STATUS "Adding mdspan-stdpar...") 20 | add_subdirectory(mdspan-stdpar) 21 | 22 | message(STATUS "Adding 1d-stencil...") 23 | add_subdirectory(1d-stencil) 24 | 25 | # ----------------------------------------------------------------------------------------# 26 | # Add choleskey demo 27 | # ----------------------------------------------------------------------------------------# 28 | message(STATUS "Adding choleskey example...") 29 | add_subdirectory(choleskey) 30 | 31 | # ----------------------------------------------------------------------------------------# 32 | # Add fft demo 33 | # ----------------------------------------------------------------------------------------# 34 | message(STATUS "Adding fft...") 35 | add_subdirectory(fft) 36 | 37 | # ----------------------------------------------------------------------------------------# 38 | # Add block segmented prefixSum 39 | # ----------------------------------------------------------------------------------------# 40 | message(STATUS "Adding prefixSum...") 41 | add_subdirectory(prefixSum) 42 | -------------------------------------------------------------------------------- /scripts/benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | #SBATCH -A nstaff 4 | 5 | #SBATCH -C gpu 6 | #SBATCH --qos=regular 7 | #SBATCH -G 4 8 | #SBATCH -t 6:00:00 9 | #SBATCH --exclusive 10 | #SBATCH -N 1 11 | #SBATCH --ntasks-per-node=1 12 | 13 | #SBATCH -o nvstdpar_stencil_final_benchmark.out 14 | #SBATCH -e nvstdpar_stencil_final_benchmark.err 15 
| 16 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles 17 | ml unload cudatoolkit 18 | ml gcc/12.2 cmake/3.24 nvhpc-hpcx/23.7 19 | 20 | oneDimention_size=1000000000 21 | oneDimention_iterations=4000 22 | twoDimention_size=46000 23 | twoDimention_iterations=1000 24 | 25 | cpu_build=build_multicore_benchmark 26 | 27 | # mkdir -p ${HOME}/src/nvstdpar/$cpu_build 28 | # cd ${HOME}/src/nvstdpar/$cpu_build 29 | # rm -rf ./* 30 | # cmake .. -DCMAKE_BUILD_TYPE=Release -DSTDPAR=multicore -DOMP=multicore 31 | # make -j 32 | 33 | gpu_build=build_gpu_benchmark 34 | 35 | # mkdir -p ${HOME}/src/nvstdpar/$gpu_build 36 | # cd ${HOME}/src/nvstdpar/$gpu_build 37 | # rm -rf ./* 38 | # cmake .. -DCMAKE_BUILD_TYPE=Release -DSTDPAR=gpu -DOMP=gpu 39 | # make -j 40 | 41 | cd ${HOME}/src/nvstdpar/$cpu_build/apps/1d-stencil 42 | echo "1D_serial" 43 | time ./1d-serial --size $oneDimention_size --nt $oneDimention_iterations 44 | echo "1D_stdpar cpu" 45 | time ./1d-stdpar --size $oneDimention_size --nt $oneDimention_iterations 46 | 47 | cd ${HOME}/src/nvstdpar/$gpu_build/apps/1d-stencil 48 | echo "1D_stdpar gpu" 49 | time ./1d-stdpar --size $oneDimention_size --nt $oneDimention_iterations 50 | echo "1D_stdexec gpu" 51 | time ./1d-stdexec --sch gpu --size $oneDimention_size --nt $oneDimention_iterations 52 | echo "1D_stdexec multigpu" 53 | time ./1d-stdexec --sch multigpu --size $oneDimention_size --nt $oneDimention_iterations 54 | echo "1D_cuda" 55 | time ./1d-cuda --size $oneDimention_size --nt $oneDimention_iterations 56 | 57 | cd ${HOME}/src/nvstdpar/$cpu_build/apps/heat-equation 58 | echo "2D_serial" 59 | time ./heat-equation-serial -n=$twoDimention_size -s=$twoDimention_iterations --time 60 | echo "2D_stdpar cpu" 61 | time ./heat-equation-stdpar -n=$twoDimention_size -s=$twoDimention_iterations --time 62 | 63 | cd ${HOME}/src/nvstdpar/$gpu_build/apps/heat-equation 64 | echo "2D_stdpar gpu" 65 | time ./heat-equation-stdpar -n=$twoDimention_size
-s=$twoDimention_iterations --time 66 | echo "2D_stdexec gpu" 67 | time ./heat-equation-stdexec --sch gpu -n=$twoDimention_size -s=$twoDimention_iterations --time 68 | echo "2D_stdexec multigpu" 69 | time ./heat-equation-stdexec --sch multigpu -n=$twoDimention_size -s=$twoDimention_iterations --time 70 | echo "2D_cuda" 71 | time ./heat-equation-cuda -n=$twoDimention_size -s=$twoDimention_iterations --time 72 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | # Google C/C++ Code Style settings 2 | # https://clang.llvm.org/docs/ClangFormatStyleOptions.html 3 | # Author: Kehan Xue, kehan.xue (at) gmail.com 4 | 5 | Language: Cpp 6 | BasedOnStyle: Google 7 | AccessModifierOffset: -1 8 | AlignAfterOpenBracket: Align 9 | AlignConsecutiveAssignments: None 10 | AlignOperands: Align 11 | AllowAllArgumentsOnNextLine: true 12 | AllowAllConstructorInitializersOnNextLine: true 13 | AllowAllParametersOfDeclarationOnNextLine: false 14 | AllowShortBlocksOnASingleLine: Empty 15 | AllowShortCaseLabelsOnASingleLine: false 16 | AllowShortFunctionsOnASingleLine: Inline 17 | AllowShortIfStatementsOnASingleLine: Never # To avoid conflict, set this "Never" and each "if statement" should include brace when coding 18 | AllowShortLambdasOnASingleLine: Inline 19 | AllowShortLoopsOnASingleLine: false 20 | AlwaysBreakAfterReturnType: None 21 | AlwaysBreakTemplateDeclarations: Yes 22 | BinPackArguments: true 23 | BreakBeforeBraces: Custom 24 | BraceWrapping: 25 | AfterCaseLabel: false 26 | AfterClass: false 27 | AfterStruct: false 28 | AfterControlStatement: Never 29 | AfterEnum: false 30 | AfterFunction: false 31 | AfterNamespace: false 32 | AfterUnion: false 33 | AfterExternBlock: false 34 | BeforeCatch: false 35 | BeforeElse: false 36 | BeforeLambdaBody: false 37 | IndentBraces: false 38 | SplitEmptyFunction: false 39 | SplitEmptyRecord: false 40 | 
SplitEmptyNamespace: false 41 | BreakBeforeBinaryOperators: None 42 | BreakBeforeTernaryOperators: true 43 | BreakConstructorInitializers: BeforeColon 44 | BreakInheritanceList: BeforeColon 45 | ColumnLimit: 120 46 | CompactNamespaces: false 47 | ContinuationIndentWidth: 4 48 | Cpp11BracedListStyle: true 49 | DerivePointerAlignment: false # Make sure the * or & align on the left 50 | EmptyLineBeforeAccessModifier: LogicalBlock 51 | FixNamespaceComments: true 52 | IncludeBlocks: Preserve 53 | IndentCaseLabels: true 54 | IndentPPDirectives: None 55 | IndentWidth: 4 56 | KeepEmptyLinesAtTheStartOfBlocks: true 57 | MaxEmptyLinesToKeep: 1 58 | NamespaceIndentation: None 59 | ObjCSpaceAfterProperty: false 60 | ObjCSpaceBeforeProtocolList: true 61 | PointerAlignment: Left 62 | ReflowComments: false 63 | # SeparateDefinitionBlocks: Always # Only support since clang-format 14 64 | SpaceAfterCStyleCast: false 65 | SpaceAfterLogicalNot: false 66 | SpaceAfterTemplateKeyword: true 67 | SpaceBeforeAssignmentOperators: true 68 | SpaceBeforeCpp11BracedList: false 69 | SpaceBeforeCtorInitializerColon: true 70 | SpaceBeforeInheritanceColon: true 71 | SpaceBeforeParens: ControlStatements 72 | SpaceBeforeRangeBasedForLoopColon: true 73 | SpaceBeforeSquareBrackets: false 74 | SpaceInEmptyParentheses: false 75 | SpacesBeforeTrailingComments: 2 76 | SpacesInAngles: false 77 | SpacesInCStyleCastParentheses: false 78 | SpacesInContainerLiterals: false 79 | SpacesInParentheses: false 80 | SpacesInSquareBrackets: false 81 | Standard: c++17 82 | TabWidth: 4 83 | SeparateDefinitionBlocks: Always # Only support since clang-format 14 84 | UseTab: Never 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Compiled Object files 5 | *.slo 6 | *.sln 7 | *.lo 8 | *.o 9 | *.obj 10 | 11 | # Data, Results and param files 12 | *params* 13 | 
*.tsv 14 | 15 | # Docs 16 | *.jekyll-metadata 17 | 18 | #IDE files 19 | /.vscode 20 | /.vs 21 | /*.nja 22 | settings.json 23 | *.swp 24 | 25 | # Edit files 26 | *~ 27 | ._* 28 | 29 | # Python 30 | __pycache__ 31 | 32 | # misc 33 | *.autosave 34 | /coverage.info 35 | 36 | # macOS Finder files 37 | .DS_Store 38 | 39 | *.dbprep 40 | 41 | # images 42 | /*.png 43 | /*.tif 44 | /*.tiff 45 | /*.jpeg 46 | /*.jpg 47 | /*.gif 48 | /run*.sh 49 | 50 | # stashed source tree 51 | /.stash 52 | 53 | # Directories 54 | /parts* 55 | /output* 56 | /build* 57 | /install* 58 | 59 | # Precompiled Headers 60 | *.gch 61 | *.pch 62 | 63 | # Compiled Dynamic libraries 64 | *.so 65 | *.dylib 66 | *.dll 67 | 68 | # Fortran module files 69 | *.mod 70 | *.smod 71 | 72 | # Compiled Static libraries 73 | *.lai 74 | *.la 75 | *.a 76 | *.lib 77 | 78 | # Executables 79 | *.exe 80 | *.out 81 | *.bin 82 | *.app 83 | *.map 84 | *.pyc 85 | 86 | # slurm logs 87 | *.e[0-9]* 88 | *.o[0-9]* 89 | 90 | # nsys and ncu 91 | *.nsys-rep 92 | *.nsys.sqlite 93 | *.ncu-rep 94 | 95 | # preprocessed data files 96 | *.pbin 97 | 98 | # Remote Sync for Atom 99 | .remote-sync.json 100 | 101 | # Byte-compiled / optimized / DLL files 102 | __pycache__/ 103 | *.py[cod] 104 | *$py.class 105 | 106 | # C extensions 107 | *.so 108 | 109 | # tmp files 110 | ~* 111 | *.~* 112 | 113 | # Distribution / packaging 114 | .Python 115 | env/ 116 | build*/ 117 | develop-eggs/ 118 | dist/ 119 | downloads/ 120 | eggs/ 121 | .eggs/ 122 | lib/ 123 | lib64/ 124 | parts/ 125 | sdist/ 126 | var/ 127 | wheels/ 128 | *.egg-info/ 129 | .installed.cfg 130 | *.egg 131 | 132 | # PyInstaller 133 | # Usually these files are written by a python script from a template 134 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
135 | *.manifest 136 | *.spec 137 | 138 | # Installer logs 139 | pip-log.txt 140 | pip-delete-this-directory.txt 141 | 142 | # Unit test / coverage reports 143 | htmlcov/ 144 | .tox/ 145 | .coverage 146 | .coverage.* 147 | .cache 148 | nosetests.xml 149 | coverage.xml 150 | *,cover 151 | .hypothesis/ 152 | 153 | # Translations 154 | *.mo 155 | *.pot 156 | 157 | # Django stuff: 158 | *.log 159 | local_settings.py 160 | 161 | # Flask stuff: 162 | instance/ 163 | .webassets-cache 164 | 165 | # Scrapy stuff: 166 | .scrapy 167 | 168 | # Sphinx documentation 169 | docs/_build/ 170 | 171 | # PyBuilder 172 | target/ 173 | 174 | # Jupyter Notebook 175 | .ipynb_checkpoints 176 | 177 | # pyenv 178 | .python-version 179 | 180 | # celery beat schedule file 181 | celerybeat-schedule 182 | 183 | # dotenv 184 | .env 185 | 186 | # virtualenv 187 | .venv/ 188 | venv/ 189 | ENV/ 190 | 191 | # Spyder project settings 192 | .spyderproject 193 | 194 | # Rope project settings 195 | .ropeproject 196 | 197 | # vscode 198 | .vscode/ 199 | .vs/ 200 | 201 | # cmake 202 | CMakeLists.txt.user* 203 | CMakeCache.txt* 204 | CMakeFiles* 205 | CMakeScripts* 206 | Testing* 207 | Makefile* 208 | cmake_install.cmake* 209 | install_manifest.txt* 210 | compile_commands.json* 211 | CTestTestfile.cmake* 212 | _deps* -------------------------------------------------------------------------------- /.gitlab-ci.yml: -------------------------------------------------------------------------------- 1 | variables: 2 | BASE: ${CI_PROJECT_DIR} 3 | threads: 32 4 | 5 | default: 6 | tags: 7 | - muller-login01 8 | interruptible: true 9 | before_script: 10 | - ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles 11 | - ml unload cudatoolkit 12 | - ml gcc/12.2 cmake/3.24 nvhpc/23.7 13 | 14 | workflow: 15 | rules: 16 | - if: $CI_PIPELINE_SOURCE == 'merge_request_event' 17 | variables: 18 | install_prefix: ${CI_PROJECT_DIR}/merge_request_install 19 | - if: $CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH 20 | variables: 21 | 
install_prefix: ${CI_PROJECT_DIR}/default_branch_install 22 | - when: always 23 | variables: 24 | install_prefix: ${CI_PROJECT_DIR}/any_branch_install 25 | 26 | .build_template: &build_template 27 | stage: build 28 | script: 29 | - cd ${BASE} 30 | - git clone --recursive https://github.com/NERSC/hpcpp.git hpcpp 31 | - cd hpcpp 32 | - mkdir -p build-${BUILD_TYPE}-${STDPAR_TYPE} && cd build-${BUILD_TYPE}-${STDPAR_TYPE} 33 | - cmake -DCMAKE_CXX_COMPILER=$(which nvc++) -DCMAKE_C_COMPILER=$(which nvc) -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DSTDPAR=${STDPAR_TYPE} -DOMP=${STDPAR_TYPE} .. 34 | - make -j${threads} 35 | artifacts: 36 | paths: 37 | - hpcpp/build-${BUILD_TYPE}-${STDPAR_TYPE}/ 38 | 39 | build-debug-gpu: 40 | <<: *build_template 41 | variables: 42 | BUILD_TYPE: Debug 43 | STDPAR_TYPE: gpu 44 | 45 | build-debug-multicore: 46 | <<: *build_template 47 | variables: 48 | BUILD_TYPE: Debug 49 | STDPAR_TYPE: multicore 50 | 51 | build-release-gpu: 52 | <<: *build_template 53 | variables: 54 | BUILD_TYPE: Release 55 | STDPAR_TYPE: gpu 56 | 57 | build-release-multicore: 58 | <<: *build_template 59 | variables: 60 | BUILD_TYPE: Release 61 | STDPAR_TYPE: multicore 62 | 63 | .test_template: &test_template 64 | stage: test 65 | script: 66 | - cd ${BASE}/hpcpp/build-${BUILD_TYPE}-${STDPAR_TYPE}/apps/1d-stencil 67 | - | 68 | if [ "${STDPAR_TYPE}" = "gpu" ]; then 69 | ./1d-stdexec --sch gpu --size 10 --nt 10 70 | ./1d-stdpar --size 10 --nt 10 71 | ./1d-stdexec --sch multigpu --size 10 --nt 10 72 | ./1d-cuda --size 10 --nt 10 73 | fi 74 | - | 75 | if [ "${STDPAR_TYPE}" = "multicore" ]; then 76 | ./1d-serial --size 10 --nt 10 77 | ./1d-omp --size 10 --nt 10 78 | ./1d-stdpar --size 10 --nt 10 79 | ./1d-stdexec --sch cpu --size 10 --nt 10 80 | fi 81 | 82 | test-debug-gpu: 83 | <<: *test_template 84 | variables: 85 | BUILD_TYPE: Debug 86 | STDPAR_TYPE: gpu 87 | dependencies: 88 | - build-debug-gpu 89 | 90 | test-debug-multicore: 91 | <<: *test_template 92 | variables: 93 | BUILD_TYPE: 
Debug 94 | STDPAR_TYPE: multicore 95 | dependencies: 96 | - build-debug-multicore 97 | 98 | test-release-gpu: 99 | <<: *test_template 100 | variables: 101 | BUILD_TYPE: Release 102 | STDPAR_TYPE: gpu 103 | dependencies: 104 | - build-release-gpu 105 | 106 | test-release-multicore: 107 | <<: *test_template 108 | variables: 109 | BUILD_TYPE: Release 110 | STDPAR_TYPE: multicore 111 | dependencies: 112 | - build-release-multicore 113 | -------------------------------------------------------------------------------- /scripts/fft.ncu.nsys.gpu.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -le 2 | 3 | # 4 | # Reminder: Revert any changes to nvstdpar/CMakeLists.txt and 5 | # nvstdpar/apps/heat-equation/CMakeLists.txt that you did 6 | # for GCC compiler script before running this. 7 | # 8 | 9 | #SBATCH -A nstaff_g 10 | #SBATCH -C gpu 11 | #SBATCH --qos=regular 12 | #SBATCH --time=3:00:00 13 | #SBATCH --nodes=1 14 | #SBATCH --gpus=4 15 | #SBATCH --ntasks-per-node=4 16 | #SBATCH --exclusive 17 | #SBATCH --gpu-bind=none 18 | #SBATCH -o ncu-nsys-fft-gpu.o%j 19 | #SBATCH -e ncu-nsys-fft-gpu.e%j 20 | #SBATCH -J FFT-GPU-PERF 21 | 22 | set +x 23 | 24 | # config setting 25 | BUILD_HOME=${HOME}/repos/nvstdpar/build-fft-gpu-nsight 26 | 27 | # build stuff 28 | mkdir -p ${BUILD_HOME} 29 | cd ${BUILD_HOME} 30 | rm -rf ./* 31 | 32 | ml unload cudatoolkit 33 | ml use /global/cfs/cdirs/m1759/wwei/nvhpc_23_7/modulefiles 34 | ml nvhpc/23.7 35 | 36 | cmake .. 
-DSTDPAR=gpu -DOMP=gpu -DCMAKE_CXX_COMPILER=$(which nvc++) 37 | make -j fft-stdexec fft-stdpar 38 | 39 | # always run NCU and Nsys from $SCRATCH to avoid errors on Perlmutter 40 | mkdir -p ${SCRATCH}/fft-gpu-nsight 41 | cd ${SCRATCH}/fft-gpu-nsight 42 | rm -rf ./* 43 | 44 | 45 | # pause dcgmi 46 | srun --ntasks-per-node 1 dcgmi profile --pause 47 | 48 | # Problem size (increasing this beyond 4024000 may take a long time for multigpu runs) 49 | SIZE=4024000 50 | 51 | # Run Nsys 52 | 53 | # stdexec-single-gpu 54 | srun nsys profile --force-overwrite true -o fft-gpu-stdexec.nsys --stats=true ${BUILD_HOME}/apps/fft/fft-stdexec --sch=gpu -N ${SIZE} |& tee nsys-fft-stdexec-gpu.log 55 | 56 | # stdpar-gpu (not sure if more than one) 57 | srun nsys profile --force-overwrite true -o fft-gpu-stdpar.nsys --stats=true ${BUILD_HOME}/apps/fft/fft-stdpar -N ${SIZE} |& tee nsys-fft-stdpar-gpu.log 58 | 59 | # stdexec-multigpu 60 | srun nsys profile --force-overwrite true -o fft-multigpu-stdexec.nsys --stats=true ${BUILD_HOME}/apps/fft/fft-stdexec --sch=multigpu -N ${SIZE} |& tee nsys-fft-multigpu-stdexec.log 61 | 62 | 63 | # Run NCU (set full) 64 | 65 | # stdexec-single-gpu (full) 66 | srun ncu -f -o fft-gpu-stdexec.ncu --target-processes all --print-summary per-gpu --replay-mode application --set full ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=gpu |& tee ncu-fft-stdexec-gpu.log 67 | 68 | # stdpar-gpu (full) 69 | srun ncu -f -o fft-gpu-stdpar.ncu --target-processes all --print-summary per-gpu --replay-mode application --set full ${BUILD_HOME}/apps/fft/fft-stdpar -N ${SIZE} |& tee ncu-fft-stdpar-gpu.log 70 | 71 | # stdexec-multigpu (full); report stem uses .ncu like the other NCU reports 72 | srun ncu -f -o fft-multigpu-stdexec.ncu --target-processes all --print-summary per-gpu --replay-mode application --set full ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=multigpu |& tee ncu-fft-multigpu-stdexec.log 73 | 74 | 75 | # Run NCU (set roofline only) 76 | 77 | # stdexec-single-gpu (roofline); launched through srun like every other profiler run 78 | srun ncu -f -o
fft-gpu-stdexec-roofline.ncu --target-processes all --print-summary per-gpu --replay-mode application --set roofline ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=gpu |& tee ncu-fft-stdexec-gpu-roofline.log 79 | 80 | # stdpar-gpu (roofline); collects the roofline section set, not the full set 81 | srun ncu -f -o fft-gpu-stdpar-roofline.ncu --target-processes all --print-summary per-gpu --replay-mode application --set roofline ${BUILD_HOME}/apps/fft/fft-stdpar -N ${SIZE} |& tee ncu-fft-stdpar-gpu-roofline.log 82 | 83 | # stdexec-multigpu (roofline); report stem uses .ncu like the other NCU reports 84 | srun ncu -f -o fft-multigpu-stdexec-roofline.ncu --target-processes all --print-summary per-gpu --replay-mode application --set roofline ${BUILD_HOME}/apps/fft/fft-stdexec -N ${SIZE} --sch=multigpu |& tee ncu-fft-multigpu-stdexec-roofline.log 85 | 86 | # resume dcgmi 87 | srun --ntasks-per-node 1 dcgmi profile --resume 88 | -------------------------------------------------------------------------------- /apps/mdspan-stdpar/mdspan-stdpar.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy). All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software.
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 25 | */ 26 | 27 | #include "commons.hpp" 28 | 29 | using data_type = int; 30 | // 2D view 31 | using extents_type = std::extents; 32 | // 3D view (fix the first dimension to 2) 33 | using extents_type2 = std::extents; 34 | 35 | int main() { 36 | constexpr int N = 1e9; 37 | std::vector v(N); 38 | 39 | // View the data as a 2D array of N/2 rows x 2 columns (the extents passed below) 40 | auto ms2 = std::mdspan(v.data(), N / 2, 2); 41 | // View the same data as a 3D array: 2 (fixed above) x N/4 x 2 42 | auto ms3 = std::mdspan(v.data(), N / 4, 2); 43 | 44 | // auto dim2 = [=](int i){int i1 = i/ms2.extent(1); int i2 = i%ms2.extent(1); 45 | // return std::make_tuple(i1, i2);}; auto dim3 = [=](int i){int i1 = 46 | // i/(ms3.extent(1)*ms3.extent(2)); int i2 = (i/ms3.extent(2))%ms3.extent(1); 47 | // int i3 = i%ms3.extent(2); return std::make_tuple(i1, i2, i3);}; 48 | 49 | std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { 50 | auto global_idx = std::distance(ms2.data_handle(), &i); 51 | dim2(global_idx, ms2); 52 | // auto [i1, i2] = dim2(global_idx); 53 | ms2(ii, ij) = global_idx; 54 | }); 55 | 56 | fmt::print("\n"); 57 | 58 | std::for_each(std::execution::par_unseq, ms2.data_handle(), ms2.data_handle() + ms2.size(), [=](int& i) { 59 | auto global_idx = std::distance(ms2.data_handle(), &i); 60 | dim3(global_idx, ms3); 61 | // auto [i1, i2, i3] = dim3(global_idx); 62 | ms3(ii, ij, ik) = 1000 + global_idx; 63 | }); 64 | 65 | //
read subset of data using 3D view 66 | for (size_t i = 0; i < ms3.extent(0); i++) { 67 | for (size_t j = 0; j < 10; j++) { 68 | for (size_t k = 0; k < ms3.extent(2); k++) { 69 | assert(ms3(i, j, k) == 1000 + i * ms3.extent(1) * ms3.extent(2) + j * ms3.extent(2) + k); 70 | fmt::print("{} ", ms3(i, j, k)); 71 | } 72 | fmt::print("\n"); 73 | } 74 | fmt::print("\n"); 75 | } 76 | 77 | fmt::print("{}\n", ms3(0, 0, 1)); 78 | } -------------------------------------------------------------------------------- /apps/prefixSum/prefixSum.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 25 | */ 26 | 27 | /* 28 | * commons for the prefixSum codes 29 | */ 30 | 31 | #pragma once 32 | 33 | #include 34 | #include 35 | 36 | #if defined(USE_GPU) 37 | #include 38 | #include 39 | using namespace nvexec; 40 | #endif //USE_GPU 41 | 42 | #include "argparse/argparse.hpp" 43 | 44 | #include "commons.hpp" 45 | 46 | using namespace std; 47 | using namespace stdexec; 48 | namespace ex = stdexec; 49 | 50 | // data type 51 | using data_t = unsigned long long; 52 | 53 | // input arguments 54 | struct prefixSum_params_t : public argparse::Args { 55 | int& N = kwarg("N", "array size").set_default(1e9); 56 | bool& print_arr = flag("p,print", "print array and prefixSum"); 57 | int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); 58 | 59 | #if defined(PSUM_STDEXEC) 60 | std::string& sch = kwarg("sch", 61 | "stdexec scheduler: [options: cpu" 62 | #if defined(USE_GPU) 63 | ", gpu, multigpu" 64 | #endif //USE_GPU 65 | "]") 66 | .set_default("cpu"); 67 | #endif // PSUM_STDEXEC 68 | 69 | bool& validate = flag("validate", "validate the results"); 70 | bool& help = flag("h, help", "print help"); 71 | bool& print_time = flag("t,time", "print prefixSum time"); 72 | bool& debug = flag("d,debug", "print internal timers and configs (if any)"); 73 | }; 74 | 75 | namespace psum { 76 | template 77 | [[nodiscard]] bool validatePrefixSum(T* in, data_t* out, size_t N) { 78 | fmt::print("Validating: \n"); 79 | 80 | // compute the reference inclusive_scan with the parallel STL policy 81 | std::vector test(N); 82 | std::inclusive_scan(std::execution::par, in, in + N, test.begin(), std::plus<>()); 83 | 84 | // check that out matches the reference element by element 85 | return std::equal(std::execution::par, out, out + N, test.begin());
86 | } 87 | 88 | template 89 | void genRandomVector(T* in, int N, T lower, T upper) { 90 | // random number generator 91 | std::random_device rd; 92 | std::mt19937 gen(rd()); 93 | std::uniform_int_distribution dist(lower, upper); 94 | 95 | // fill in[0..N) sequentially with uniform random integers in [lower, upper] 96 | std::generate(std::execution::seq, in, in + N, [&gen, &dist]() { return dist(gen); }); 97 | } 98 | } // namespace psum 99 | -------------------------------------------------------------------------------- /apps/prefixSum/prefixSum-stdpar.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 25 | */ 26 | 27 | /* 28 | * commons for the prefixSum codes 29 | */ 30 | 31 | #include "prefixSum.hpp" 32 | 33 | // 34 | // stdpar prefixSum function: inclusive scan with the std::execution::par policy; caller owns (delete[]) the returned array 35 | // 36 | template 37 | [[nodiscard]] T* prefixSum_stdpar(const T* in, const int N) { 38 | T* y = new T[N]; 39 | std::inclusive_scan(std::execution::par, in, in + N, y, std::plus<>()); 40 | return y; 41 | } 42 | 43 | // 44 | // simulation 45 | // 46 | int main(int argc, char* argv[]) { 47 | // parse params 48 | const prefixSum_params_t args = argparse::parse(argc, argv); 49 | 50 | // see if help wanted 51 | if (args.help) { 52 | args.print(); // prints all variables 53 | return 0; 54 | } 55 | 56 | // simulation variables 57 | int N = args.N; 58 | bool print_arr = args.print_arr; 59 | bool print_time = args.print_time; 60 | bool validate = args.validate; 61 | 62 | if (!isPowOf2(N)) { 63 | N = ceilPowOf2(N); 64 | fmt::print("INFO: N != pow(2).
Setting => N = {}\n", N); 65 | } 66 | 67 | // input data 68 | data_t* in = new data_t[N]; 69 | 70 | fmt::print("Progress:0%"); 71 | 72 | // random number generator 73 | psum::genRandomVector(in, N, (data_t)0, (data_t)10); 74 | 75 | fmt::print("..50%"); 76 | 77 | // output pointer 78 | data_t* out = nullptr; 79 | 80 | // start the timer 81 | Timer timer; 82 | 83 | // stdpar prefixSum 84 | out = prefixSum_stdpar(in, N); 85 | 86 | // stop timer 87 | auto elapsed = timer.stop(); 88 | 89 | fmt::print("..100%\n"); 90 | 91 | // print the input and its prefix sum (only when N < 100) 92 | if (print_arr && N < 100) { 93 | fmt::print("in = {}\n", fmt::join(in, in + N, " ")); 94 | fmt::print("out = {}\n", fmt::join(out, out + N, " ")); 95 | } 96 | 97 | // print the elapsed time 98 | if (print_time) 99 | fmt::print("Elapsed Time: {:f} s\n", elapsed); 100 | 101 | // validate the prefixSum 102 | if (validate) { 103 | bool verify = psum::validatePrefixSum(in, out, N); 104 | 105 | if (verify) 106 | fmt::print("SUCCESS.."); 107 | else 108 | fmt::print("FAILED.."); 109 | 110 | fmt::print("\n"); 111 | } 112 | 113 | // delete in and out 114 | delete[] in; 115 | delete[] out; 116 | 117 | // return status 118 | return 0; 119 | } 120 | -------------------------------------------------------------------------------- /apps/prefixSum/prefixSum-serial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved.
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * commons for the prefixSum codes 29 | */ 30 | 31 | #include "prefixSum.hpp" 32 | 33 | // 34 | // serial prefixSum function 35 | // 36 | template 37 | [[nodiscard]] T* prefixSum_serial(const T* in, const int N) { 38 | T* y = new T[N]; 39 | std::inclusive_scan(std::execution::seq, in, in + N, y, std::plus<>()); 40 | return y; 41 | } 42 | 43 | // 44 | // simulation 45 | // 46 | int main(int argc, char* argv[]) { 47 | // parse params 48 | const prefixSum_params_t args = argparse::parse(argc, argv); 49 | 50 | // see if help wanted 51 | if (args.help) { 52 | args.print(); // prints all variables 53 | return 0; 54 | } 55 | 56 | // simulation variables 57 | int N = args.N; 58 | bool print_arr = args.print_arr; 59 | bool print_time = args.print_time; 60 | bool validate = args.validate; 61 | 62 | if (!isPowOf2(N)) { 63 | N = ceilPowOf2(N); 64 | fmt::print("INFO: N != pow(2). Setting => N = {}\n", N); 65 | } 66 | 67 | // input data 68 | data_t* in = new data_t[N]; 69 | 70 | fmt::print("Progress:0%"); 71 | 72 | // random number generator 73 | psum::genRandomVector(in, N, (data_t)0, (data_t)10); 74 | 75 | fmt::print("..50%"); 76 | 77 | // output pointer 78 | data_t* out = nullptr; 79 | 80 | // start the timer 81 | Timer timer; 82 | 83 | // serial prefixSum 84 | out = prefixSum_serial(in, N); 85 | 86 | // stop timer 87 | auto elapsed = timer.stop(); 88 | 89 | fmt::print("..100%\n"); 90 | 91 | // print the input and its prefix sum (don't if N > 100) 92 | if (print_arr && N < 100) { 93 | fmt::print("int = {}\n", fmt::join(in, in + N, " ")); 94 | fmt::print("out = {}\n", fmt::join(out, out + N, " ")); 95 | } 96 | 97 | // print the elapsed time 98 | if (print_time) 99 | fmt::print("Elapsed Time: {:f} s\n", elapsed); 100 | 101 | // validate the prefixSum 102 | if (validate) { 103 | bool verify = psum::validatePrefixSum(in, out, N); 104 | 105 | if (verify) { 106 | fmt::print("SUCCESS.."); 107 | } else { 108 | fmt::print("FAILED.."); 109 | } 110 | 111 | 
fmt::print("\n"); 112 | } 113 | 114 | // delete in and out 115 | delete[] in; 116 | delete[] out; 117 | 118 | // return status 119 | return 0; 120 | } 121 | -------------------------------------------------------------------------------- /include/counting_iterator.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy). All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | // 28 | // counting_iterator taken from 29 | // https://github.com/LLNL/LULESH/blob/2.0.2-dev/stdpar/src/lulesh.h#L687 30 | // 31 | 32 | #pragma once 33 | 34 | #include "commons.hpp" 35 | 36 | using Index_t = int32_t; 37 | 38 | struct counting_iterator { 39 | private: 40 | using self = counting_iterator; 41 | 42 | public: 43 | using value_type = Index_t; 44 | using difference_type = typename std::make_signed::type; 45 | using pointer = Index_t*; 46 | using reference = Index_t&; 47 | using iterator_category = std::random_access_iterator_tag; 48 | 49 | counting_iterator() : value(0) {} 50 | 51 | explicit counting_iterator(value_type v) : value(v) {} 52 | 53 | value_type operator*() const { return value; } 54 | 55 | value_type operator[](difference_type n) const { return value + n; } 56 | 57 | self& operator++() { 58 | ++value; 59 | return *this; 60 | } 61 | 62 | self operator++(int) { 63 | self result{value}; 64 | ++value; 65 | return result; 66 | } 67 | 68 | self& operator--() { 69 | --value; 70 | return *this; 71 | } 72 | 73 | self operator--(int) { 74 | self result{value}; 75 | --value; 76 | return result; 77 | } 78 | 79 | self& operator+=(difference_type n) { 80 | value += n; 81 | return *this; 82 | } 83 | 84 | self& operator-=(difference_type n) { 85 | value -= n; 86 | return *this; 87 | } 88 | 89 | friend self operator+(self const& i, difference_type n) { return self(i.value + n); } 90 | 91 | friend self operator+(difference_type n, self const& i) { return self(i.value + n); } 92 | 93 | friend difference_type operator-(self const& x, self const& y) { return x.value - y.value; } 94 | 95 | friend self operator-(self const& i, difference_type n) { return self(i.value - n); } 96 | 97 | friend bool operator==(self const& x, self const& y) { return x.value == y.value; } 98 | 99 | friend bool operator!=(self const& x, self const& y) { return x.value != y.value; } 100 | 101 | friend bool operator<(self const& x, self const& y) { return x.value < 
y.value; } 102 | 103 | friend bool operator<=(self const& x, self const& y) { return x.value <= y.value; } 104 | 105 | friend bool operator>(self const& x, self const& y) { return x.value > y.value; } 106 | 107 | friend bool operator>=(self const& x, self const& y) { return x.value >= y.value; } 108 | 109 | private: 110 | value_type value; 111 | }; -------------------------------------------------------------------------------- /apps/1d-stencil/1d-cuda.cpp: -------------------------------------------------------------------------------- 1 | #include "argparse/argparse.hpp" 2 | #include "commons.hpp" 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | // parameters 9 | struct args_params_t : public argparse::Args { 10 | bool& results = kwarg("results", "print generated results (default: false)").set_default(false); 11 | std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); 12 | std::uint64_t& size = kwarg("size", "Number of elements").set_default(10); 13 | bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); 14 | double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); 15 | double& dx = kwarg("dx", "Local x dimension").set_default(1.0); 16 | bool& help = flag("h, help", "print help"); 17 | bool& time = kwarg("t, time", "print time").set_default(true); 18 | }; 19 | 20 | using Real_t = double; 21 | 22 | using view_1d = std::extents; 23 | typedef std::mdspan space; 24 | 25 | /////////////////////////////////////////////////////////////////////////////// 26 | // Command-line variables 27 | constexpr Real_t k = 0.5; // heat transfer coefficient 28 | constexpr Real_t dt = 1.; // time step 29 | constexpr Real_t dx = 1.; // grid spacing 30 | 31 | // Our operator 32 | __device__ Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k, const Real_t dt, 33 | const Real_t dx) { 34 | return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); 35 | } 36 | 37 | __global__ void 
heat_equation(Real_t* current, Real_t* next, std::size_t size, const Real_t k, const Real_t dt, 38 | const Real_t dx) { 39 | std::size_t i = blockIdx.x * blockDim.x + threadIdx.x; 40 | 41 | if (i < size) { 42 | std::size_t left = (i == 0) ? size - 1 : i - 1; 43 | std::size_t right = (i == size - 1) ? 0 : i + 1; 44 | next[i] = heat(current[left], current[i], current[right], k, dt, dx); 45 | } 46 | } 47 | 48 | int benchmark(args_params_t const& args) { 49 | std::uint64_t size = args.size; // Number of elements. 50 | std::uint64_t nt = args.nt; // Number of steps. 51 | 52 | Real_t* h_current = nullptr; 53 | Real_t* h_next = nullptr; 54 | 55 | // Measure execution time. 56 | Timer timer; 57 | 58 | // Memory allocation 59 | if (args.results) { 60 | h_current = new Real_t[size]; 61 | h_next = new Real_t[size]; 62 | } 63 | 64 | Real_t* d_current; 65 | Real_t* d_next; 66 | cudaMalloc(&d_current, size * sizeof(Real_t)); 67 | cudaMalloc(&d_next, size * sizeof(Real_t)); 68 | thrust::sequence(thrust::device, d_current, d_current + size, 0); 69 | thrust::sequence(thrust::device, d_next, d_next + size, 0); 70 | 71 | // CUDA kernel execution parameters 72 | const int threadsPerBlock = std::min(1024, (int)size); 73 | const int blocks = (size + threadsPerBlock - 1) / threadsPerBlock; 74 | 75 | // Actual time step loop 76 | for (std::size_t t = 0; t < nt; ++t) { 77 | heat_equation<<>>(d_current, d_next, size, k, dt, dx); 78 | std::swap(d_current, d_next); 79 | } 80 | cudaDeviceSynchronize(); 81 | auto time = timer.stop(); 82 | 83 | if (args.results) { 84 | // Copy result back to host 85 | cudaMemcpy(h_current, d_current, size * sizeof(Real_t), cudaMemcpyDeviceToHost); 86 | 87 | // Print results 88 | if (args.results) { 89 | auto h_current_mds = space(h_current, size); 90 | fmt::println("{::f}", h_current_mds); 91 | } 92 | // Cleanup 93 | delete[] h_current; 94 | delete[] h_next; 95 | } 96 | 97 | cudaFree(d_current); 98 | cudaFree(d_next); 99 | 100 | if (args.time) { 101 | 
fmt::print("Duration: {:f} ms\n", time); 102 | } 103 | 104 | return 0; 105 | } 106 | 107 | int main(int argc, char* argv[]) { 108 | // parse params 109 | args_params_t args = argparse::parse(argc, argv); 110 | // see if help wanted 111 | if (args.help) { 112 | args.print(); // prints all variables 113 | return 0; 114 | } 115 | 116 | benchmark(args); 117 | 118 | return 0; 119 | } 120 | -------------------------------------------------------------------------------- /apps/comm-study/comm-study-no-senders.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy). All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | #include "commons.hpp" 28 | // 29 | // Build and run on Perlmutter using 30 | // ml nvhpc/23.1 ; nvc++ -stdpar=gpu -std=c++20 -o hpcpp.out 31 | // ./hpcpp_nosenders.cpp && nsys profile --stats=true ./hpcpp.out 32 | // 33 | 34 | using T = double; 35 | using time_point_t = std::chrono::system_clock::time_point; 36 | 37 | // must take in the pointers/vectors by reference 38 | template 39 | auto work(P& A, P& B, P& Y, int N) { 40 | // init A and B separately - will it cause an H2D copy? 41 | std::for_each(std::execution::par_unseq, &A[0], &A[N], [&](T& ai) { ai = cos(M_PI / 4); }); 42 | 43 | T sum = 0.0; 44 | 45 | for (int i = 0; i < N / 3; i++) { 46 | // read only or read-write operations 47 | sum += A[i] / N; 48 | 49 | // this line if commented should not result in an H2D after this but it 50 | // does. 51 | // A[i] = sin(M_PI/4); 52 | } 53 | 54 | fmt::print("\n"); 55 | 56 | // will it cause an H2D here? 57 | std::for_each(std::execution::par_unseq, &B[0], &B[N], [&](T& bi) { bi = sin(M_PI / 6); }); 58 | 59 | // compute Y = sqrt((A+B)^2 + B^2)/(A+B+B) 60 | 61 | std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &A[N / 2], 62 | [&](T& ai, T& bi) { return ai + bi; }); 63 | std::transform(std::execution::par_unseq, &A[N / 2], &A[N], &B[0], &Y[0], 64 | [&](T& ai, T& bi) { return sqrt(pow(ai, 2) + pow(bi, 2)) / (ai + bi); }); 65 | 66 | // should trigger a D2H copy of N/5 elements 67 | for (int i = 0; i < N / 3; i++) 68 | sum += Y[i] / N; 69 | 70 | fmt::print("\n"); 71 | 72 | // get sum(Y) - one last memcpy (not USM) D2H 73 | sum += std::reduce(std::execution::par_unseq, &Y[0], &Y[N], 0.0, std::plus()); 74 | 75 | return sum / N; 76 | } 77 | 78 | int main(int argc, char* argv[]) { 79 | constexpr int N = 1e9; 80 | time_point_t mark = std::chrono::system_clock::now(); 81 | auto es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); 82 | T sum = 0; 83 | 84 | #if 1 // 0 if only want to run with pointers 85 | std::vector 
A(N); 86 | std::vector B(N); 87 | std::vector Y(N); 88 | 89 | mark = std::chrono::system_clock::now(); 90 | sum = work(A, B, Y, N); 91 | es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); 92 | fmt::print("Vectors: Elapsed Time: {:f}s\n", es); 93 | 94 | #endif 95 | 96 | #if 1 // 0 if only want to run with vectors 97 | 98 | // allocate memory - where is this allocated? 99 | T* a = new T[N]; 100 | T* b = new T[N]; 101 | T* y = new T[N]; 102 | 103 | sum = 0; 104 | mark = std::chrono::system_clock::now(); 105 | sum = work(a, b, y, N); 106 | es = std::chrono::duration(std::chrono::system_clock::now() - mark).count(); 107 | fmt::print("Pointers: Elapsed Time: {:f}s\n\n", es); 108 | #endif 109 | 110 | // do not use scientific notation 111 | fmt::print("sum: {}\n", sum); 112 | 113 | return 0; 114 | } -------------------------------------------------------------------------------- /apps/heat-equation/heat-equation.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 25 | */ 26 | 27 | /* 28 | * commons for the heat equation examples 29 | */ 30 | 31 | #pragma once 32 | 33 | #include 34 | #include 35 | 36 | #if defined(USE_GPU) 37 | #include 38 | #include 39 | using namespace nvexec; 40 | #endif //USE_GPU 41 | 42 | #include "argparse/argparse.hpp" 43 | #include "commons.hpp" 44 | 45 | using namespace std; 46 | using namespace stdexec; 47 | using stdexec::sync_wait; 48 | 49 | namespace ex = stdexec; 50 | 51 | // data type 52 | using Real_t = double; 53 | 54 | // number of dimensions 55 | constexpr int dims = 2; 56 | 57 | // total number of ghost cells = ghosts x dims 58 | constexpr int ghost_cells = 1; 59 | constexpr int nghosts = ghost_cells * dims; 60 | 61 | // 2D view 62 | using view_2d = std::extents; 63 | 64 | // 3D view 65 | using view_3d = std::extents; 66 |

// macro to get the x or y position from an index
// Arguments and the whole expansion are parenthesised so the macro keeps its
// meaning inside larger expressions and with compound arguments.
#define pos(i, ghosts, dx) (-0.5 + (dx) * ((i) - (ghosts)))

// parameters
struct heat_params_t : public argparse::Args {
    int& ncells = kwarg("n,ncells", "number of cells on each side of the domain").set_default(32);
    int& nsteps = kwarg("s,nsteps", "total steps in simulation").set_default(100);

#if defined(HEQ_OMP) || defined(HEQ_STDEXEC)
    int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency());
#endif  // HEQ_OMP || HEQ_STDEXEC

#if defined(HEQ_STDEXEC)
    std::string& sch = kwarg("sch",
                             "stdexec scheduler: [options: cpu"
#if defined(USE_GPU)
                             ", gpu, multigpu"
#endif  //USE_GPU
                             "]")
                           .set_default("cpu");
#endif  // HEQ_STDEXEC

    Real_t& alpha = kwarg("a,alpha", "thermal diffusivity").set_default(0.5f);
    Real_t& dt = kwarg("t,dt", "time step").set_default(5.0e-5f);
    bool& help = flag("h,help", "print help");
    bool& print_grid = flag("p,print", "print grids at step 0 and step n");
    bool& print_time = flag("time", "print simulation time");
};

// template printGrid
// Prints a len x len grid stored contiguously at `grid`.
template <typename T>
void printGrid(T* grid, int len) {
    auto view = std::mdspan(grid, len, len);
    fmt::print("Grid: \n");
    fmt::println("{::.2f}", view);
}

// fill boundary cells
// Copies the adjacent interior value into the outermost row/column on each of
// the four edges of a row-major len x len grid (element (r, c) is
// grid[r * len + c]). Corner cells are not written.
template <typename T>
void fill2Dboundaries(T* grid, int len, int ghost_cells = 1) {
    // Interior span per edge, derived from the parameter rather than the
    // file-level nghosts so a non-default ghost_cells stays consistent.
    const int interior = len - 2 * ghost_cells;

    std::for_each_n(std::execution::par_unseq, counting_iterator(ghost_cells), interior, [=](auto i) {
        // top edge: (0, i) <- (ghost_cells, i)
        grid[i] = grid[i + (ghost_cells * len)];
        // bottom edge: (len - ghost_cells, i) <- (len - ghost_cells - 1, i)
        grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))];

        // left edge: (i, 0) <- (i, ghost_cells). The source used to be
        // (ghost_cells, i), i.e. transposed, which only matches for grids
        // that are symmetric about the diagonal.
        grid[i * len] = grid[(i * len) + ghost_cells];
        // right edge: (i, len - ghost_cells) <- (i, len - ghost_cells - 1)
        grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)];
    });
}
-------------------------------------------------------------------------------- /apps/choleskey/choleskey_serial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 Chuanqiu He 5 | * Copyright (c) 2023 Weile Wei 6 | * Copyright (c) 2023 The Regents of the University of California, 7 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 8 | * required approvals from the U.S. Dept. of Energy).All rights reserved.
9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to deal 12 | * in the Software without restriction, including without limitation the rights 13 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | * copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * The above copyright notice and this permission notice shall be included in 18 | * all copies or substantial portions of the Software. 19 | * 20 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | * SOFTWARE. 27 | */ 28 | // 29 | // This example provides a serial(mdspan) implementation for choleskey decomposition code. 
30 | 31 | #include 32 | #include 33 | #include "argparse/argparse.hpp" 34 | #include "commons.hpp" 35 | #include "matrixutil.hpp" 36 | 37 | using namespace std; 38 | 39 | struct solver { 40 | 41 | using view_2d = std::extents; 42 | 43 | typedef std::mdspan matrix_ms_t; 44 | 45 | template 46 | matrix_ms_t Cholesky_Decomposition(std::vector& vec, int n) { 47 | std::vector lower(n * n, 0); 48 | 49 | auto matrix_ms = std::mdspan(vec.data(), n, n); 50 | auto lower_ms = std::mdspan(lower.data(), n, n); 51 | 52 | // Decomposing a matrix into Lower Triangular 53 | for (int i = 0; i < matrix_ms.extent(0); i++) { 54 | for (int j = 0; j <= i; j++) { 55 | T sum = 0; 56 | 57 | if (j == i) { 58 | // summation for diagonals 59 | for (int k = 0; k < j; k++) 60 | sum += pow(lower_ms(j, k), 2); 61 | lower_ms(j, j) = sqrt(matrix_ms(i, j) - sum); 62 | } else { 63 | // Evaluating L(i, j) using L(j, j) 64 | for (int k = 0; k < j; k++) 65 | sum += (lower_ms(i, k) * lower_ms(j, k)); 66 | lower_ms(i, j) = (matrix_ms(i, j) - sum) / lower_ms(j, j); 67 | } 68 | } 69 | } 70 | return lower_ms; 71 | } 72 | }; 73 | 74 | /////////////////////////////////////////////////////////////////////////////// 75 | int benchmark(args_params_t const& args) { 76 | 77 | std::uint64_t nd = args.nd; // Number of matrix dimension. 78 | 79 | std::vector inputMatrix = generate_pascal_matrix(nd); 80 | 81 | // Create the solverobject 82 | solver solve; 83 | // Measure execution time. 
84 | Timer timer; 85 | // start decomposation 86 | auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); 87 | auto time = timer.stop(); 88 | 89 | // Print the final results 90 | if (args.results) { 91 | // Displaying Lower Triangular and its Transpose 92 | fmt::print("{:>6} {:>30}\n", "Lower Triangular", "Transpose"); 93 | for (int i = 0; i < nd; i++) { 94 | // Lower Triangular 95 | for (int j = 0; j < nd; j++) 96 | fmt::print("{:>6}\t", res_matrix(i, j)); 97 | fmt::print("\t"); 98 | 99 | // Transpose of Lower Triangular 100 | for (int j = 0; j < nd; j++) 101 | fmt::print("{:>6}\t", res_matrix(j, i)); 102 | fmt::print("\n"); 103 | } 104 | } 105 | 106 | if (args.time) { 107 | fmt::print("Duration: {:f} ms\n", time); 108 | } 109 | 110 | return 0; 111 | } 112 | 113 | // Driver Code for testing 114 | int main(int argc, char* argv[]) { 115 | 116 | // parse params 117 | args_params_t args = argparse::parse(argc, argv); 118 | // see if help wanted 119 | if (args.help) { 120 | args.print(); // prints all variables 121 | return 0; 122 | } 123 | 124 | benchmark(args); 125 | 126 | return 0; 127 | } 128 | -------------------------------------------------------------------------------- /apps/fft/fft-serial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * commons for the fft codes 29 | */ 30 | 31 | #include "fft.hpp" 32 | 33 | // 34 | // serial fft function 35 | // 36 | [[nodiscard]] std::vector fft_serial(const data_t* x, const int N, bool debug = false) { 37 | std::vector x_r(N); 38 | 39 | // bit shift 40 | int shift = 32 - ilog2(N); 41 | 42 | // twiddle data in x[n] 43 | for (int k = 0; k < N; k++) { 44 | auto new_idx = reverse_bits32(k) >> shift; 45 | x_r[k] = x[new_idx]; 46 | } 47 | 48 | // niterations 49 | int niters = ilog2(N); 50 | // local merge partition size 51 | int lN = 2; 52 | 53 | fmt::print("FFT progress: "); 54 | 55 | for (int k = 0; k < niters; k++, lN *= 2) { 56 | fmt::print("{:f}%..", (100.0 * k) / niters); 57 | 58 | static Timer dtimer; 59 | 60 | // number of partitions 61 | int nparts = N / lN; 62 | int tpp = lN / 2; 63 | 64 | if (debug) 65 | dtimer.start(); 66 | 67 | // merge 68 | for (int k = 0; k < N / 2; k++) { 69 | // compute indices 70 | int e = (k / tpp) * lN + (k % tpp); 71 | auto o = e + tpp; 72 | auto i = (k % tpp); 73 | auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts); 74 | x_r[o] = x_r[e] - x_r[o] * WNk(N, i * nparts); 75 | x_r[e] = tmp; 76 | } 77 | 78 | if (debug) { 79 | fmt::print("This iter time: {:f} ms\n", dtimer.stop()); 80 | } 81 | } 82 | 83 | fmt::print("100%\n"); 84 | return x_r; 85 | } 86 | 87 | // 88 | // simulation 89 | // 90 | int main(int argc, char* argv[]) { 91 | // parse params 92 | const fft_params_t args = argparse::parse(argc, argv); 93 | 94 | // see if help wanted 95 | if (args.help) { 96 | args.print(); // prints all variables 97 | return 0; 98 | } 99 | 100 | // simulation variables 101 | int N = args.N; 102 | sig_type_t sig_type = getSignal(args.sig); 103 | //int freq = args.freq; 104 | bool print_sig = args.print_sig; 105 | bool print_time = args.print_time; 106 | bool validate = args.validate; 107 | 108 | // x[n] signal 109 | sig_t x_n(N, sig_type); 110 | 111 | if (!isPowOf2(N)) { 112 | N = ceilPowOf2(N); 113 | fmt::print("INFO: N 
is not a power of 2. Padding zeros => N = {}\n", N); 114 | 115 | x_n.resize(N); 116 | } 117 | 118 | if (print_sig) { 119 | fmt::print("x[n] = "); 120 | x_n.printSignal(); 121 | } 122 | 123 | // niterations 124 | int niters = ilog2(N); 125 | 126 | // start the timer 127 | Timer timer; 128 | 129 | // fft radix-2 algorithm 130 | // y[n] = fft(x[n]); 131 | sig_t y_n(std::move(fft_serial(x_n.data(), N, args.debug))); 132 | 133 | // stop timer 134 | auto elapsed = timer.stop(); 135 | 136 | // print the fft(x) 137 | if (print_sig) { 138 | fmt::print("X(k) = "); 139 | y_n.printSignal(); 140 | } 141 | 142 | // print the computation time 143 | if (print_time) { 144 | fmt::print("Duration: {:f} ms\n", elapsed); 145 | } 146 | 147 | // validate the recursively computed fft 148 | if (validate) { 149 | if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler())) { 150 | fmt::print("SUCCESS: y[n] == fft(x[n])\n"); 151 | } else { 152 | fmt::print("FAILED: y[n] != fft(x[n])\n"); 153 | } 154 | } 155 | 156 | return 0; 157 | } 158 | -------------------------------------------------------------------------------- /apps/choleskey/choleskey_stdpar.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 Chuanqiu He 5 | * Copyright (c) 2023 Weile Wei 6 | * Copyright (c) 2023 The Regents of the University of California, 7 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 8 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to deal 12 | * in the Software without restriction, including without limitation the rights 13 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | * copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * The above copyright notice and this permission notice shall be included in 18 | * all copies or substantial portions of the Software. 19 | * 20 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | * SOFTWARE. 27 | */ 28 | // 29 | // This example provides a stdpar implementation for choleskey decomposition code. 
30 | 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include "argparse/argparse.hpp" 37 | #include "commons.hpp" 38 | #include "matrixutil.hpp" 39 | 40 | using namespace std; 41 | 42 | struct solver { 43 | 44 | using view_2d = std::extents; 45 | 46 | template 47 | std::vector> Cholesky_Decomposition(std::vector& vec, int n) { 48 | std::vector> lower(n, std::vector(n, 0)); 49 | 50 | auto matrix_ms = std::mdspan(vec.data(), n, n); 51 | 52 | auto multiplier_lambda = [=](auto a, auto b) { 53 | return a * b; 54 | }; 55 | 56 | // Decomposing a matrix into Lower Triangular 57 | for (int i = 0; i < matrix_ms.extent(0); i++) { 58 | for (int j = 0; j <= i; j++) { 59 | T sum = 0; 60 | 61 | if (j == i) // summation for diagonals 62 | { 63 | sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, 0, 64 | std::plus{}, [=](int val) { return val * val; }); 65 | 66 | lower[j][j] = std::sqrt(matrix_ms(i, j) - sum); 67 | 68 | } else { // Evaluating L(i, j) using L(j, j) 69 | 70 | sum = std::transform_reduce(std::execution::par, lower[j].cbegin(), lower[j].cbegin() + j, 71 | lower[i].cbegin(), 0, std::plus<>(), multiplier_lambda); 72 | 73 | lower[i][j] = (matrix_ms(i, j) - sum) / lower[j][j]; 74 | } 75 | } 76 | } 77 | return lower; 78 | } 79 | }; 80 | 81 | /////////////////////////////////////////////////////////////////////////////// 82 | int benchmark(args_params_t const& args) { 83 | 84 | std::uint64_t nd = args.nd; // Number of matrix dimension. 85 | 86 | std::vector inputMatrix = generate_pascal_matrix(nd); 87 | 88 | // Create the solver object 89 | solver solve; 90 | // Measure execution time. 
91 | Timer timer; 92 | 93 | // start decomposation 94 | auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd); 95 | auto time = timer.stop(); 96 | 97 | // Print the final results 98 | if (args.results) { 99 | // Displaying Lower Triangular and its Transpose 100 | fmt::print("{:>6} {:>30}\n", "Lower Triangular", "Transpose"); 101 | for (int i = 0; i < nd; i++) { 102 | // Lower Triangular 103 | for (int j = 0; j < nd; j++) 104 | fmt::print("{:>6}\t", res_matrix[i][j]); 105 | fmt::print("\t"); 106 | 107 | // Transpose of Lower Triangular 108 | for (int j = 0; j < nd; j++) 109 | fmt::print("{:>6}\t", res_matrix[j][i]); 110 | fmt::print("\n"); 111 | } 112 | } 113 | 114 | if (args.time) { 115 | fmt::print("Duration: {:f} ms\n", time); 116 | } 117 | 118 | return 0; 119 | } 120 | 121 | // Driver Code for testing 122 | int main(int argc, char* argv[]) { 123 | 124 | // parse params 125 | args_params_t args = argparse::parse(argc, argv); 126 | // see if help wanted 127 | if (args.help) { 128 | args.print(); // prints all variables 129 | return 0; 130 | } 131 | 132 | benchmark(args); 133 | 134 | return 0; 135 | } 136 | -------------------------------------------------------------------------------- /apps/1d-stencil/1d-omp.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 Weile Wei 5 | * Copyright (c) 2023 The Regents of the University of California, 6 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 7 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
8 | * 9 | * Permission is hereby granted, free of charge, to any person obtaining a copy 10 | * of this software and associated documentation files (the "Software"), to deal 11 | * in the Software without restriction, including without limitation the rights 12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | * copies of the Software, and to permit persons to whom the Software is 14 | * furnished to do so, subject to the following conditions: 15 | * 16 | * The above copyright notice and this permission notice shall be included in 17 | * all copies or substantial portions of the Software. 18 | * 19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | * SOFTWARE. 
26 | */ 27 | #include 28 | #include "argparse/argparse.hpp" 29 | #include "commons.hpp" 30 | 31 | // parameters 32 | struct args_params_t : public argparse::Args { 33 | bool& results = kwarg("results", "print generated results (default: false)").set_default(false); 34 | std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); 35 | std::uint64_t& size = kwarg("size", "Number of elements").set_default(10); 36 | int& nthreads = kwarg("nthreads", "Number of openmp threads").set_default(1); 37 | bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); 38 | double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); 39 | double& dx = kwarg("dx", "Local x dimension").set_default(1.0); 40 | bool& help = flag("h, help", "print help"); 41 | bool& time = kwarg("t, time", "print time").set_default(true); 42 | }; 43 | 44 | using Real_t = double; 45 | /////////////////////////////////////////////////////////////////////////////// 46 | // Command-line variables 47 | constexpr Real_t k = 0.5; // heat transfer coefficient 48 | constexpr Real_t dt = 1.; // time step 49 | constexpr Real_t dx = 1.; // grid spacing 50 | 51 | /////////////////////////////////////////////////////////////////////////////// 52 | //[stepper_1 53 | struct stepper { 54 | // Our operator 55 | Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k, 56 | const Real_t dt = ::dt, const Real_t dx = ::dx) { 57 | return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); 58 | } 59 | 60 | // do all the work on 'size' data points for 'nt' time steps 61 | [[nodiscard]] std::vector do_work(const std::size_t size, const std::size_t nt, const int nthreads) { 62 | std::vector current(size); 63 | std::vector next(size); 64 | 65 | #pragma omp parallel for num_threads(nthreads) 66 | for (std::size_t i = 0; i < size; ++i) { 67 | current[i] = Real_t(i); 68 | } 69 | 70 | // Actual time step loop 71 | for (std::size_t t = 0; t != nt; ++t) { 
72 | // OpenMP parallel for loop 73 | #pragma omp parallel for num_threads(nthreads) 74 | for (std::size_t i = 0; i < size; ++i) { 75 | std::size_t left = (i == 0) ? size - 1 : i - 1; 76 | std::size_t right = (i == size - 1) ? 0 : i + 1; 77 | next[i] = heat(current[left], current[i], current[right], k, dt, dx); 78 | } 79 | std::swap(current, next); 80 | } 81 | 82 | return current; 83 | } 84 | }; 85 | 86 | /////////////////////////////////////////////////////////////////////////////// 87 | int benchmark(args_params_t const& args) { 88 | std::uint64_t size = args.size; // Number of elements. 89 | std::uint64_t nt = args.nt; // Number of steps. 90 | int nthreads = args.nthreads; 91 | 92 | // Create the stepper object 93 | stepper step; 94 | 95 | // Measure execution time. 96 | Timer timer; 97 | 98 | auto solution = step.do_work(size, nt, nthreads); 99 | auto time = timer.stop(); 100 | 101 | // Print the final solution 102 | if (args.results) { 103 | fmt::println("{::f}", solution); 104 | } 105 | 106 | if (args.time) { 107 | fmt::print("Duration: {:f} ms\n", time); 108 | } 109 | 110 | return 0; 111 | } 112 | 113 | int main(int argc, char* argv[]) { 114 | // parse params 115 | args_params_t args = argparse::parse(argc, argv); 116 | // see if help wanted 117 | if (args.help) { 118 | args.print(); // prints all variables 119 | return 0; 120 | } 121 | 122 | benchmark(args); 123 | 124 | return 0; 125 | } 126 | -------------------------------------------------------------------------------- /apps/1d-stencil/1d-serial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 Weile Wei 5 | * Copyright (c) 2023 The Regents of the University of California, 6 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 7 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
8 | * 9 | * Permission is hereby granted, free of charge, to any person obtaining a copy 10 | * of this software and associated documentation files (the "Software"), to deal 11 | * in the Software without restriction, including without limitation the rights 12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | * copies of the Software, and to permit persons to whom the Software is 14 | * furnished to do so, subject to the following conditions: 15 | * 16 | * The above copyright notice and this permission notice shall be included in 17 | * all copies or substantial portions of the Software. 18 | * 19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | * SOFTWARE. 
26 | */ 27 | #include "argparse/argparse.hpp" 28 | #include "commons.hpp" 29 | 30 | // parameters 31 | struct args_params_t : public argparse::Args { 32 | bool& results = kwarg("results", "print generated results (default: false)").set_default(false); 33 | std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); 34 | std::uint64_t& size = kwarg("size", "Number of elements").set_default(10); 35 | bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); 36 | double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); 37 | double& dx = kwarg("dx", "Local x dimension").set_default(1.0); 38 | bool& help = flag("h, help", "print help"); 39 | bool& time = kwarg("t, time", "print time").set_default(true); 40 | }; 41 | 42 | using Real_t = double; 43 | /////////////////////////////////////////////////////////////////////////////// 44 | // Command-line variables 45 | constexpr Real_t k = 0.5; // heat transfer coefficient 46 | constexpr Real_t dt = 1.; // time step 47 | constexpr Real_t dx = 1.; // grid spacing 48 | 49 | /////////////////////////////////////////////////////////////////////////////// 50 | //[stepper_1 51 | struct stepper { 52 | using view_1d = std::extents; 53 | typedef std::mdspan space; 54 | 55 | void init_value(auto& data, const std::size_t size) { 56 | for (std::size_t i = 0; i != size; ++i) { 57 | data[i] = Real_t(i); 58 | } 59 | } 60 | 61 | // Our operator 62 | Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k, 63 | const Real_t dt = ::dt, const Real_t dx = ::dx) { 64 | return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); 65 | } 66 | 67 | // do all the work on 'size' data points for 'nt' time steps 68 | [[nodiscard]] space do_work(const std::size_t size, const std::size_t nt) { 69 | Real_t* current_ptr = new Real_t[size]; 70 | Real_t* next_ptr = new Real_t[size]; 71 | auto current = space(current_ptr, size); 72 | auto next = space(next_ptr, size); 73 | 
74 | init_value(current, size); 75 | 76 | // Actual time step loop 77 | for (std::size_t t = 0; t != nt; ++t) { 78 | for (std::size_t i = 0; i < size; ++i) { 79 | std::size_t left = (i == 0) ? size - 1 : i - 1; 80 | std::size_t right = (i == size - 1) ? 0 : i + 1; 81 | next[i] = heat(current[left], current[i], current[right], k, dt, dx); 82 | } 83 | std::swap(current, next); 84 | } 85 | 86 | return current; 87 | } 88 | }; 89 | 90 | /////////////////////////////////////////////////////////////////////////////// 91 | int benchmark(args_params_t const& args) { 92 | std::uint64_t size = args.size; // Number of elements. 93 | std::uint64_t nt = args.nt; // Number of steps. 94 | 95 | // Create the stepper object 96 | stepper step; 97 | 98 | // Measure execution time. 99 | Timer timer; 100 | 101 | auto solution = step.do_work(size, nt); 102 | auto time = timer.stop(); 103 | 104 | // Print the final solution 105 | if (args.results) { 106 | fmt::println("{::f}", solution); 107 | } 108 | 109 | if (args.time) { 110 | fmt::print("Duration: {:f} ms\n", time); 111 | } 112 | 113 | return 0; 114 | } 115 | 116 | int main(int argc, char* argv[]) { 117 | // parse params 118 | args_params_t args = argparse::parse(argc, argv); 119 | // see if help wanted 120 | if (args.help) { 121 | args.print(); // prints all variables 122 | return 0; 123 | } 124 | 125 | benchmark(args); 126 | 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /apps/heat-equation/heat-equation-stdpar.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * Simplified 2d heat equation example derived from amrex 29 | */ 30 | 31 | #include "heat-equation.hpp" 32 | 33 | // 34 | // simulation 35 | // 36 | int main(int argc, char* argv[]) { 37 | // parse params 38 | const heat_params_t args = argparse::parse(argc, argv); 39 | 40 | // see if help wanted 41 | if (args.help) { 42 | args.print(); // prints all variables 43 | return 0; 44 | } 45 | 46 | // simulation variables 47 | int ncells = args.ncells; 48 | int nsteps = args.nsteps; 49 | Real_t dt = args.dt; 50 | Real_t alpha = args.alpha; 51 | // future if needed to split in multiple grids 52 | // int max_grid_size = args.max_grid_size; 53 | 54 | // initialize dx, dy, dz 55 | auto* dx = new Real_t[dims]; 56 | for (int i = 0; i < dims; ++i) 57 | dx[i] = 1.0 / (ncells - 1); 58 | 59 | // simulation setup (2D) 60 | Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; 61 | Real_t* grid_new = new Real_t[(ncells) * (ncells)]; 62 | 63 | auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); 64 | auto phi_new = std::mdspan(grid_new, ncells, ncells); 65 | 66 | // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] 67 | 68 | Timer timer; 69 | 70 | std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { 71 | int i = 1 + (ind / ncells); 72 | int j = 1 + (ind % ncells); 73 | 74 | Real_t x = pos(i, ghost_cells, dx[0]); 75 | Real_t y = pos(j, ghost_cells, dx[1]); 76 | 77 | // L2 distance (r2 from origin) 78 | Real_t r2 = (x * x + y * y) / (0.01); 79 | 80 | // phi(x,y) = 1 + exp(-r^2) 81 | phi_old(i, j) = 1 + exp(-r2); 82 | }); 83 | 84 | if (args.print_grid) 85 | // print the initial grid 86 | printGrid(grid_old, ncells + nghosts); 87 | 88 | // init simulation time 89 | Real_t time = 0.0; 90 | 91 | // evolve the system 92 | for (auto step = 0; step < nsteps; step++) { 93 | // fill boundary cells in old_phi 94 | fill2Dboundaries(grid_old, ncells + nghosts, 
ghost_cells); 95 | 96 | // update phi_new with stencil 97 | std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { 98 | int i = 1 + (ind / ncells); 99 | int j = 1 + (ind % ncells); 100 | 101 | // Jacobi iteration 102 | phi_new(i - 1, j - 1) = 103 | phi_old(i, j) + alpha * dt * 104 | ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + 105 | (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); 106 | }); 107 | 108 | // update the simulation time 109 | time += dt; 110 | 111 | // parallel copy phi_new to phi_old 112 | std::for_each_n(std::execution::par_unseq, counting_iterator(0), ncells * ncells, [=](int ind) { 113 | int i = 1 + (ind / ncells); 114 | int j = 1 + (ind % ncells); 115 | 116 | // copy phi_new to phi_old 117 | phi_old(i, j) = phi_new(i - 1, j - 1); 118 | }); 119 | } 120 | 121 | auto elapsed = timer.stop(); 122 | 123 | // print timing 124 | if (args.print_time) { 125 | fmt::print("Duration: {:f} ms\n", elapsed); 126 | } 127 | 128 | if (args.print_grid) 129 | // print the final grid 130 | printGrid(grid_new, ncells); 131 | 132 | // delete all memory 133 | delete[] grid_old; 134 | delete[] grid_new; 135 | 136 | grid_old = nullptr; 137 | grid_new = nullptr; 138 | 139 | return 0; 140 | } 141 | -------------------------------------------------------------------------------- /apps/1d-stencil/1d-stdpar.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 Weile Wei 5 | * Copyright (c) 2023 The Regents of the University of California, 6 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 7 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
8 | * 9 | * Permission is hereby granted, free of charge, to any person obtaining a copy 10 | * of this software and associated documentation files (the "Software"), to deal 11 | * in the Software without restriction, including without limitation the rights 12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | * copies of the Software, and to permit persons to whom the Software is 14 | * furnished to do so, subject to the following conditions: 15 | * 16 | * The above copyright notice and this permission notice shall be included in 17 | * all copies or substantial portions of the Software. 18 | * 19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | * SOFTWARE. 26 | */ 27 | // 28 | // This example provides a stdpar implementation for the 1D stencil code. 
29 | #include "argparse/argparse.hpp" 30 | #include "commons.hpp" 31 | 32 | // parameters 33 | struct args_params_t : public argparse::Args { 34 | bool& results = kwarg("results", "print generated results (default: false)").set_default(false); 35 | std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); 36 | std::uint64_t& size = kwarg("size", "Number of elements").set_default(10); 37 | bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); 38 | double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); 39 | double& dx = kwarg("dx", "Local x dimension").set_default(1.0); 40 | bool& help = flag("h, help", "print help"); 41 | bool& time = kwarg("t, time", "print time").set_default(true); 42 | }; 43 | 44 | using Real_t = double; 45 | /////////////////////////////////////////////////////////////////////////////// 46 | // Command-line variables 47 | constexpr Real_t k = 0.5; // heat transfer coefficient 48 | constexpr Real_t dt = 1.; // time step 49 | constexpr Real_t dx = 1.; // grid spacing 50 | 51 | /////////////////////////////////////////////////////////////////////////////// 52 | //[stepper_1 53 | struct stepper { 54 | using view_1d = std::extents; 55 | typedef std::mdspan space; 56 | 57 | // Our operator 58 | [[nodiscard]] Real_t heat(const Real_t left, const Real_t middle, const Real_t right, const Real_t k = ::k, 59 | const Real_t dt = ::dt, const Real_t dx = ::dx) { 60 | return middle + (k * dt / (dx * dx)) * (left - 2 * middle + right); 61 | } 62 | 63 | // do all the work on 'size' data points for 'nt' time steps 64 | [[nodiscard]] space do_work(const std::size_t size, const std::size_t nt) { 65 | Real_t* current_ptr = new Real_t[size]; 66 | Real_t* next_ptr = new Real_t[size]; 67 | 68 | auto current = space(current_ptr, size); 69 | auto next = space(next_ptr, size); 70 | 71 | // parallel init 72 | std::for_each_n(std::execution::par, counting_iterator(0), size, 73 | [=](std::size_t i) { current[i] = (Real_t)i; 
}); 74 | 75 | // Actual time step loop 76 | for (std::size_t t = 0; t != nt; ++t) { 77 | std::for_each_n(std::execution::par, counting_iterator(0), size, [=, k = k, dt = dt, dx = dx](int32_t i) { 78 | std::size_t left = (i == 0) ? size - 1 : i - 1; 79 | std::size_t right = (i == size - 1) ? 0 : i + 1; 80 | next[i] = heat(current[left], current[i], current[right], k, dt, dx); 81 | }); 82 | std::swap(current, next); 83 | } 84 | 85 | return current; 86 | } 87 | }; 88 | 89 | /////////////////////////////////////////////////////////////////////////////// 90 | int benchmark(args_params_t const& args) { 91 | std::uint64_t size = args.size; // Number of elements. 92 | std::uint64_t nt = args.nt; // Number of steps. 93 | 94 | // Create the stepper object 95 | stepper step; 96 | 97 | // Measure execution time. 98 | Timer timer; 99 | 100 | // Execute nt time steps on nx grid points. 101 | auto solution = step.do_work(size, nt); 102 | auto time = timer.stop(); 103 | 104 | // Print the final solution 105 | if (args.results) { 106 | fmt::println("{::f}", solution); 107 | } 108 | 109 | if (args.time) { 110 | fmt::print("Duration: {:f} ms\n", time); 111 | } 112 | 113 | return 0; 114 | } 115 | 116 | int main(int argc, char* argv[]) { 117 | // parse params 118 | args_params_t args = argparse::parse(argc, argv); 119 | // see if help wanted 120 | if (args.help) { 121 | args.print(); // prints all variables 122 | return 0; 123 | } 124 | 125 | benchmark(args); 126 | 127 | return 0; 128 | } 129 | -------------------------------------------------------------------------------- /apps/fft/fft-stdpar.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * commons for the fft codes 29 | */ 30 | 31 | #include "fft.hpp" 32 | 33 | // 34 | // fft algorithm 35 | // 36 | [[nodiscard]] std::vector fft(const data_t* x, const int N, bool debug = false) { 37 | std::vector x_rev(N); 38 | 39 | // create mdspans 40 | auto x_r = std::mdspan(x_rev.data(), N); 41 | 42 | // compute shift factor 43 | int shift = 32 - ilog2(N); 44 | 45 | // twiddle bits for fft 46 | std::for_each_n(std::execution::par_unseq, counting_iterator(0), N, [=](auto k) { 47 | auto new_idx = reverse_bits32(k) >> shift; 48 | x_r(k) = x[new_idx]; 49 | }); 50 | 51 | // niterations 52 | int niters = ilog2(N); 53 | 54 | // local merge partition size 55 | int lN = 2; 56 | 57 | // set cout precision 58 | fmt::print("FFT progress: "); 59 | 60 | // iterate until niters - lN*=2 after each iteration 61 | for (int it = 0; it < niters; it++, lN *= 2) { 62 | // print progress 63 | fmt::print("{:.1f}%..", (100.0 * it) / niters); 64 | 65 | // debugging timer 66 | static Timer dtimer; 67 | 68 | // number of partitions 69 | int nparts = N / lN; 70 | int tpp = lN / 2; 71 | 72 | // display info only if debugging 73 | if (debug) { 74 | dtimer.start(); 75 | fmt::print("lN = {}, npartitions = {}, partition size = {}\n", lN, nparts, tpp); 76 | } 77 | 78 | // parallel compute lN-pt FFT 79 | std::for_each_n(std::execution::par_unseq, counting_iterator(0), N / 2, [=](auto k) { 80 | // compute indices 81 | int e = (k / tpp) * lN + (k % tpp); 82 | auto o = e + tpp; 83 | auto i = (k % tpp); 84 | 85 | // compute 2-pt DFT 86 | auto tmp = x_r(e) + x_r(o) * WNk(N, i * nparts); 87 | x_r(o) = x_r(e) - x_r(o) * WNk(N, i * nparts); 88 | x_r(e) = tmp; 89 | }); 90 | 91 | // print only if debugging 92 | if (debug) 93 | fmt::print("This iter time: {} ms\n", dtimer.stop()); 94 | } 95 | 96 | // print final progress mark 97 | fmt::print("100%\n"); 98 | 99 | // return x_rev = fft(x_r) 100 | return x_rev; 101 | } 102 | 103 | // 104 | // simulation 105 | // 106 | int main(int 
argc, char* argv[]) { 107 | // parse params 108 | const fft_params_t args = argparse::parse(argc, argv); 109 | 110 | // see if help wanted 111 | if (args.help) { 112 | args.print(); // prints all variables 113 | return 0; 114 | } 115 | 116 | // simulation variables 117 | int N = args.N; 118 | sig_type_t sig_type = sig_type_t::box; 119 | //int freq = args.freq; 120 | bool print_sig = args.print_sig; 121 | bool print_time = args.print_time; 122 | bool validate = args.validate; 123 | 124 | // x[n] signal 125 | sig_t x_n(N, sig_type); 126 | 127 | if (!isPowOf2(N)) { 128 | N = ceilPowOf2(N); 129 | fmt::print("INFO: N is not a power of 2. Padding zeros => N = {}\n", N); 130 | 131 | x_n.resize(N); 132 | } 133 | 134 | if (print_sig) { 135 | fmt::print("\nx[n] = "); 136 | x_n.printSignal(); 137 | } 138 | 139 | // start the timer here 140 | Timer timer; 141 | 142 | // y[n] = fft(x[n]) 143 | sig_t y_n(std::move(fft(x_n.data(), N, args.debug))); 144 | 145 | // stop timer 146 | auto elapsed = timer.stop(); 147 | 148 | // print the fft(x) 149 | if (print_sig) { 150 | fmt::print("X(k) = "); 151 | y_n.printSignal(); 152 | } 153 | 154 | // print the computation time 155 | if (print_time) { 156 | fmt::print("Elapsed Time: {} ms\n", elapsed); 157 | } 158 | 159 | // validate the recursively computed fft 160 | if (validate) { 161 | if (x_n.isFFT(y_n, exec::static_thread_pool(std::thread::hardware_concurrency()).get_scheduler())) { 162 | fmt::print("SUCCESS: y[n] == fft(x[n])\n"); 163 | } else { 164 | fmt::print("FAILED: y[n] != fft(x[n])\n"); 165 | } 166 | } 167 | 168 | return 0; 169 | } 170 | -------------------------------------------------------------------------------- /include/commons.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the 
U.S. Dept. of Energy). All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | #pragma once 28 | 29 | #include 30 | #include 31 | #include 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | #include 55 | #include 56 | 57 | #include 58 | 59 | #include "counting_iterator.hpp" 60 | 61 | template 62 | requires std::floating_point struct fmt::formatter> { 63 | template 64 | constexpr auto parse(ParseContext& ctx) { 65 | return ctx.begin(); 66 | } 67 | 68 | template 69 | auto format(const std::complex& c, FormatContext& ctx) const { 70 | return format_to(ctx.out(), "({:.2f} + {:.2f}i)", c.real(), c.imag()); 71 | } 72 | }; 73 | 74 | // get mdpsan 2d indices from 1d index 75 | #define dim2(x, ms) \ 76 | int ii = x / ms.extent(1); \ 77 | int ij = x % ms.extent(1); 78 | // get mdspan 3d indices from 1d index 79 | #define dim3(x, ms) \ 80 | int ii = x / (ms3.extent(1) * ms.extent(2)); \ 81 | int ij = (x / ms.extent(2)) % ms.extent(1); \ 82 | int ik = x % ms.extent(2) 83 | 84 | class Timer { 85 | public: 86 | Timer() { start(); } 87 | 88 | ~Timer() { stop(); } 89 | 90 | void start() { start_time_point = std::chrono::high_resolution_clock::now(); } 91 | 92 | double stop() { 93 | end_time_point = std::chrono::high_resolution_clock::now(); 94 | return duration(); 95 | } 96 | 97 | double duration() { 98 | auto start = 99 | std::chrono::time_point_cast(start_time_point).time_since_epoch().count(); 100 | auto end = std::chrono::time_point_cast(end_time_point).time_since_epoch().count(); 101 | auto duration = end - start; 102 | double ms = duration * 1e-6; 103 | return ms; 104 | } 105 | 106 | private: 107 | std::chrono::time_point start_time_point; 108 | std::chrono::time_point end_time_point; 109 | }; 110 | 111 | enum class sch_t { CPU, GPU, MULTIGPU }; 112 | 113 | [[nodiscard]] sch_t 
get_sch_enum(std::string_view str) { 114 | static const std::map schmap = { 115 | {"cpu", sch_t::CPU}, 116 | #if defined(USE_GPU) 117 | {"gpu", sch_t::GPU}, 118 | {"multigpu", sch_t::MULTIGPU} 119 | #endif // USE_GPU 120 | }; 121 | 122 | if (schmap.contains(str)) { 123 | return schmap.at(str); 124 | } 125 | 126 | throw std::invalid_argument("FATAL: " + std::string(str) + 127 | " is not a stdexec scheduler.\n" 128 | "Available schedulers: cpu" 129 | #if defined(USE_GPU) 130 | ", gpu, multigpu" 131 | #endif 132 | "\n" 133 | "Exiting...\n"); 134 | } 135 | 136 | inline bool isPowOf2(long long int x) { 137 | return !(x == 0) && !(x & (x - 1)); 138 | } 139 | 140 | inline int ceilPowOf2(unsigned int v) { 141 | return static_cast(std::bit_ceil(v)); 142 | } 143 | 144 | inline int ilog2(uint32_t x) { 145 | return static_cast(log2(x)); 146 | } 147 | 148 | template 149 | bool complex_compare(T a, T b, double error = 0.0101) { 150 | auto r = (fabs(a.real() - b.real()) < error) ? true : false; 151 | return r && (fabs(a.imag() - b.imag()) < error) ? true : false; 152 | } 153 | 154 | uint32_t reverse_bits32(uint32_t x) { 155 | x = ((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1); 156 | x = ((x & 0xcccccccc) >> 2) | ((x & 0x33333333) << 2); 157 | x = ((x & 0xf0f0f0f0) >> 4) | ((x & 0x0f0f0f0f) << 4); 158 | x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8); 159 | return (x >> 16) | (x << 16); 160 | } 161 | 162 | // alias for status variables 163 | using status_t = int; 164 | -------------------------------------------------------------------------------- /apps/heat-equation/heat-equation-serial.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * Simplified 2d heat equation example derived from amrex 29 | */ 30 | 31 | #include "heat-equation.hpp" 32 | 33 | // fill boundary cells 34 | template 35 | void fill2Dboundaries_mdspan(T* grid, int len, int ghost_cells = 1) { 36 | auto row_view = std::mdspan(grid, len, len); 37 | 38 | for (auto j = ghost_cells; j < row_view.extent(1) - ghost_cells; ++j) { 39 | row_view(0, j) = row_view(ghost_cells, j); 40 | row_view(row_view.extent(0) - ghost_cells, j) = row_view(row_view.extent(0) - ghost_cells - 1, j); 41 | } 42 | 43 | auto col_view = std::mdspan(grid, len, len); 44 | 45 | for (auto i = 1; i < col_view.extent(1) - 1; ++i) { 46 | col_view(0, i) = col_view(ghost_cells, i); 47 | col_view(col_view.extent(0) - 1, i) = col_view(col_view.extent(0) - ghost_cells - 1, i); 48 | } 49 | } 50 | 51 | // 52 | // simulation 53 | // 54 | int main(int argc, char* argv[]) { 55 | // parse params 56 | const heat_params_t args = argparse::parse(argc, argv); 57 | 58 | // see if help wanted 59 | if (args.help) { 60 | args.print(); // prints all variables 61 | return 0; 62 | } 63 | 64 | // simulation variables 65 | int ncells = args.ncells; 66 | int nsteps = args.nsteps; 67 | Real_t dt = args.dt; 68 | Real_t alpha = args.alpha; 69 | // future if needed to split in multiple grids 70 | // int max_grid_size = args.max_grid_size; 71 | 72 | // initialize dx, dy, dz 73 | auto* dx = new Real_t[dims]; 74 | for (int i = 0; i < dims; ++i) 75 | dx[i] = 1.0 / (ncells - 1); 76 | 77 | // simulation setup (2D) 78 | Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; 79 | Real_t* grid_new = new Real_t[(ncells) * (ncells)]; 80 | 81 | auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); 82 | auto phi_new = std::mdspan(grid_new, ncells, ncells); 83 | 84 | Timer timer; 85 | 86 | // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] 87 | for (int i = 1; i < phi_old.extent(0) - 1; ++i) { 88 | for (int j = 1; j < 
phi_old.extent(1) - 1; ++j) { 89 | Real_t x = pos(i, ghost_cells, dx[0]); 90 | Real_t y = pos(j, ghost_cells, dx[1]); 91 | 92 | // L2 distance (r2 from origin) 93 | Real_t r2 = (x * x + y * y) / (0.01); 94 | 95 | // phi(x,y) = 1 + exp(-r^2) 96 | phi_old(i, j) = 1 + exp(-r2); 97 | } 98 | } 99 | 100 | if (args.print_grid) 101 | // print the initial grid 102 | printGrid(grid_old, ncells + nghosts); 103 | 104 | // init simulation time 105 | Real_t time = 0.0; 106 | 107 | // evolve the system 108 | for (auto step = 0; step < nsteps; step++) { 109 | // fill boundary cells in old_phi 110 | fill2Dboundaries_mdspan(grid_old, ncells + nghosts, ghost_cells); 111 | 112 | // update phi_new 113 | for (auto i = 1; i < phi_old.extent(0) - 1; i++) { 114 | for (auto j = 1; j < phi_old.extent(1) - 1; j++) { 115 | // Jacobi iteration 116 | phi_new(i - 1, j - 1) = 117 | phi_old(i, j) + 118 | alpha * dt * 119 | ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + 120 | (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); 121 | } 122 | } 123 | 124 | // update the simulation time 125 | time += dt; 126 | 127 | // parallel copy phi_new to phi_old 128 | for (auto i = 1; i < phi_old.extent(0) - 1; i++) 129 | for (auto j = 1; j < phi_old.extent(1) - 1; j++) 130 | // copy phi_new to phi_old 131 | phi_old(i, j) = phi_new(i - 1, j - 1); 132 | } 133 | 134 | auto elapsed = timer.stop(); 135 | 136 | // print timing 137 | if (args.print_time) { 138 | fmt::print("Duration: {:f} ms\n", elapsed); 139 | } 140 | 141 | if (args.print_grid) 142 | // print the final grid 143 | printGrid(grid_new, ncells); 144 | 145 | // delete all memory 146 | delete[] grid_old; 147 | delete[] grid_new; 148 | 149 | grid_old = nullptr; 150 | grid_new = nullptr; 151 | 152 | return 0; 153 | } 154 | -------------------------------------------------------------------------------- /apps/heat-equation/heat-equation-omp.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * Simplified 2d heat equation example derived from amrex 29 | */ 30 | 31 | #define HEQ_OMP 32 | #include "heat-equation.hpp" 33 | 34 | // fill boundary cells OpenMP 35 | template 36 | void fill2Dboundaries_omp(T* grid, int len, int nthreads = 1, int ghost_cells = 1) { 37 | #pragma omp parallel for num_threads(nthreads) 38 | for (int i = ghost_cells; i < len - ghost_cells; i++) { 39 | grid[i] = grid[i + (ghost_cells * len)]; 40 | grid[i + (len * (len - ghost_cells))] = grid[i + (len * (len - ghost_cells - 1))]; 41 | 42 | grid[i * len] = grid[(ghost_cells * len) + i]; 43 | grid[(len - ghost_cells) + (len * i)] = grid[(len - ghost_cells - 1) + (len * i)]; 44 | } 45 | } 46 | 47 | // 48 | // simulation 49 | // 50 | int main(int argc, char* argv[]) { 51 | // parse params 52 | const heat_params_t args = argparse::parse(argc, argv); 53 | 54 | // see if help wanted 55 | if (args.help) { 56 | args.print(); // prints all variables 57 | return 0; 58 | } 59 | 60 | // simulation variables 61 | int ncells = args.ncells; 62 | int nsteps = args.nsteps; 63 | int nthreads = args.nthreads; 64 | Real_t dt = args.dt; 65 | Real_t alpha = args.alpha; 66 | // future if needed to split in multiple grids 67 | // int max_grid_size = args.max_grid_size; 68 | 69 | // initialize dx, dy, dz 70 | auto* dx = new Real_t[dims]; 71 | for (int i = 0; i < dims; ++i) 72 | dx[i] = 1.0 / (ncells - 1); 73 | 74 | // simulation setup (2D) 75 | Real_t* grid_old = new Real_t[(ncells + nghosts) * (ncells + nghosts)]; 76 | Real_t* grid_new = new Real_t[(ncells) * (ncells)]; 77 | 78 | auto phi_old = std::mdspan(grid_old, ncells + nghosts, ncells + nghosts); 79 | auto phi_new = std::mdspan(grid_new, ncells, ncells); 80 | 81 | int gsize = ncells * ncells; 82 | 83 | Timer timer; 84 | 85 | // initialize phi_old domain: {[-0.5, -0.5], [0.5, 0.5]} -> origin at [0,0] 86 | #pragma omp parallel for num_threads(nthreads) 87 | for (int pos = 0; pos < gsize; pos++) { 88 | int i = 1 + (pos / ncells); 
89 | int j = 1 + (pos % ncells); 90 | 91 | Real_t x = pos(i, ghost_cells, dx[0]); 92 | Real_t y = pos(j, ghost_cells, dx[1]); 93 | 94 | // L2 distance (r2 from origin) 95 | Real_t r2 = (x * x + y * y) / (0.01); 96 | 97 | // phi(x,y) = 1 + exp(-r^2) 98 | phi_old(i, j) = 1 + exp(-r2); 99 | } 100 | 101 | if (args.print_grid) 102 | // print the initial grid 103 | printGrid(grid_old, ncells + nghosts); 104 | 105 | // init simulation time 106 | Real_t time = 0.0; 107 | 108 | // evolve the system 109 | for (auto step = 0; step < nsteps; step++) { 110 | // fill boundary cells in old_phi 111 | fill2Dboundaries_omp(grid_old, ncells + nghosts, ghost_cells, nthreads); 112 | 113 | #pragma omp parallel for num_threads(nthreads) 114 | for (int pos = 0; pos < gsize; pos++) { 115 | int i = 1 + (pos / ncells); 116 | int j = 1 + (pos % ncells); 117 | 118 | // Jacobi iteration 119 | phi_new(i - 1, j - 1) = 120 | phi_old(i, j) + alpha * dt * 121 | ((phi_old(i + 1, j) - 2.0 * phi_old(i, j) + phi_old(i - 1, j)) / (dx[0] * dx[0]) + 122 | (phi_old(i, j + 1) - 2.0 * phi_old(i, j) + phi_old(i, j - 1)) / (dx[1] * dx[1])); 123 | } 124 | 125 | // update the simulation time 126 | time += dt; 127 | 128 | // parallel copy phi_new to phi_old 129 | #pragma omp parallel for num_threads(nthreads) 130 | for (int pos = 0; pos < gsize; pos++) { 131 | int i = 1 + (pos / ncells); 132 | int j = 1 + (pos % ncells); 133 | 134 | // copy phi_new to phi_old 135 | phi_old(i, j) = phi_new(i - 1, j - 1); 136 | } 137 | } 138 | 139 | auto elapsed = timer.stop(); 140 | 141 | // print timing 142 | if (args.print_time) { 143 | fmt::print("Duration: {:f} ms\n", elapsed); 144 | } 145 | 146 | if (args.print_grid) 147 | // print the final grid 148 | printGrid(grid_new, ncells); 149 | 150 | // delete all memory 151 | delete[] grid_old; 152 | delete[] grid_new; 153 | 154 | grid_old = nullptr; 155 | grid_new = nullptr; 156 | 157 | return 0; 158 | } 159 | 
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# ##############################################################################
# CMake settings
# ##############################################################################

# cmake min required. This has to be the first command: it establishes the
# policy defaults every later command is evaluated under.
cmake_minimum_required(VERSION 3.22.1 FATAL_ERROR)
project(hpcpp LANGUAGES CXX CUDA)

# ##############################################################################
# Add colors
# ##############################################################################

# ANSI escape sequences used to highlight status messages. On WIN32 the
# variables stay undefined and expand to nothing.
if(NOT WIN32)
  string(ASCII 27 Esc)
  set(ColourReset "${Esc}[m")
  set(ColourBold "${Esc}[1m")
  set(Red "${Esc}[31m")
  set(Green "${Esc}[32m")
  set(Yellow "${Esc}[33m")
  set(Blue "${Esc}[34m")
  set(Magenta "${Esc}[35m")
  set(Cyan "${Esc}[36m")
  set(White "${Esc}[37m")
  set(BoldRed "${Esc}[1;31m")
  set(BoldGreen "${Esc}[1;32m")
  set(BoldYellow "${Esc}[1;33m")
  set(BoldBlue "${Esc}[1;34m")
  set(BoldMagenta "${Esc}[1;35m")
  set(BoldCyan "${Esc}[1;36m")
  set(BoldWhite "${Esc}[1;37m")
endif()

# in source build warning
if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}")
  message(
    STATUS "Warning! Building from the source directory is not recommended")
  message(
    STATUS "If unintended, please remove 'CMakeCache.txt' and 'CMakeFiles'")
  message(STATUS "and build from a separate directory")
  message(WARNING "In-source build")
endif()

# set cmake module path
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/external/")

# Build type used when the user did not specify one
set(HPCPP_BUILD_TYPE "RelWithDebInfo")

# Default the build type only for single-config generators and only when the
# user left it empty, so FORCE never overrides an explicit choice.
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(
    STATUS
      "${BoldCyan}Setting build type to '${HPCPP_BUILD_TYPE}' as none was specified.${ColourReset}"
  )
  set(CMAKE_BUILD_TYPE
      "${HPCPP_BUILD_TYPE}"
      CACHE STRING "Choose the type of build." FORCE)
  # Set the possible values of build type for cmake-gui
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
                                               "MinSizeRel" "RelWithDebInfo")
endif()

# ##############################################################################
# GCC version check
# ##############################################################################
set(GCC_EXPECTED_VERSION 11.2)

# NOTE(review): the comparison uses the version of whichever C++ compiler is
# active; it is not restricted to CMAKE_CXX_COMPILER_ID "GNU".
if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS "${GCC_EXPECTED_VERSION}")
  message(
    FATAL_ERROR
      "GCC: hpcpp requires GCC v${GCC_EXPECTED_VERSION} or higher to build but found v${CMAKE_CXX_COMPILER_VERSION}"
  )
endif()

# ##############################################################################
# CXX standard
# ##############################################################################
set(CXX_STANDARD_REQUIRED ON)

# required minimum CXX standard
#
# NOTE(review): CMAKE_CXX_STANDARD_REQUIRED is documented as a boolean. Here it
# carries the standard *number*; the value is kept because it is visible to
# sub-directories - consider a dedicated HPCPP_* variable.
set(CMAKE_CXX_STANDARD_REQUIRED 23)
set(CMAKE_GNU_EXTENSIONS ON)

# CXX_STANDARD may be supplied by the user; anything unset or older than the
# minimum is raised to the minimum and passed to the compiler as -std=c++NN.
if(NOT CXX_STANDARD OR (CXX_STANDARD LESS ${CMAKE_CXX_STANDARD_REQUIRED}))
  set(CXX_STANDARD ${CMAKE_CXX_STANDARD_REQUIRED})
  string(APPEND CMAKE_CXX_FLAGS " -std=c++${CXX_STANDARD}")
  message(STATUS "Setting CXX_STANDARD to ${CMAKE_CXX_STANDARD_REQUIRED}")
endif()

# ##############################################################################
# Setup STDEXEC
# ##############################################################################

# this is a hack should be automatically detected from the CMAKE_PREFIX_PATH
# instead of manual
set(CPM_DOWNLOAD_VERSION 0.35.6)

if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.24.0")
  cmake_policy(SET CMP0135 NEW)
endif()

# Where CPM.cmake is stored: CPM_SOURCE_CACHE (CMake variable first, then the
# environment variable), otherwise inside the build tree.
if(CPM_SOURCE_CACHE)
  set(CPM_DOWNLOAD_LOCATION
      "${CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
elseif(DEFINED ENV{CPM_SOURCE_CACHE})
  set(CPM_DOWNLOAD_LOCATION
      "$ENV{CPM_SOURCE_CACHE}/cpm/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
else()
  set(CPM_DOWNLOAD_LOCATION
      "${CMAKE_BINARY_DIR}/cmake/CPM_${CPM_DOWNLOAD_VERSION}.cmake")
endif()

if(NOT EXISTS "${CPM_DOWNLOAD_LOCATION}")
  message(STATUS "Downloading CPM.cmake to ${CPM_DOWNLOAD_LOCATION}")
  file(
    DOWNLOAD
    https://github.com/TheLartians/CPM.cmake/releases/download/v${CPM_DOWNLOAD_VERSION}/CPM.cmake
    "${CPM_DOWNLOAD_LOCATION}"
    STATUS cpm_download_status)

  # STATUS is a two-element list: numeric code (0 == success) and a message.
  list(GET cpm_download_status 0 cpm_download_code)
  if(NOT cpm_download_code EQUAL 0)
    list(GET cpm_download_status 1 cpm_download_message)
    # Remove whatever was written so the next configure retries the download
    # instead of including a broken file.
    file(REMOVE "${CPM_DOWNLOAD_LOCATION}")
    message(
      FATAL_ERROR
        "Failed to download CPM.cmake v${CPM_DOWNLOAD_VERSION}: ${cpm_download_message}"
    )
  endif()
endif()

include("${CPM_DOWNLOAD_LOCATION}")

# NOTE(review): every package below tracks a moving branch (main / master /
# stable), so two configures may fetch different revisions.
cpmaddpackage(
  NAME
  stdexec
  GITHUB_REPOSITORY
  NVIDIA/stdexec
  GIT_TAG
  main
  OPTIONS
  "STDEXEC_ENABLE_CUDA ON"
  "STDEXEC_BUILD_EXAMPLES OFF"
  "STDEXEC_BUILD_TESTS OFF"
  "STDEXEC_ENABLE_IO_URING_TESTS OFF"
  "BUILD_TESTING OFF")

cpmaddpackage(NAME mdspan GITHUB_REPOSITORY kokkos/mdspan GIT_TAG stable)

cpmaddpackage(NAME fmt GITHUB_REPOSITORY fmtlib/fmt GIT_TAG master)

cpmaddpackage(NAME mdspan_formatter GITHUB_REPOSITORY weilewei/mdspan_formatter
              GIT_TAG main)

cpmaddpackage(NAME argparse GITHUB_REPOSITORY mhaseeb123/argparse GIT_TAG
              master)

# Interface target bundling the header dependencies shared by the apps
add_library(hpcpp-core INTERFACE)

# Link external libraries
target_link_libraries(hpcpp-core INTERFACE mdspan fmt mdspan_formatter argparse)

# default stdpar and openmp targets
set(STDPAR_TYPE "gpu")
set(OMP_TYPE "multicore")

# set the stdpar target
if(NOT STDPAR)
  message(
    STATUS
      "${BoldCyan}Setting -stdpar=${STDPAR_TYPE} as none was specified.${ColourReset}"
  )
  set(STDPAR
      "${STDPAR_TYPE}"
      CACHE STRING "Choose the stdpar accelerator." FORCE)
  # Set the possible values of STDPAR for cmake-gui
  set_property(CACHE STDPAR PROPERTY STRINGS "gpu" "multicore")
endif()

# set the omp offload type
if(NOT OMP)
  message(
    STATUS
      "${BoldCyan}Setting -mp=${OMP_TYPE} as none was specified.${ColourReset}")
  set(OMP
      "${OMP_TYPE}"
      CACHE STRING "Choose the OpenMP accelerator." FORCE)
  # Set the possible values of OMP for cmake-gui
  set_property(CACHE OMP PROPERTY STRINGS "multicore" "gpu")
endif()

# need to add appropriate flags for stdexec
string(APPEND CMAKE_CXX_FLAGS " -stdpar=${STDPAR} -mp=${OMP}")

# define USE_GPU for the sources only when -stdpar=gpu
if("${STDPAR}" STREQUAL "gpu")
  string(APPEND CMAKE_CXX_FLAGS " -DUSE_GPU")
else()
  string(APPEND CMAKE_CXX_FLAGS " -UUSE_GPU")
endif()

# ##############################################################################
# Add sub-directories
# ##############################################################################

# ----------------------------------------------------------------------------------------#
# apps
# ----------------------------------------------------------------------------------------#

message(STATUS "Adding hpcpp apps...")
add_subdirectory(apps)
-------------------------------------------------------------------------------- /apps/prefixSum/prefixSum-stdexec.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * commons for the prefixSum codes 29 | */ 30 | 31 | #define PSUM_STDEXEC 32 | #include "prefixSum.hpp" 33 | #include "repeat_n/repeat_n.cuh" 34 | 35 | // 36 | // stdexec prefixSum function 37 | // 38 | template 39 | [[nodiscard]] T* prefixSum(scheduler auto&& sch, const T* in, const int N) { 40 | // allocate a N+1 size array as there will be a trailing zero 41 | T* y = new T[N + 1]; 42 | 43 | // number of iterations 44 | int niters = ilog2(N); 45 | 46 | // need to be dynamic memory to be able to use it in gpu ctx. 47 | int* d_ptr = new int(0); 48 | 49 | // memcpy to output vector to start computation. 50 | ex::sync_wait(ex::schedule(sch) | ex::bulk(N, [=](int k) { y[k] = in[k]; })); 51 | 52 | // GE Blelloch (1990) algorithm from pseudocode at: 53 | // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda 54 | 55 | // upsweep 56 | for (int d = 0; d < niters; d++) { 57 | int bsize = N / (1 << d + 1); 58 | 59 | ex::sender auto uSweep = schedule(sch) | ex::bulk(bsize, [=](int k) { 60 | // stride1 = 2^(d+1) 61 | int st1 = 1 << d + 1; 62 | // stride2 = 2^d 63 | int st2 = 1 << d; 64 | // only the threads at indices (k+1) * 2^(d+1) -1 will compute 65 | int myIdx = (k + 1) * st1 - 1; 66 | 67 | // update y[myIdx] 68 | y[myIdx] += y[myIdx - st2]; 69 | }); 70 | // wait for upsweep 71 | ex::sync_wait(uSweep); 72 | } 73 | 74 | // write sum to y[N] and reset vars 75 | ex::sync_wait(schedule(sch) | ex::then([=]() { 76 | y[N] = y[N - 1]; 77 | y[N - 1] = 0; 78 | })); 79 | 80 | // downsweep 81 | for (int d = niters - 1; d >= 0; d--) { 82 | int bsize = N / (1 << d + 1); 83 | 84 | ex::sender auto dSweep = schedule(sch) | ex::bulk(bsize, [=](int k) { 85 | // stride1 = 2^(d+1) 86 | int st1 = 1 << d + 1; 87 | // stride2 = 2^d 88 | int st2 = 1 << d; 89 | // only the threads at indices (k+1) * 2^(d+1) -1 will compute 90 | int myIdx = (k + 1) * st1 - 1; 91 | 92 | // update y[myIdx] and y[myIdx-stride2] 93 | auto 
tmp = y[myIdx]; 94 | y[myIdx] += y[myIdx - st2]; 95 | y[myIdx - st2] = tmp; 96 | }); 97 | 98 | // wait for downsweep 99 | ex::sync_wait(dSweep); 100 | } 101 | 102 | // return the computed results. 103 | return y; 104 | } 105 | 106 | // 107 | // simulation 108 | // 109 | int main(int argc, char* argv[]) { 110 | // parse params 111 | const prefixSum_params_t args = argparse::parse(argc, argv); 112 | 113 | // see if help wanted 114 | if (args.help) { 115 | args.print(); // prints all variables 116 | return 0; 117 | } 118 | 119 | // simulation variables 120 | int N = args.N; 121 | bool print_arr = args.print_arr; 122 | bool print_time = args.print_time; 123 | bool validate = args.validate; 124 | std::string sched = args.sch; 125 | int nthreads = args.nthreads; 126 | 127 | if (!isPowOf2(N)) { 128 | N = ceilPowOf2(N); 129 | fmt::print("INFO: N != pow(2). Setting => N = {}\n", N); 130 | } 131 | 132 | // input data 133 | data_t* in = new data_t[N]; 134 | 135 | fmt::print("Progress:0%"); 136 | 137 | // random number generator 138 | psum::genRandomVector(in, N, (data_t)0, (data_t)10); 139 | 140 | fmt::print("..50%"); 141 | 142 | // output pointer 143 | data_t* out = nullptr; 144 | 145 | // start the timer 146 | Timer timer; 147 | 148 | // initialize stdexec scheduler 149 | sch_t scheduler = get_sch_enum(sched); 150 | 151 | // launch with appropriate stdexec scheduler 152 | switch (scheduler) { 153 | case sch_t::CPU: 154 | out = prefixSum(exec::static_thread_pool(nthreads).get_scheduler(), in, N); 155 | break; 156 | #if defined(USE_GPU) 157 | case sch_t::GPU: 158 | out = prefixSum(nvexec::stream_context().get_scheduler(), in, N); 159 | break; 160 | case sch_t::MULTIGPU: 161 | out = prefixSum(nvexec::multi_gpu_stream_context().get_scheduler(), in, N); 162 | break; 163 | #endif // USE_GPU 164 | default: 165 | throw std::runtime_error("Run: `prefixSum-stdexec --help` to see the list of available schedulers"); 166 | } 167 | 168 | // stop timer 169 | auto elapsed = timer.stop(); 
170 | 171 | fmt::print("..100%\n"); 172 | 173 | // print the input and its prefix sum (don't if N > 100) 174 | if (print_arr && N < 100) { 175 | fmt::print("int = {}\n", fmt::join(in, in + N, " ")); 176 | fmt::print("out = {}\n", fmt::join(out + 1, out + 1 + N, " ")); 177 | } 178 | 179 | // print the elapsed time 180 | if (print_time) 181 | fmt::print("Elapsed Time: {:f} s\n", elapsed); 182 | 183 | // validate the prefixSum 184 | if (validate) { 185 | bool verify = psum::validatePrefixSum(in, out + 1, N); 186 | 187 | if (verify) 188 | fmt::print("SUCCESS.."); 189 | else 190 | fmt::print("FAILED.."); 191 | 192 | fmt::print("\n"); 193 | } 194 | 195 | // return status 196 | return 0; 197 | } 198 | -------------------------------------------------------------------------------- /apps/heat-equation/heat-equation-stdexec.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 
17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 25 | */ 26 | 27 | /* 28 | * Simplified 2d heat equation example derived from amrex 29 | */ 30 | 31 | #define HEQ_STDEXEC 32 | #include "heat-equation.hpp" 33 | #include "repeat_n/repeat_n.cuh" 34 | 35 | // 2D jacobi algorithm pipeline 36 | void heat_equation(scheduler auto sch, Real_t* phi_old, Real_t* phi_new, Real_t* dx, Real_t dt, Real_t alpha, 37 | int nsteps, int ncells, bool print = false) { 38 | // init simulation time 39 | Real_t time = 0.0; 40 | auto phi_old_extent = ncells + nghosts; 41 | int gsize = ncells * ncells; 42 | 43 | // initialize dx on CPU 44 | for (int i = 0; i < dims; ++i) 45 | dx[i] = 1.0 / (ncells - 1); 46 | 47 | // set cout precision 48 | fmt::print("HEQ progress: "); 49 | 50 | ex::sender auto begin = schedule(sch); 51 | 52 | auto heat_eq_init = ex::bulk(begin, gsize, [=](int pos) { 53 | int i = 1 + (pos / ncells); 54 | int j = 1 + (pos % ncells); 55 | 56 | Real_t x = pos(i, ghost_cells, dx[0]); 57 | Real_t y = pos(j, ghost_cells, dx[1]); 58 | 59 | // L2 distance (r2 from origin) 60 | Real_t r2 = (x * x + y * y) / (0.01); 61 | 62 | // phi(x,y) = 1 + exp(-r^2) 63 | phi_old[(i)*phi_old_extent + j] = 1 + exp(-r2); 64 | }); 65 | 66 | ex::sync_wait(std::move(heat_eq_init)); 67 | 68 | if (print) 69 | printGrid(phi_old, ncells + nghosts); 70 | 71 | auto fillBoundary = [=](int pos) { 72 | int i = pos + ghost_cells; 73 | int len = phi_old_extent; 74 | // fill boundary cells in old_phi 75 | phi_old[i] = phi_old[i + (ghost_cells * len)]; 76 | 
phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))]; 77 | phi_old[i * len] = phi_old[(ghost_cells * len) + i]; 78 | phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)]; 79 | }; 80 | 81 | auto jacobi = [=](int pos) { 82 | int i = 1 + (pos / ncells); 83 | int j = 1 + (pos % ncells); 84 | 85 | // Jacobi iteration 86 | phi_new[(i - 1) * ncells + j - 1] = 87 | phi_old[(i)*phi_old_extent + j] + 88 | alpha * dt * 89 | ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] + 90 | phi_old[(i - 1) * phi_old_extent + j]) / 91 | (dx[0] * dx[0]) + 92 | (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] + 93 | phi_old[(i)*phi_old_extent + j - 1]) / 94 | (dx[1] * dx[1])); 95 | }; 96 | 97 | auto parallelCopy = [=](int pos) { 98 | int i = 1 + (pos / ncells); 99 | int j = 1 + (pos % ncells); 100 | phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)]; 101 | }; 102 | 103 | // evolve the system 104 | #if !defined(USE_GPU) 105 | for (auto iter = 0; iter < nsteps; iter++) 106 | #endif 107 | stdexec::sync_wait( 108 | #if defined(USE_GPU) 109 | ex::just() | exec::on(sch, repeat_n(nsteps, 110 | #else 111 | stdexec::schedule(sch) | 112 | #endif // USE_GPU 113 | ex::bulk(phi_old_extent - nghosts, [=](int k) { fillBoundary(k); }) | 114 | ex::bulk(gsize, [=](int k) { jacobi(k); }) | 115 | ex::bulk(gsize, [=](int k) { parallelCopy(k); }) 116 | #if defined(USE_GPU) 117 | )) 118 | #endif // USE_GPU 119 | ); 120 | 121 | // update the simulation time 122 | time += nsteps * dt; 123 | 124 | // print final progress mark 125 | fmt::print("100% \n"); 126 | 127 | return; 128 | } 129 | 130 | // 131 | // simulation 132 | // 133 | int main(int argc, char* argv[]) { 134 | // parse params 135 | const heat_params_t args = argparse::parse(argc, argv); 136 | 137 | // see if help wanted 138 | if (args.help) { 139 | args.print(); // prints all variables 140 | return 0; 141 | } 
142 | 143 | // simulation variables 144 | int ncells = args.ncells; 145 | int nsteps = args.nsteps; 146 | int nthreads = args.nthreads; 147 | Real_t dt = args.dt; 148 | Real_t alpha = args.alpha; 149 | std::string sched = args.sch; 150 | 151 | // initialize dx, dy, dz 152 | std::vector ds(dims); 153 | // simulation setup (2D) 154 | std::vector grid_old((ncells + nghosts) * (ncells + nghosts)); 155 | std::vector grid_new(ncells * ncells); 156 | 157 | // data pointers 158 | Real_t* dx = ds.data(); 159 | Real_t* phi_old = grid_old.data(); 160 | Real_t* phi_new = grid_new.data(); 161 | 162 | // initialize stdexec scheduler 163 | sch_t scheduler = get_sch_enum(sched); 164 | 165 | // init timer 166 | Timer timer; 167 | 168 | // launch with appropriate stdexec scheduler 169 | switch (scheduler) { 170 | case sch_t::CPU: 171 | heat_equation(exec::static_thread_pool{nthreads}.get_scheduler(), phi_old, phi_new, dx, dt, alpha, nsteps, 172 | ncells, args.print_grid); 173 | break; 174 | #if defined(USE_GPU) 175 | case sch_t::GPU: 176 | heat_equation(nvexec::stream_context().get_scheduler(), phi_old, phi_new, dx, dt, alpha, nsteps, ncells, 177 | args.print_grid); 178 | break; 179 | case sch_t::MULTIGPU: 180 | heat_equation(nvexec::multi_gpu_stream_context().get_scheduler(), phi_old, phi_new, dx, dt, alpha, nsteps, 181 | ncells, args.print_grid); 182 | break; 183 | #endif // USE_GPU 184 | default: 185 | throw std::runtime_error("Run: `heat-equation-stdexec --help` to see the list of available schedulers"); 186 | } 187 | 188 | auto elapsed = timer.stop(); 189 | 190 | // print timing 191 | if (args.print_time) { 192 | fmt::print("Duration: {:f} ms\n", elapsed); 193 | } 194 | 195 | if (args.print_grid) 196 | // print the final grid 197 | printGrid(phi_new, ncells); 198 | 199 | return 0; 200 | } -------------------------------------------------------------------------------- /apps/1d-stencil/1d-stdexec.cpp: 
-------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 Weile Wei 5 | * Copyright (c) 2023 The Regents of the University of California, 6 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 7 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 8 | * 9 | * Permission is hereby granted, free of charge, to any person obtaining a copy 10 | * of this software and associated documentation files (the "Software"), to deal 11 | * in the Software without restriction, including without limitation the rights 12 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 13 | * copies of the Software, and to permit persons to whom the Software is 14 | * furnished to do so, subject to the following conditions: 15 | * 16 | * The above copyright notice and this permission notice shall be included in 17 | * all copies or substantial portions of the Software. 18 | * 19 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | * SOFTWARE. 26 | */ 27 | // 28 | // This example provides a stdexec implementation for the 1D stencil code. 
29 | #include 30 | #if defined(USE_GPU) 31 | #include 32 | #include 33 | #endif 34 | #include 35 | 36 | #include "argparse/argparse.hpp" 37 | #include "commons.hpp" 38 | #include "repeat_n/repeat_n.cuh" 39 | 40 | // parameters 41 | struct args_params_t : public argparse::Args { 42 | bool& results = kwarg("results", "print generated results (default: false)").set_default(false); 43 | std::uint64_t& nt = kwarg("nt", "Number of time steps").set_default(45); 44 | std::uint64_t& size = kwarg("size", "Number of elements").set_default(10); 45 | bool& k = kwarg("k", "Heat transfer coefficient").set_default(0.5); 46 | double& dt = kwarg("dt", "Timestep unit (default: 1.0[s])").set_default(1.0); 47 | double& dx = kwarg("dx", "Local x dimension").set_default(1.0); 48 | bool& help = flag("h, help", "print help"); 49 | bool& time = kwarg("t, time", "print time").set_default(true); 50 | std::string& sch = kwarg("sch", 51 | "stdexec scheduler: [options: cpu" 52 | #if defined(USE_GPU) 53 | ", gpu, multigpu" 54 | #endif //USE_GPU 55 | "]") 56 | .set_default("cpu"); 57 | 58 | int& nthreads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); 59 | }; 60 | 61 | using Real_t = double; 62 | /////////////////////////////////////////////////////////////////////////////// 63 | // Command-line variables 64 | constexpr Real_t k = 0.5; // heat transfer coefficient 65 | constexpr Real_t dt = 1.; // time step 66 | constexpr Real_t dx = 1.; // grid spacing 67 | 68 | /////////////////////////////////////////////////////////////////////////////// 69 | //[stepper_1 70 | struct stepper { 71 | 72 | // do all the work on 'size' data points for 'nt' time steps 73 | [[nodiscard]] std::vector do_work(const auto& sch, std::size_t size, std::size_t nt) { 74 | std::vector current(size); 75 | std::vector next(size); 76 | 77 | Real_t** next_ptr_ptr = new Real_t*(next.data()); 78 | Real_t** current_ptr_ptr = new Real_t*(current.data()); 79 | 80 | stdexec::sender auto init 
= stdexec::bulk(stdexec::schedule(sch), size, [=](int i) { 81 | auto current_ptr = *current_ptr_ptr; 82 | ; 83 | current_ptr[i] = (Real_t)i; 84 | }); 85 | stdexec::sync_wait(std::move(init)); 86 | 87 | #if !defined(USE_GPU) 88 | for (auto iter = 0; iter < nt; iter++) 89 | #endif 90 | // evolve the system 91 | stdexec::sync_wait( 92 | #if defined(USE_GPU) 93 | ex::just() | 94 | exec::on(sch, repeat_n(nt, 95 | #else 96 | stdexec::schedule(sch) | 97 | #endif 98 | stdexec::bulk(size, 99 | [=](int i) { 100 | auto current_ptr = *current_ptr_ptr; 101 | auto next_ptr = *next_ptr_ptr; 102 | 103 | std::size_t left = (i == 0) ? size - 1 : i - 1; 104 | std::size_t right = (i == size - 1) ? 0 : i + 1; 105 | next_ptr[i] = current_ptr[i] + 106 | (k * dt / (dx * dx)) * 107 | (current_ptr[left] - 2 * current_ptr[i] + 108 | current_ptr[right]); 109 | }) | 110 | stdexec::then([=]() { std::swap(*next_ptr_ptr, *current_ptr_ptr); }) 111 | #if defined(USE_GPU) 112 | )) 113 | #endif // USE_GPU 114 | ); 115 | 116 | if (nt % 2 == 0) { 117 | return current; 118 | } 119 | return next; 120 | } 121 | }; 122 | 123 | /////////////////////////////////////////////////////////////////////////////// 124 | int benchmark(args_params_t const& args) { 125 | std::uint64_t size = args.size; // Number of elements. 126 | std::uint64_t nt = args.nt; // Number of steps. 127 | std::string sch_str = args.sch; // scheduler type 128 | int nthreads = args.nthreads; // number of threads for cpu scheduler type 129 | 130 | // Create the stepper object 131 | stepper step; 132 | 133 | // Measure execution time. 134 | Timer timer; 135 | 136 | // Execute nt time steps on size of elements. 
137 | // launch with appropriate stdexec scheduler 138 | std::vector solution; 139 | try { 140 | sch_t schedulerType = get_sch_enum(sch_str); 141 | 142 | switch (schedulerType) { 143 | case sch_t::CPU: 144 | solution = step.do_work(exec::static_thread_pool(nthreads).get_scheduler(), size, nt); 145 | break; 146 | #if defined(USE_GPU) 147 | case sch_t::GPU: 148 | solution = step.do_work(nvexec::stream_context().get_scheduler(), size, nt); 149 | break; 150 | case sch_t::MULTIGPU: 151 | solution = step.do_work(nvexec::multi_gpu_stream_context().get_scheduler(), size, nt); 152 | break; 153 | #endif // USE_GPU 154 | default: 155 | std::cerr << "Unknown scheduler type encountered." << std::endl; 156 | break; 157 | } 158 | } catch (const std::invalid_argument& e) { 159 | std::cerr << e.what() << std::endl; 160 | exit(1); 161 | } 162 | 163 | auto time = timer.stop(); 164 | 165 | // Print the final solution 166 | if (args.results) { 167 | fmt::println("{::f}", solution); 168 | } 169 | 170 | if (args.time) { 171 | fmt::print("Duration: {:f} ms\n", time); 172 | } 173 | 174 | return 0; 175 | } 176 | 177 | int main(int argc, char* argv[]) { 178 | // parse params 179 | args_params_t args = argparse::parse(argc, argv); 180 | // see if help wanted 181 | if (args.help) { 182 | args.print(); // prints all variables 183 | return 0; 184 | } 185 | 186 | benchmark(args); 187 | 188 | return 0; 189 | } 190 | -------------------------------------------------------------------------------- /apps/fft/fft-stdexec.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * commons for the fft codes 29 | */ 30 | 31 | #define FFT_STDEXEC 32 | #include "fft.hpp" 33 | #include "repeat_n/repeat_n.cuh" 34 | 35 | // 36 | // fft algorithm 37 | // 38 | [[nodiscard]] std::vector fft(const data_t* x, scheduler auto sch, const int N, const int max_threads, 39 | bool debug = false) { 40 | std::vector x_rev(N); 41 | 42 | data_t* x_r = x_rev.data(); 43 | 44 | // compute shift factor 45 | int shift = 32 - ilog2(N); 46 | 47 | // set cout precision 48 | fmt::print("FFT progress: "); 49 | 50 | // twiddle bits for fft 51 | ex::sender auto twiddle = ex::bulk(schedule(sch), N, [=](int k) { 52 | auto new_idx = reverse_bits32(k) >> shift; 53 | x_r[k] = x[new_idx]; 54 | }); 55 | ex::sync_wait(std::move(twiddle)); 56 | 57 | // mark progress of the twiddle stage 58 | fmt::print("50%.."); 59 | 60 | // niterations 61 | int niters = ilog2(N); 62 | 63 | // pointer to local partition size (must be dynamic mem to be copied to GPU) 64 | int* lN_ptr = new int(1); 65 | 66 | #if !defined(USE_GPU) 67 | for (auto iter = 0; iter < niters; iter++) 68 | #endif 69 | // evolve the system 70 | stdexec::sync_wait( 71 | #if defined(USE_GPU) 72 | // iterate until niters - lN*=2 after each iteration 73 | ex::just() | exec::on(sch, repeat_n(niters, 74 | #else 75 | stdexec::schedule(sch) | 76 | #endif // USE_GPU 77 | ex::then([=]() { *lN_ptr *= 2; }) | 78 | ex::bulk(N / 2, 79 | [=](int k) { 80 | // extract lN from pointer 81 | int lN = *lN_ptr; 82 | 83 | // number of partitions 84 | int nparts = N / lN; 85 | int tpp = lN / 2; 86 | 87 | // compute indices 88 | int e = (k / tpp) * lN + (k % tpp); 89 | auto o = e + tpp; 90 | auto i = (k % tpp); 91 | 92 | // compute 2-pt DFT 93 | auto tmp = x_r[e] + x_r[o] * WNk(N, i * nparts); 94 | x_r[o] = x_r[e] - x_r[o] * WNk(N, i * nparts); 95 | x_r[e] = tmp; 96 | }) 97 | #if defined(USE_GPU) 98 | )) 99 | #endif // USE_GPU 100 | ); 101 | 102 | // print final progress mark 103 | fmt::print("100%\n"); 104 | 105 | // return 
x_rev = fft(x_r) 106 | return x_rev; 107 | } 108 | 109 | // 110 | // simulation 111 | // 112 | int main(int argc, char* argv[]) { 113 | // parse params 114 | const fft_params_t args = argparse::parse(argc, argv); 115 | 116 | // see if help wanted 117 | if (args.help) { 118 | args.print(); // prints all variables 119 | return 0; 120 | } 121 | 122 | // simulation variables 123 | int N = args.N; 124 | sig_type_t sig_type = sig_type_t::box; 125 | int max_threads = args.max_threads; 126 | //int freq = args.freq; 127 | bool print_sig = args.print_sig; 128 | bool print_time = args.print_time; 129 | bool validate = args.validate; 130 | std::string sched = args.sch; 131 | 132 | // x[n] signal 133 | sig_t x_n(N, sig_type); 134 | 135 | if (!isPowOf2(N)) { 136 | N = ceilPowOf2(N); 137 | fmt::print("INFO: N is not a power of 2. Padding zeros => N = {}\n", N); 138 | 139 | x_n.resize(N); 140 | } 141 | 142 | if (print_sig) { 143 | fmt::print("\nx[n] = "); 144 | x_n.printSignal(); 145 | } 146 | 147 | // y[n] = fft(x[n]); 148 | std::vector y(N); 149 | 150 | // start the timer here 151 | Timer timer; 152 | 153 | // initialize stdexec scheduler 154 | sch_t scheduler = get_sch_enum(sched); 155 | 156 | // launch with appropriate stdexec scheduler 157 | switch (scheduler) { 158 | case sch_t::CPU: 159 | y = fft(x_n.data(), exec::static_thread_pool(max_threads).get_scheduler(), N, max_threads, args.debug); 160 | break; 161 | #if defined(USE_GPU) 162 | case sch_t::GPU: 163 | y = fft(x_n.data(), nvexec::stream_context().get_scheduler(), N, 1024 * 108, args.debug); 164 | break; 165 | case sch_t::MULTIGPU: 166 | y = fft(x_n.data(), nvexec::multi_gpu_stream_context().get_scheduler(), N, 4 * 1024 * 108, args.debug); 167 | break; 168 | #endif // USE_GPU 169 | default: 170 | throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); 171 | } 172 | 173 | // y[n] = fft(x[n]) 174 | sig_t y_n(y); 175 | 176 | // stop timer 177 | auto elapsed = timer.stop(); 178 | 179 
| // print the fft(x) 180 | if (print_sig) { 181 | fmt::print("X(k) = "); 182 | y_n.printSignal(); 183 | } 184 | 185 | // print the computation time 186 | if (print_time) 187 | fmt::print("Elapsed Time: {:f} ms\n", elapsed); 188 | 189 | // validate the recursively computed fft 190 | if (validate) { 191 | bool verify = true; 192 | // launch with appropriate stdexec scheduler 193 | switch (scheduler) { 194 | case sch_t::CPU: 195 | verify = x_n.isFFT(y_n, exec::static_thread_pool(max_threads).get_scheduler()); 196 | break; 197 | #if defined(USE_GPU) 198 | case sch_t::GPU: 199 | verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); 200 | break; 201 | case sch_t::MULTIGPU: 202 | verify = x_n.isFFT(y_n, nvexec::stream_context().get_scheduler()); 203 | break; 204 | #endif // USE_GPU 205 | default: 206 | throw std::runtime_error("Run: `fft-stdexec --help` to see the list of available schedulers"); 207 | } 208 | 209 | if (verify) { 210 | fmt::print("SUCCESS: y[n] == fft(x[n])\n"); 211 | } else { 212 | fmt::print("FAILED: y[n] != fft(x[n])\n"); 213 | } 214 | } 215 | 216 | return 0; 217 | } 218 | -------------------------------------------------------------------------------- /apps/choleskey/choleskey_stdpar_snd.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 Chuanqiu He 5 | * Copyright (c) 2023 Weile Wei 6 | * Copyright (c) 2023 The Regents of the University of California, 7 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 8 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
9 | * 10 | * Permission is hereby granted, free of charge, to any person obtaining a copy 11 | * of this software and associated documentation files (the "Software"), to deal 12 | * in the Software without restriction, including without limitation the rights 13 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 14 | * copies of the Software, and to permit persons to whom the Software is 15 | * furnished to do so, subject to the following conditions: 16 | * 17 | * The above copyright notice and this permission notice shall be included in 18 | * all copies or substantial portions of the Software. 19 | * 20 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 21 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 22 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 23 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 24 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 25 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 26 | * SOFTWARE. 27 | */ 28 | // 29 | // This example provides a stdexec(senders/receivers) implementation for choleskey decomposition code. 
30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include "argparse/argparse.hpp" 36 | #include "commons.hpp" 37 | #include "exec/static_thread_pool.hpp" 38 | 39 | #include "matrixutil.hpp" 40 | 41 | using namespace std; 42 | 43 | struct solver { 44 | 45 | using view_2d = std::extents; 46 | 47 | template 48 | std::vector> Cholesky_Decomposition(std::vector& vec, int n, int np) { 49 | 50 | // test here first, scheduler from a thread pool 51 | exec::static_thread_pool pool(np); 52 | stdexec::scheduler auto sch = pool.get_scheduler(); 53 | stdexec::sender auto begin = stdexec::schedule(sch); 54 | 55 | std::vector> lower(n, std::vector(n, 0)); 56 | 57 | auto matrix_ms = std::mdspan(vec.data(), n, n); 58 | 59 | auto multiplier_lambda = [=](auto a, auto b) { 60 | return a * b; 61 | }; 62 | 63 | for (int i = 0; i < matrix_ms.extent(0); i++) { 64 | for (int j = 0; j <= i; j++) { 65 | // avoid over parallelize 66 | if (j == 0) { 67 | np = 1; 68 | } else if (j > 0 && np > j) { 69 | np = j; 70 | } 71 | 72 | if (j == i) // summation for diagonals 73 | { 74 | 75 | if (i == 0 && j == 0) { 76 | lower[j][j] = std::sqrt(matrix_ms(i, j)); 77 | } else { 78 | 79 | std::vector sum_vec(np); // sub res for each piece 80 | int size = j; // there are j elements need to be calculated(power) 81 | 82 | stdexec::sender auto send1 = 83 | stdexec::bulk(begin, np, 84 | [&](int piece) { 85 | int start = piece * size / np; 86 | int chunk_size = size / np; 87 | int remaining = size % np; 88 | chunk_size += (piece == np - 1) ? 
remaining : 0; 89 | 90 | sum_vec[piece] = std::transform_reduce( 91 | std::execution::par, counting_iterator(start), 92 | counting_iterator(start + chunk_size), 0, std ::plus{}, 93 | [=](int val) { return lower[j][val] * lower[j][val]; }); 94 | }) | 95 | stdexec::then([&sum_vec]() { 96 | return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); 97 | }); 98 | 99 | auto [sum1] = stdexec::sync_wait(std::move(send1)).value(); 100 | 101 | lower[j][j] = std::sqrt(matrix_ms(i, j) - sum1); 102 | } 103 | 104 | } else { 105 | // Evaluating L(i, j) using L(j, j) 106 | 107 | if (j == 0) { 108 | lower[i][j] = (matrix_ms(i, j)) / lower[j][j]; 109 | } else { 110 | 111 | std::vector sum_vec(np); // sub_result for each par piece 112 | int size_nondiag = j; 113 | 114 | stdexec::sender auto send2 = 115 | stdexec::bulk(begin, np, 116 | [&](int piece) { 117 | int start = piece * size_nondiag / np; 118 | int chunk_size = size_nondiag / np; 119 | int remaining = size_nondiag % np; 120 | chunk_size += (piece == np - 1) ? remaining : 0; 121 | 122 | sum_vec[piece] = std::transform_reduce( 123 | std::execution::par, counting_iterator(start), 124 | counting_iterator(start + chunk_size), 0, std ::plus{}, 125 | [=](int k) { return lower[j][k] * lower[i][k]; }); 126 | }) | 127 | stdexec::then([&sum_vec]() { 128 | return std::reduce(std::execution::par, sum_vec.begin(), sum_vec.end()); 129 | }); 130 | 131 | auto [sum2] = stdexec::sync_wait(std::move(send2)).value(); 132 | 133 | lower[i][j] = (matrix_ms(i, j) - sum2) / lower[j][j]; 134 | } 135 | } 136 | } 137 | } 138 | return lower; 139 | } 140 | }; 141 | 142 | /////////////////////////////////////////////////////////////////////////////// 143 | int benchmark(args_params_t const& args) { 144 | 145 | std::uint64_t nd = args.nd; // Number of matrix dimension. 146 | std::uint64_t np = args.np; // Number of parallel partitions. 
147 | 148 | std::vector inputMatrix = generate_pascal_matrix(nd); 149 | 150 | // Create the solver object 151 | solver solve; 152 | 153 | // Measure execution time. 154 | Timer timer; 155 | 156 | // start decomposation 157 | auto res_matrix = solve.Cholesky_Decomposition(inputMatrix, nd, np); 158 | auto time = timer.stop(); 159 | 160 | // Print the final results 161 | if (args.results) { 162 | // Displaying Lower Triangular and its Transpose 163 | fmt::print("{:>6} {:>30}\n", "Lower Triangular", "Transpose"); 164 | for (int i = 0; i < nd; i++) { 165 | // Lower Triangular 166 | for (int j = 0; j < nd; j++) 167 | fmt::print("{:>6}\t", res_matrix[i][j]); 168 | fmt::print("\t"); 169 | 170 | // Transpose of Lower Triangular 171 | for (int j = 0; j < nd; j++) 172 | fmt::print("{:>6}\t", res_matrix[j][i]); 173 | fmt::print("\n"); 174 | } 175 | } 176 | 177 | if (args.time) { 178 | fmt::print("Duration: {:f} ms\n", time); 179 | } 180 | 181 | return 0; 182 | } 183 | 184 | // Driver Code for testing 185 | int main(int argc, char* argv[]) { 186 | 187 | // parse params 188 | args_params_t args = argparse::parse(argc, argv); 189 | // see if help wanted 190 | if (args.help) { 191 | args.print(); // prints all variables 192 | return 0; 193 | } 194 | 195 | benchmark(args); 196 | 197 | return 0; 198 | } 199 | -------------------------------------------------------------------------------- /apps/heat-equation/heat-equation-cuda.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 
7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * Simplified 2d heat equation example derived from amrex 29 | */ 30 | 31 | #include 32 | 33 | #include "heat-equation.hpp" 34 |

using namespace std;

// grid spacing per dimension; filled from the host with cudaMemcpyToSymbol in main()
__constant__ Real_t dx[2];

#define cudaErrorCheck(ans) check((ans), __FILE__, __LINE__)

// error checking function: report a failed CUDA call with its source location
// and, unless `is_fatal` is false, terminate with the error code as exit status
template <typename T>
static inline void check(T result, const char* const file, const int line, bool is_fatal = true) {
    if (result != cudaSuccess) {
        std::cerr << "CUDA error at " << file << ":" << line << std::endl;
        std::cerr << cudaGetErrorString(result) << std::endl;

        if (is_fatal)
            exit(result);
    }
}

//
// initialize grid kernel
//
// Writes phi(x,y) = 1 + exp(-r^2) into the interior cells of the padded grid
// `phi`, whose row length is ncells + nghosts.
//
template <typename T>
__global__ void initialize(T* phi, int ncells, int ghost_cells) {
    int ind = blockIdx.x * blockDim.x + threadIdx.x;
    int d_nghosts = nghosts;
    int phi_old_extent = ncells + d_nghosts;
    int gsize = ncells * ncells;

    for (; ind < gsize; ind += blockDim.x * gridDim.x) {
        int i = 1 + (ind / ncells);
        int j = 1 + (ind % ncells);

        Real_t x = pos(i, ghost_cells, dx[0]);
        Real_t y = pos(j, ghost_cells, dx[1]);

        // L2 distance (r2 from origin)
        Real_t r2 = (x * x + y * y) / (0.01);

        // phi(x,y) = 1 + exp(-r^2)
        phi[(i)*phi_old_extent + j] = 1 + exp(-r2);
    }
}

//
// fill boundary kernel
//
// Copies the outermost interior rows/columns of `phi_old` into the adjacent
// ghost rows/columns.
//
template <typename T>
__global__ void fillBoundary(T* phi_old, int ncells, int ghost_cells) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int d_nghosts = nghosts;
    int phi_old_extent = ncells + d_nghosts;
    int len = phi_old_extent;

    for (; pos < phi_old_extent - nghosts; pos += blockDim.x * gridDim.x) {
        int i = pos + ghost_cells;

        // fill boundary cells in phi_old
        phi_old[i] = phi_old[i + (ghost_cells * len)];

        phi_old[i + (len * (len - ghost_cells))] = phi_old[i + (len * (len - ghost_cells - 1))];

        phi_old[i * len] = phi_old[(ghost_cells * len) + i];

        phi_old[(len - ghost_cells) + (len * i)] = phi_old[(len - ghost_cells - 1) + (len * i)];
    }
}

//
// jacobi 2d stencil kernel
//
// Reads the padded grid `phi_old` and writes one explicit time step into the
// unpadded ncells x ncells grid `phi_new`.
//
template <typename T>
__global__ void jacobi(T* phi_old, T* phi_new, int ncells, Real_t alpha, Real_t dt) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int d_nghosts = nghosts;
    int phi_old_extent = ncells + d_nghosts;
    int gsize = ncells * ncells;

    for (; pos < gsize; pos += blockDim.x * gridDim.x) {
        int i = 1 + (pos / ncells);
        int j = 1 + (pos % ncells);

        // Jacobi iteration
        phi_new[(i - 1) * ncells + j - 1] =
            phi_old[(i)*phi_old_extent + j] +
            alpha * dt *

                ((phi_old[(i + 1) * phi_old_extent + j] - 2.0 * phi_old[(i)*phi_old_extent + j] +
                  phi_old[(i - 1) * phi_old_extent + j]) /
                     (dx[0] * dx[0]) +

                 (phi_old[(i)*phi_old_extent + j + 1] - 2.0 * phi_old[(i)*phi_old_extent + j] +
                  phi_old[(i)*phi_old_extent + j - 1]) /
                     (dx[1] * dx[1]));
    }
}

//
// parallelCopy kernel
//
// Copies the unpadded result `phi_new` back into the interior of `phi_old`.
//
template <typename T>
__global__ void parallelCopy(T* phi_old, T* phi_new, int ncells) {
    int pos = blockIdx.x * blockDim.x + threadIdx.x;
    int d_nghosts = nghosts;
    int phi_old_extent = ncells + d_nghosts;
    int gsize = ncells * ncells;

    for (; pos < gsize; pos += blockDim.x * gridDim.x) {
        int i = 1 + (pos / ncells);
        int j = 1 + (pos % ncells);
        phi_old[(i)*phi_old_extent + j] = phi_new[(i - 1) * ncells + (j - 1)];
    }
}

//
// main simulation
//
int main(int argc, char* argv[]) {
    // parse params
    const heat_params_t args = argparse::parse<heat_params_t>(argc, argv);

    // see if help wanted
    if (args.help) {
        args.print();  // prints all variables
        return 0;
    }

    // simulation variables
    int ncells = args.ncells;
    int nsteps = args.nsteps;
    Real_t dt = args.dt;
    Real_t alpha = args.alpha;

    // init simulation time
    Real_t time = 0.0;

    // initialize dx, dy, dz
    Real_t h_dx[dims];
    for (int i = 0; i < dims; ++i)
        h_dx[i] = 1.0 / (ncells - 1);

    cudaErrorCheck(cudaMemcpyToSymbol(dx, h_dx, sizeof(Real_t) * dims));

    // grid size
    int gsize = ncells * ncells;

    // host memory for printing
    Real_t* h_phi = nullptr;

    // simulation setup (2D)
    Real_t* phi_old = nullptr;
    Real_t* phi_new = nullptr;

    cudaErrorCheck(cudaMalloc(&phi_old, sizeof(Real_t) * ((ncells + nghosts) * (ncells + nghosts))));
    cudaErrorCheck(cudaMalloc(&phi_new, sizeof(Real_t) * ((ncells) * (ncells))));

    // setup grid
    int blockSize = std::min(1024, gsize);  // let's do at most 1024 threads.
    int nBlocks = (gsize + blockSize - 1) / blockSize;

    // fillBoundary works on one row/column at a time and gets its own launch shape
    const int fBblock = std::min(1024, ncells);               // let's do at most 1024 threads.
    const int fBnBlocks = (ncells + fBblock - 1) / fBblock;  // fillBoundary blocks

    Timer timer;

    // NOTE(review): the <<<grid, block>>> arguments of every launch below were
    // lost in extraction and are restored from the launch dimensions computed
    // above - confirm against the repository.

    // initialize grid
    initialize<<<nBlocks, blockSize>>>(phi_old, ncells, ghost_cells);
    cudaErrorCheck(cudaGetLastError());  // report launch failures

    cudaErrorCheck(cudaDeviceSynchronize());

    // print initial grid if needed
    if (args.print_grid) {
        // copy initial grid to host
        h_phi = new Real_t[(ncells + nghosts) * (ncells + nghosts)];
        cudaErrorCheck(cudaMemcpy(h_phi, phi_old, sizeof(Real_t) * (ncells + nghosts) * (ncells + nghosts),
                                  cudaMemcpyDeviceToHost));

        printGrid(h_phi, ncells + nghosts);
    }

    // evolve the system
    for (auto step = 0; step < nsteps; step++) {
        // fillboundary
        fillBoundary<<<fBnBlocks, fBblock>>>(phi_old, ncells, ghost_cells);
        cudaErrorCheck(cudaGetLastError());

        // jacobi
        jacobi<<<nBlocks, blockSize>>>(phi_old, phi_new, ncells, alpha, dt);
        cudaErrorCheck(cudaGetLastError());

        // parallelCopy
        parallelCopy<<<nBlocks, blockSize>>>(phi_old, phi_new, ncells);
        cudaErrorCheck(cudaGetLastError());

        cudaErrorCheck(cudaDeviceSynchronize());

        // update time
        time += dt;
    }

    auto elapsed = timer.stop();

    // print timing
    if (args.print_time) {
        fmt::print("Duration: {:f} ms\n", elapsed);
    }

    // print final grid if needed
    if (args.print_grid) {
        cudaErrorCheck(cudaMemcpy(h_phi, phi_new, sizeof(Real_t) * gsize, cudaMemcpyDeviceToHost));
        printGrid(h_phi, ncells);

        // free host memory
        delete[] h_phi;
        h_phi = nullptr;
    }

    // free device memory
    cudaErrorCheck(cudaFree(phi_old));
    cudaErrorCheck(cudaFree(phi_new));

    return 0;
}
-------------------------------------------------------------------------------- /.cmake-format.py: -------------------------------------------------------------------------------- 1 | # ---------------------------------- 2 | # Options affecting listfile parsing 3 | # ---------------------------------- 4 | with section("parse"): 5 | 6 | # Specify structure for custom cmake functions 7 | additional_commands = { 'foo': { 'flags': ['BAR', 'BAZ'], 8 | 'kwargs': {'DEPENDS': '*', 'HEADERS': '*', 'SOURCES': '*'}}} 9 | 10 | # Override configurations per-command where available 11 | override_spec = {} 12 | 13 | # Specify variable tags. 14 | vartags = [] 15 | 16 | # Specify property tags. 17 | proptags = [] 18 | 19 | # ----------------------------- 20 | # Options affecting formatting.
21 | # ----------------------------- 22 | with section("format"): 23 | 24 | # Disable formatting entirely, making cmake-format a no-op 25 | disable = False 26 | 27 | # How wide to allow formatted cmake files 28 | line_width = 80 29 | 30 | # How many spaces to tab for indent 31 | tab_size = 2 32 | 33 | # If true, lines are indented using tab characters (utf-8 0x09) instead of 34 | # space characters (utf-8 0x20). In cases where the layout would 35 | # require a fractional tab character, the behavior of the fractional 36 | # indentation is governed by fractional_tab_policy 37 | use_tabchars = False 38 | 39 | # If use_tabchars is True, then the value of this variable indicates how 40 | # fractional indentations are handled during whitespace replacement. If set to 41 | # 'use-space', fractional indentation is left as spaces (utf-8 0x20). If set 42 | # to `round-up` fractional indentation is replaced with a single tab character 43 | # (utf-8 0x09) effectively shifting the column to the next tabstop 44 | fractional_tab_policy = 'use-space' 45 | 46 | # If an argument group contains more than this many sub-groups (parg or kwarg 47 | # groups) then force it to a vertical layout. 48 | max_subgroups_hwrap = 2 49 | 50 | # If a positional argument group contains more than this many arguments, then 51 | # force it to a vertical layout. 52 | max_pargs_hwrap = 6 53 | 54 | # If a cmdline positional group consumes more than this many lines without 55 | # nesting, then invalidate the layout (and nest) 56 | max_rows_cmdline = 2 57 | 58 | # If true, separate flow control names from their parentheses with a space 59 | separate_ctrl_name_with_space = False 60 | 61 | # If true, separate function names from parentheses with a space 62 | separate_fn_name_with_space = False 63 | 64 | # If a statement is wrapped to more than one line, then dangle the closing 65 | # parenthesis on its own line.
66 | dangle_parens = False 67 | 68 | # If the trailing parenthesis must be 'dangled' on its own line, then align it 69 | # to this reference: `prefix`: the start of the statement, `prefix-indent`: 70 | # the start of the statement, plus one indentation level, `child`: align to 71 | # the column of the arguments 72 | dangle_align = 'prefix' 73 | 74 | # If the statement spelling length (including space and parenthesis) is 75 | # smaller than this amount, then force reject nested layouts. 76 | min_prefix_chars = 4 77 | 78 | # If the statement spelling length (including space and parenthesis) is larger 79 | # than the tab width by more than this amount, then force reject un-nested 80 | # layouts. 81 | max_prefix_chars = 10 82 | 83 | # If a candidate layout is wrapped horizontally but it exceeds this many 84 | # lines, then reject the layout. 85 | max_lines_hwrap = 2 86 | 87 | # What style line endings to use in the output. 88 | line_ending = 'unix' 89 | 90 | # Format command names consistently as 'lower' or 'upper' case 91 | command_case = 'canonical' 92 | 93 | # Format keywords consistently as 'lower' or 'upper' case 94 | keyword_case = 'unchanged' 95 | 96 | # A list of command names which should always be wrapped 97 | always_wrap = [] 98 | 99 | # If true, the argument lists which are known to be sortable will be sorted 100 | # lexicographically 101 | enable_sort = True 102 | 103 | # If true, the parsers may infer whether or not an argument list is sortable 104 | # (without annotation). 105 | autosort = False 106 | 107 | # By default, if cmake-format cannot successfully fit everything into the 108 | # desired linewidth it will apply the last, most agressive attempt that it 109 | # made. If this flag is True, however, cmake-format will print error, exit 110 | # with non-zero status code, and write-out nothing 111 | require_valid_layout = False 112 | 113 | # A dictionary mapping layout nodes to a list of wrap decisions. See the 114 | # documentation for more information.
115 | layout_passes = {} 116 | 117 | # ------------------------------------------------ 118 | # Options affecting comment reflow and formatting. 119 | # ------------------------------------------------ 120 | with section("markup"): 121 | 122 | # What character to use for bulleted lists 123 | bullet_char = '*' 124 | 125 | # What character to use as punctuation after numerals in an enumerated list 126 | enum_char = '.' 127 | 128 | # If comment markup is enabled, don't reflow the first comment block in each 129 | # listfile. Use this to preserve formatting of your copyright/license 130 | # statements. 131 | first_comment_is_literal = False 132 | 133 | # If comment markup is enabled, don't reflow any comment block which matches 134 | # this (regex) pattern. Default is `None` (disabled). 135 | literal_comment_pattern = None 136 | 137 | # Regular expression to match preformat fences in comments. Default is 138 | # ``r'^\s*([`~]{3}[`~]*)(.*)$'`` 139 | fence_pattern = '^\\s*([`~]{3}[`~]*)(.*)$' 140 | 141 | # Regular expression to match rulers in comments. Default is 142 | # ``r'^\s*[^\w\s]{3}.*[^\w\s]{3}$'`` 143 | ruler_pattern = '^\\s*[^\\w\\s]{3}.*[^\\w\\s]{3}$' 144 | 145 | # If a comment line matches starts with this pattern then it is explicitly a 146 | # trailing comment for the preceeding argument. Default is '#<' 147 | explicit_trailing_pattern = '#<' 148 | 149 | # If a comment line starts with at least this many consecutive hash 150 | # characters, then don't lstrip() them off.
This allows for lazy hash rulers 151 | # where the first hash char is not separated by space 152 | hashruler_min_length = 10 153 | 154 | # If true, then insert a space between the first hash char and remaining hash 155 | # chars in a hash ruler, and normalize its length to fill the column 156 | canonicalize_hashrulers = True 157 | 158 | # enable comment markup parsing and reflow 159 | enable_markup = True 160 | 161 | # ---------------------------- 162 | # Options affecting the linter 163 | # ---------------------------- 164 | with section("lint"): 165 | 166 | # a list of lint codes to disable 167 | disabled_codes = [] 168 | 169 | # regular expression pattern describing valid function names 170 | function_pattern = '[0-9a-z_]+' 171 | 172 | # regular expression pattern describing valid macro names 173 | macro_pattern = '[0-9A-Z_]+' 174 | 175 | # regular expression pattern describing valid names for variables with global 176 | # (cache) scope 177 | global_var_pattern = '[A-Z][0-9A-Z_]+' 178 | 179 | # regular expression pattern describing valid names for variables with global 180 | # scope (but internal semantic) 181 | internal_var_pattern = '_[A-Z][0-9A-Z_]+' 182 | 183 | # regular expression pattern describing valid names for variables with local 184 | # scope 185 | local_var_pattern = '[a-z][a-z0-9_]+' 186 | 187 | # regular expression pattern describing valid names for privatedirectory 188 | # variables 189 | private_var_pattern = '_[0-9a-z_]+' 190 | 191 | # regular expression pattern describing valid names for public directory 192 | # variables 193 | public_var_pattern = '[A-Z][0-9A-Z_]+' 194 | 195 | # regular expression pattern describing valid names for function/macro 196 | # arguments and loop variables.
197 | argument_var_pattern = '[a-z][a-z0-9_]+' 198 | 199 | # regular expression pattern describing valid names for keywords used in 200 | # functions or macros 201 | keyword_pattern = '[A-Z][0-9A-Z_]+' 202 | 203 | # In the heuristic for C0201, how many conditionals to match within a loop in 204 | # before considering the loop a parser. 205 | max_conditionals_custom_parser = 2 206 | 207 | # Require at least this many newlines between statements 208 | min_statement_spacing = 1 209 | 210 | # Require no more than this many newlines between statements 211 | max_statement_spacing = 2 212 | max_returns = 6 213 | max_branches = 12 214 | max_arguments = 5 215 | max_localvars = 15 216 | max_statements = 50 217 | 218 | # ------------------------------- 219 | # Options affecting file encoding 220 | # ------------------------------- 221 | with section("encode"): 222 | 223 | # If true, emit the unicode byte-order mark (BOM) at the start of the file 224 | emit_byteorder_mark = False 225 | 226 | # Specify the encoding of the input file. Defaults to utf-8 227 | input_encoding = 'utf-8' 228 | 229 | # Specify the encoding of the output file. Defaults to utf-8. Note that cmake 230 | # only claims to support utf-8 so be careful when using anything else 231 | output_encoding = 'utf-8' 232 | 233 | # ------------------------------------- 234 | # Miscellaneous configurations options. 235 | # ------------------------------------- 236 | with section("misc"): 237 | 238 | # A dictionary containing any per-command configuration overrides. Currently 239 | # only `command_case` is supported.
240 | per_command = {} 241 | 242 | -------------------------------------------------------------------------------- /apps/fft/fft.hpp: -------------------------------------------------------------------------------- 1 | /* 2 | * MIT License 3 | * 4 | * Copyright (c) 2023 The Regents of the University of California, 5 | * through Lawrence Berkeley National Laboratory (subject to receipt of any 6 | * required approvals from the U.S. Dept. of Energy).All rights reserved. 7 | * 8 | * Permission is hereby granted, free of charge, to any person obtaining a copy 9 | * of this software and associated documentation files (the "Software"), to deal 10 | * in the Software without restriction, including without limitation the rights 11 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | * copies of the Software, and to permit persons to whom the Software is 13 | * furnished to do so, subject to the following conditions: 14 | * 15 | * The above copyright notice and this permission notice shall be included in 16 | * all copies or substantial portions of the Software. 17 | * 18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | * SOFTWARE. 
25 | */ 26 | 27 | /* 28 | * commons for the fft codes 29 | */ 30 | 31 | #pragma once 32 | 33 | #include 34 | #include 35 | 36 | #if defined(USE_GPU) 37 | #include 38 | #include 39 | using namespace nvexec; 40 | #endif //USE_GPU 41 | 42 | #include 43 | #include "argparse/argparse.hpp" 44 | 45 | #include "commons.hpp" 46 | 47 | using namespace std; 48 | using namespace stdexec; 49 | using namespace std::complex_literals; 50 | using stdexec::sync_wait; 51 | 52 | namespace ex = stdexec; 53 | 54 | // mdspan views 55 | using view_2d = std::extents; 56 | using view_1d = std::extents; 57 | 58 | // data type 59 | using Real_t = double; 60 | using data_t = std::complex; 61 | 62 | // enum for signal types 63 | enum sig_type { square, sinusoid, sawtooth, triangle, sinc, box }; 64 | 65 | using sig_type_t = sig_type; 66 | 67 | // map for signals 68 | std::map sigmap{{"square", sig_type_t::square}, {"sinusoid", sig_type_t::sinusoid}, 69 | {"triangle", sig_type_t::sawtooth}, {"triangle", sig_type_t::triangle}, 70 | {"sinc", sig_type_t::sinc}, {"box", sig_type_t::box}}; 71 | 72 | // custom get sig_type_t from string 73 | sig_type_t getSignal(std::string& sig) { 74 | if (sigmap.contains(sig)) { 75 | return sigmap[sig]; 76 | } else { 77 | return (sig_type_t)(-1); 78 | } 79 | } 80 | 81 | // input arguments 82 | struct fft_params_t : public argparse::Args { 83 | // NVC++ is not supported by magic_enum so using strings 84 | std::string& sig = kwarg("sig", "input signal type: square, sinusoid, sawtooth, triangle, box").set_default("box"); 85 | 86 | int& freq = kwarg("f,freq", "Signal frequency").set_default(1024); 87 | int& N = kwarg("N", "N-point FFT").set_default(1024); 88 | bool& print_sig = flag("p,print", "print x[n] and X(k)"); 89 | int& max_threads = kwarg("nthreads", "number of threads").set_default(std::thread::hardware_concurrency()); 90 | 91 | #if defined(FFT_STDEXEC) 92 | std::string& sch = kwarg("sch", 93 | "stdexec scheduler: [options: cpu" 94 | #if defined(USE_GPU) 95 | ", 
gpu, multigpu" 96 | #endif //USE_GPU 97 | "]") 98 | .set_default("cpu"); 99 | #endif // FFT_STDEXEC 100 | 101 | bool& validate = flag("validate", "validate the results via y[k] = WNk * x[n]"); 102 | bool& help = flag("h, help", "print help"); 103 | bool& print_time = flag("t,time", "print fft time"); 104 | bool& debug = flag("d,debug", "print internal timers and launch configs"); 105 | }; 106 | 107 | inline std::complex WNk(int N, int k) { 108 | return std::complex(exp(-2 * M_PI * 1 / N * k * 1i)); 109 | } 110 | 111 | class signal { 112 | public: 113 | signal() = default; 114 | 115 | signal(int N) { 116 | if (N <= 0) { 117 | std::cerr << "ERROR: N must be > 0. exiting.." << std::endl; 118 | exit(1); 119 | } 120 | y.reserve(ceilPowOf2(N)); 121 | y.resize(N); 122 | } 123 | 124 | signal(signal& rhs) { y = rhs.y; } 125 | 126 | signal(std::vector&& in) { y = std::move(in); } 127 | 128 | signal(std::vector& in) { y = std::move(in); } 129 | 130 | signal(int N, sig_type type, int threads = std::thread::hardware_concurrency()) { 131 | if (N <= 0) { 132 | std::cerr << "ERROR: N must be > 0. exiting.." << std::endl; 133 | exit(1); 134 | } 135 | y.reserve(ceilPowOf2(N)); 136 | y.resize(N); 137 | signalGenerator(type, threads); 138 | } 139 | 140 | void signalGenerator(sig_type type = sig_type::box, int threads = std::thread::hardware_concurrency()) { 141 | int N = y.size(); 142 | 143 | // scheduler from a thread pool 144 | exec::static_thread_pool ctx{threads}; 145 | scheduler auto sch = ctx.get_scheduler(); 146 | 147 | // start scheduling 148 | sender auto start = schedule(sch); 149 | 150 | // generate input signal 151 | switch (type) { 152 | case sig_type::square: 153 | sync_wait(bulk(start, N, [&](int n) { y[n] = (n < N / 4 || n >= 3 * N / 4) ? 
1.0 : -1.0; })); 154 | break; 155 | case sig_type::sinusoid: 156 | sync_wait(bulk(start, N, [&](int n) { y[n] = std::sin(2.0 * M_PI * n / N); })); 157 | break; 158 | case sig_type::sawtooth: 159 | sync_wait(bulk(start, N, [&](int n) { y[n] = 2.0 * (n / N) - 1.0; })); 160 | break; 161 | case sig_type::triangle: 162 | sync_wait(bulk(start, N, [&](int n) { y[n] = 2.0 * std::abs(2.0 * (n / N) - 1.0) - 1.0; })); 163 | break; 164 | case sig_type::sinc: 165 | y[0] = 1.0; 166 | sync_wait(bulk(start, N - 1, [&](int n) { 167 | y[n + 1] = std::sin(2.0 * M_PI * (n + 1) / N) / (2.0 * M_PI * (n + 1) / N); 168 | })); 169 | break; 170 | case sig_type::box: 171 | sync_wait(bulk(start, N, [&](int n) { y[n] = (n < N / 4 || n >= 3 * N / 4) ? 1.0 : 0.0; })); 172 | break; 173 | default: 174 | std::cerr << "ERROR: Unknown input signal type. exiting.." << std::endl; 175 | std::cerr << "Run: --help to see the list of available signals" << std::endl; 176 | exit(1); 177 | } 178 | } 179 | 180 | ~signal() { y.clear(); } 181 | 182 | data_t* data() { return y.data(); } 183 | 184 | int len() { return y.size(); } 185 | 186 | void resize(int N) { 187 | if (N != y.size()) 188 | y.resize(N, 0); 189 | } 190 | 191 | data_t& operator[](int n) { return y[n]; } 192 | 193 | data_t& operator()(int n) { return y[n]; } 194 | 195 | void printSignal() { fmt::print("{} \n", y); } 196 | 197 | [[nodiscard]] bool isFFT(signal& X, scheduler auto sch, int maxN = 20000) { 198 | int N = y.size(); 199 | bool ret = true; 200 | 201 | if (X.len() > maxN) { 202 | fmt::print("Input signal may be too large to compute DFT via y[n] = WNk * x[n]. 
Segfaults expected..\n"); 203 | } 204 | 205 | std::vector Y(N); 206 | std::vector M(N * N); 207 | 208 | auto A = std::mdspan(M.data(), N, N); 209 | auto mdy = std::mdspan(y.data(), N, 1); 210 | auto mdY = std::mdspan(Y.data(), N, 1); 211 | 212 | data_t* F = M.data(); 213 | data_t* X_ptr = X.data(); 214 | data_t* Y_ptr = Y.data(); 215 | 216 | ex::sender auto init = ex::transfer_just(sch, F) | ex::bulk(N * N, [=](int k, auto F) { 217 | int i = k / N; 218 | int j = k % N; 219 | F[k] = WNk(N, i * j); 220 | }); 221 | 222 | // initialize 223 | ex::sync_wait(init); 224 | 225 | // compute Y[n] = dft(x[n]) = WNk * x[n] 226 | stdex::linalg::matrix_product(std::execution::par, A, mdy, mdY); 227 | 228 | // compare the computed Y[n] (dft) with X[n](fft) 229 | ex::sender auto verify = ex::transfer_just(sch, ret, X_ptr, Y_ptr) | 230 | ex::bulk(N, 231 | [](int k, auto& ret, auto X_ptr, auto Y_ptr) { 232 | if (!complex_compare(X_ptr[k], Y_ptr[k])) { 233 | //std::cout << "y[" << i << "] = " << X[i] << " != x[" << i << "] = " << Y[i] << std::endl; 234 | ret = false; 235 | } 236 | }) | 237 | then([](auto ret, auto&&...) { return ret; }); 238 | 239 | // let the pipeline run 240 | auto [re] = ex::sync_wait(verify).value(); 241 | 242 | return re; 243 | } 244 | 245 | private: 246 | // y[n] 247 | std::vector y; 248 | }; 249 | 250 | using sig_t = signal; 251 | --------------------------------------------------------------------------------