├── python ├── hamr.py ├── hamr_py.i └── CMakeLists.txt ├── doc └── rtd │ ├── requirements.txt │ ├── source │ ├── hello_cuda │ │ ├── Makefile │ │ ├── add.cuh │ │ ├── write.h │ │ ├── hello_cuda.cu │ │ └── add.h │ ├── hello_hip │ │ ├── Makefile │ │ ├── add_kernel.h │ │ ├── write.h │ │ ├── hello_hip.cpp │ │ └── add.h │ ├── hello_openmp │ │ ├── write.h │ │ ├── Makefile │ │ ├── hello_openmp.cpp │ │ └── add.h │ ├── zero_copy_cupy │ │ ├── python_to_cpp.py │ │ └── cpp_to_python.py │ └── hello_cupy │ │ └── hello_cupy.py │ ├── _static │ └── theme_overrides.css │ ├── Makefile │ ├── make.bat │ └── conf.py ├── .gitignore ├── hamr_buffer_allocator.i ├── hamr_buffer_transfer.i ├── CITATION.cff ├── .readthedocs.yaml ├── test ├── test_hamr_openmp_allocator.cpp ├── test_hamr_buffer_cupy_host.py ├── test_hamr_buffer_numpy_cuda.py ├── test_hamr_multi_gpu_cuda.cpp ├── test_hamr_multi_gpu_hip.cpp ├── test_hamr_buffer_cupy_cuda.py ├── test_hamr_buffer_numpy_host.py ├── test_hamr_pipeline_cuda_openmp.cpp ├── test_hamr_pipeline_cuda_openmp_mp.cpp ├── test_hamr_pipeline_openmp.cpp └── test_hamr_pipeline_host.cpp ├── hamr_env.h ├── hamr_buffer_transfer.h ├── hamr_hip_print.h ├── hamr_stream.i ├── hamr_cuda_print.h ├── hamr_env.cxx ├── hamr_gil_state.h ├── hamr_openmp_print.h ├── hamr_config.cmake.in ├── hamr_config.h.in ├── hamr_hip_print.cxx ├── hamr_openmp_print.cxx ├── hamr_buffer_pointer.h ├── hamr_host_copy.h ├── hamr_python_deleter_impl.h ├── hamr_buffer_allocator.cxx ├── hamr_openmp_device.h ├── hamr_cuda_print.cxx ├── hamr_hip_device.h ├── hamr_openmp_print_impl.h ├── hamr_host_copy.cxx ├── hamr_python_deleter.h ├── .github └── workflows │ ├── build_and_test_cuda.yml │ ├── build_and_test_hip.yml │ ├── build_and_test_amd_openmp.yml │ └── build_and_test_host.yml ├── hamr_openmp_device.cxx ├── hamr_host_copy_impl.h ├── hamr_python_deleter.cxx ├── hamr_buffer_handle.i ├── hamr_hip_print_impl.h ├── hamr_copier_traits.h ├── hamr_cuda_print_impl.h ├── hamr_stream.cxx ├── hamr_cuda_device.h 
├── hamr_new_allocator.h ├── hamr_hip_device.cxx ├── hamr_new_allocator.cxx ├── README.md ├── hamr_openmp_allocator.cxx ├── hamr_malloc_allocator.cxx ├── hamr_hip_malloc_allocator.cxx ├── hamr_new_allocator_impl.h ├── hamr_cuda_malloc_host_allocator.cxx ├── hamr_device.h ├── cmake └── hamr_omp_offload.cmake ├── hamr_stream.h ├── hamr_stream_impl.h ├── hamr_cuda_malloc_uva_allocator.cxx ├── hamr_cuda_malloc_async_allocator.cxx ├── LICENSE ├── hamr_hip_launch.h ├── hamr_cuda_launch.h ├── hamr_buffer_allocator.h ├── hamr_malloc_allocator.h ├── hamr_openmp_allocator.h ├── hamr_hip_malloc_allocator.h ├── hamr_cuda_malloc_allocator.cxx ├── hamr_cuda_malloc_host_allocator.h ├── hamr_hip_kernels.h ├── hamr_cuda_kernels.h ├── hamr_cuda_malloc_uva_allocator.h └── hamr_hip_launch.cxx /python/hamr.py: -------------------------------------------------------------------------------- 1 | from hamr.hamr_py import * 2 | -------------------------------------------------------------------------------- /doc/rtd/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx<7 2 | sphinxcontrib-bibtex 3 | breathe 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.sw[a-z] 2 | *.patch 3 | _build 4 | *.pt 5 | *.vscode* 6 | .DS_Store 7 | generated_rtd* 8 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_cuda/Makefile: -------------------------------------------------------------------------------- 1 | 2 | HAMR_SOURCE=../../../../ 3 | HAMR_BUILD=../../../../build_cuda 4 | 5 | all: 6 | nvcc hello_cuda.cu -I${HAMR_SOURCE} -I${HAMR_BUILD} -std=c++17 -L${HAMR_BUILD}/lib/ -lhamr 7 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_hip/Makefile: 
-------------------------------------------------------------------------------- 1 | 2 | HAMR_SOURCE=../../../../ 3 | HAMR_BUILD=../../../../build_hip 4 | 5 | all: 6 | hipcc hello_hip.cpp -I${HAMR_SOURCE} -I${HAMR_BUILD} -std=c++17 -L${HAMR_BUILD}/lib/ -lhamr 7 | -------------------------------------------------------------------------------- /doc/rtd/_static/theme_overrides.css: -------------------------------------------------------------------------------- 1 | /* override table width restrictions */ 2 | .wy-table-responsive table td, .wy-table-responsive table th { 3 | white-space: normal; 4 | } 5 | 6 | .wy-table-responsive { 7 | margin-bottom: 24px; 8 | max-width: 100%; 9 | overflow: visible; 10 | } 11 | -------------------------------------------------------------------------------- /hamr_buffer_allocator.i: -------------------------------------------------------------------------------- 1 | %{ 2 | #include "hamr_config.h" 3 | #include "hamr_buffer_allocator.h" 4 | %} 5 | /*************************************************************************** 6 | * buffer allocator 7 | **************************************************************************/ 8 | %include "hamr_buffer_allocator.h" 9 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_cuda/add.cuh: -------------------------------------------------------------------------------- 1 | template 2 | __global__ 3 | void add(T *result, const T *array_1, const U *array_2, size_t n_vals) 4 | { 5 | unsigned long i = blockIdx.x*blockDim.x + threadIdx.x; 6 | 7 | if (i >= n_vals) 8 | return; 9 | 10 | result[i] = array_1[i] + array_2[i]; 11 | } 12 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_hip/add_kernel.h: -------------------------------------------------------------------------------- 1 | template 2 | __global__ 3 | void add(T *result, const T *array_1, const U *array_2, size_t n_vals) 4 | { 5 | unsigned 
long i = blockIdx.x*blockDim.x + threadIdx.x; 6 | 7 | if (i >= n_vals) 8 | return; 9 | 10 | result[i] = array_1[i] + array_2[i]; 11 | } 12 | -------------------------------------------------------------------------------- /hamr_buffer_transfer.i: -------------------------------------------------------------------------------- 1 | %{ 2 | #include "hamr_config.h" 3 | #include "hamr_buffer_transfer.h" 4 | %} 5 | /*************************************************************************** 6 | * buffer transfer 7 | **************************************************************************/ 8 | %namewarn("") "async"; 9 | %include "hamr_buffer_transfer.h" 10 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 3 | authors: 4 | - family-names: Loring 5 | given-names: Burlen 6 | orcid: https://orcid.org/0000-0002-4678-8142 7 | title: "HAMR the Heterogeneous Accelerator Memory Resource" 8 | version: 1.0.0 9 | doi: https://zenodo.org/record/6471012 10 | date-released: 2022-04-19 11 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | version: 2 6 | 7 | build: 8 | os: "ubuntu-20.04" 9 | tools: 10 | python: "3" 11 | 12 | sphinx: 13 | configuration: doc/rtd/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: doc/rtd/requirements.txt 18 | -------------------------------------------------------------------------------- /test/test_hamr_openmp_allocator.cpp: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | #include 
"hamr_openmp_allocator.h" 3 | #include "hamr_openmp_print.h" 4 | 5 | int main(int argc, char **argv) 6 | { 7 | (void) argc; 8 | (void) argv; 9 | 10 | { 11 | auto data = hamr::openmp_allocator::allocate(400, 3.1415); 12 | 13 | hamr::openmp_print(data.get(), 400); 14 | } 15 | 16 | return 0; 17 | } 18 | -------------------------------------------------------------------------------- /hamr_env.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_env_h 2 | #define hamr_env_h 3 | 4 | #include "hamr_config.h" 5 | 6 | /// heterogeneous accelerator memory resource 7 | namespace hamr 8 | { 9 | 10 | /// returns the value of the HAMR_VERBOSE environment variable 11 | #if defined(HAMR_VERBOSE) 12 | HAMR_EXPORT int get_verbose(); 13 | #else 14 | constexpr HAMR_EXPORT int get_verbose() { return 0; } 15 | #endif 16 | 17 | } 18 | 19 | #endif 20 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_hip/write.h: -------------------------------------------------------------------------------- 1 | template 2 | void write(std::ostream &os, const hamr::buffer &ai) 3 | { 4 | // get pointer to the input array that is safe to use on the host 5 | auto [spai, pai] = hamr::get_host_accessible(ai); 6 | 7 | // write the elements of the array to the stream 8 | for (int i = 0; i < ai.size(); ++i) 9 | { 10 | os << pai[i] << " "; 11 | } 12 | 13 | os << std::endl; 14 | } 15 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_openmp/write.h: -------------------------------------------------------------------------------- 1 | template 2 | void write(std::ostream &os, const hamr::buffer &ai) 3 | { 4 | // get pointer to the input array that is safe to use on the host 5 | auto [spai, pai] = hamr::get_host_accessible(ai); 6 | 7 | // write the elements of the array to the stream 8 | for (size_t i = 0; i < ai.size(); ++i) 9 | { 10 | os << pai[i] << " "; 11 | 
} 12 | 13 | os << std::endl; 14 | } 15 | -------------------------------------------------------------------------------- /doc/rtd/source/zero_copy_cupy/python_to_cpp.py: -------------------------------------------------------------------------------- 1 | from hamr import * 2 | import cupy as cp 3 | 4 | # create a cupy array on the GPU 5 | n_elem = 16 6 | arr = cp.full((n_elem), 3.1415, dtype='float32') 7 | 8 | # zero-copy share the data with C++ 9 | buf = buffer(arr) 10 | 11 | # modify the cupy array 12 | arr *= 10000 13 | 14 | # print the buffer, which should reflect the modification because of the 15 | # zero-copy data sharing 16 | print('buf = %s\n'%(str(buf))) 17 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_cuda/write.h: -------------------------------------------------------------------------------- 1 | template 2 | void write(std::ostream &os, const hamr::buffer &ai) 3 | { 4 | // get pointer to the input array that is safe to use on the host 5 | auto spai = ai.get_host_accessible(); 6 | const T *pai = spai.get(); 7 | 8 | // write the elements of the array to the stream 9 | for (int i = 0; i < ai.size(); ++i) 10 | { 11 | os << pai[i] << " "; 12 | } 13 | 14 | os << std::endl; 15 | } 16 | -------------------------------------------------------------------------------- /doc/rtd/source/zero_copy_cupy/cpp_to_python.py: -------------------------------------------------------------------------------- 1 | from hamr import * 2 | import cupy as cp 3 | 4 | # allocate some memory on the GPU 5 | n_elem = 16 6 | buf = buffer_float(buffer_allocator_cuda, n_elem, 3.1415) 7 | 8 | # convert to a cupy array 9 | arr = cp.array(buf.get_cuda_accessible(), copy=False) 10 | 11 | # modify the cupy array 12 | arr *= 10000 13 | 14 | # print the buffer, which should reflect the modification because of the 15 | # zero-copy data sharing 16 | print('buf = %s\n'%(str(buf))) 17 | 
-------------------------------------------------------------------------------- /doc/rtd/source/hello_openmp/Makefile: -------------------------------------------------------------------------------- 1 | 2 | HAMR_SOURCE=../../../../ 3 | HAMR_BUILD=../../../../build_omp 4 | 5 | # NVIDIA HPC Compiler 6 | #CXX=`which nvc++` 7 | #CXX_FLAGS=-mp=gpu -Minfo 8 | 9 | # AMD ROCm compiler 10 | CXX=/opt/rocm/llvm/bin/amdclang++ 11 | CXX_FLAGS=-target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx1030 12 | 13 | all: 14 | ${CXX} ${CXX_FLAGS} hello_openmp.cpp -I${HAMR_SOURCE} -I${HAMR_BUILD} -std=c++17 -L${HAMR_BUILD}/lib/ -lhamr 15 | -------------------------------------------------------------------------------- /hamr_buffer_transfer.h: -------------------------------------------------------------------------------- 1 | #ifndef buffer_transfer_h 2 | #define buffer_transfer_h 3 | 4 | ///@file 5 | 6 | /// heterogeneous accelerator memory resource 7 | namespace hamr 8 | { 9 | 10 | /** flag used to indicate whether or not a transfer operation should be 11 | * synchronous or not. 
12 | */ 13 | enum class buffer_transfer 14 | { 15 | async = 0, ///< all operations are asynchronous 16 | sync_host = 1,///< operations moving data from GPU to host memory are synchronous 17 | sync = 2 ///< all operations are synchronous 18 | }; 19 | 20 | } 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /hamr_hip_print.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_hip_print_h 2 | #define hamr_hip_print_h 3 | 4 | #include "hamr_config.h" 5 | 6 | /// heterogeneous accelerator memory resource 7 | namespace hamr 8 | { 9 | 10 | /** prints an array on the GPU 11 | * @param[in] vals an array of n elements accessible in HIP 12 | * @param[in] n_elem the length of the array 13 | * @returns 0 if there were no errors 14 | */ 15 | template 16 | int hip_print(T *vals, size_t n_elem); 17 | 18 | } 19 | 20 | #if !defined(HAMR_SEPARATE_IMPL) 21 | #include "hamr_hip_print_impl.h" 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /hamr_stream.i: -------------------------------------------------------------------------------- 1 | %{ 2 | #include "hamr_config.h" 3 | #include "hamr_stream.h" 4 | %} 5 | 6 | /*************************************************************************** 7 | * stream 8 | **************************************************************************/ 9 | %namewarn("") "print"; 10 | %ignore hamr::stream::operator=; 11 | #if defined(HAMR_ENABLE_CUDA) 12 | %ignore hamr::stream::operator cudaStream_t; 13 | #endif 14 | #if defined(HAMR_ENABLE_HIP) 15 | %ignore hamr::stream::operator hipStream_t; 16 | #endif 17 | %ignore hamr::stream::stream(stream &&); 18 | %include "hamr_stream.h" 19 | -------------------------------------------------------------------------------- /hamr_cuda_print.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_cuda_print_h 2 | 
#define hamr_cuda_print_h 3 | 4 | #include "hamr_config.h" 5 | 6 | /// heterogeneous accelerator memory resource 7 | namespace hamr 8 | { 9 | class stream; 10 | 11 | /** prints an array on the GPU 12 | * @param[in] vals an array of n elements accessible in CUDA 13 | * @param[in] n_elem the length of the array 14 | * @returns 0 if there were no errors 15 | */ 16 | template 17 | int cuda_print(const hamr::stream &strm, T *vals, size_t n_elem); 18 | 19 | } 20 | 21 | #if !defined(HAMR_SEPARATE_IMPL) 22 | #include "hamr_cuda_print_impl.h" 23 | #endif 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_openmp/hello_openmp.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | #include "add.h" 7 | #include "write.h" 8 | 9 | int main(int, char **) 10 | { 11 | size_t n_vals = 400; 12 | 13 | // allocate and initialize to 1 on the GPU 14 | hamr::buffer a0(hamr::buffer_allocator::openmp, n_vals, 1.0f); 15 | 16 | // allocate and initialize to 1 on the host 17 | hamr::buffer a1(hamr::buffer_allocator::malloc, n_vals, 1.0f); 18 | 19 | // add the two arrays 20 | hamr::buffer a2 = add(a0, a1); 21 | 22 | // write the result 23 | write(std::cerr, a2); 24 | 25 | return 0; 26 | } 27 | -------------------------------------------------------------------------------- /hamr_env.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_env.h" 2 | 3 | #if defined(HAMR_VERBOSE) 4 | 5 | #include 6 | #include 7 | 8 | namespace hamr 9 | { 10 | 11 | // ************************************************************************** 12 | int get_verbose() 13 | { 14 | static int ival = -1; 15 | 16 | if (ival < 0) 17 | { 18 | char *cval = getenv("HAMR_VERBOSE"); 19 | if (cval) 20 | { 21 | ival = atoi(cval); 22 | std::cerr << "HAMR_VERBOSE=" << ival << std::endl; 23 | } 24 | else 25 | { 26 | ival = 0; 27 | } 
28 | } 29 | 30 | return ival; 31 | } 32 | 33 | } 34 | 35 | #endif 36 | -------------------------------------------------------------------------------- /hamr_gil_state.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_gil_state_h 2 | #define hamr_gil_state_h 3 | 4 | #include 5 | 6 | namespace hamr 7 | { 8 | 9 | /// A RAII helper for managing the Python GIL. 10 | /** The GIL is aquired and held while the object exists. The GIL must be held 11 | * by C++ code invoking any Python C-API calls. 12 | */ 13 | class HAMR_EXPORT gil_state 14 | { 15 | public: 16 | gil_state() 17 | { m_state = PyGILState_Ensure(); } 18 | 19 | ~gil_state() 20 | { PyGILState_Release(m_state); } 21 | 22 | gil_state(const gil_state&) = delete; 23 | void operator=(const gil_state&) = delete; 24 | 25 | private: 26 | PyGILState_STATE m_state; 27 | }; 28 | 29 | } 30 | 31 | #endif 32 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_hip/hello_hip.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | #include 6 | 7 | #include "add.h" 8 | #include "write.h" 9 | 10 | int main(int, char **) 11 | { 12 | size_t n_vals = 400; 13 | 14 | // allocate and initialize to 1 on the GPU 15 | hamr::buffer a0(hamr::buffer_allocator::hip, n_vals, 1.0f); 16 | 17 | // allocate and initialize to 1 on the host 18 | hamr::buffer a1(hamr::buffer_allocator::malloc, n_vals, 1.0f); 19 | 20 | // add the two arrays 21 | hamr::buffer a2 = add(a0, a1); 22 | 23 | // write the result 24 | write(std::cerr, a2); 25 | 26 | return 0; 27 | } 28 | -------------------------------------------------------------------------------- /hamr_openmp_print.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_openmp_print_impl_h 2 | #define hamr_openmp_print_impl_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | 7 | /// 
heterogeneous accelerator memory resource 8 | namespace hamr 9 | { 10 | 11 | /** prints an array on the host (note: OpenMP provides no way to print directly 12 | * from the device) 13 | * @param[in] vals an array of n elements accessible in OpenMP 14 | * @param[in] n_elem the length of the array 15 | * @returns 0 if there were no errors 16 | */ 17 | template 18 | HAMR_EXPORT 19 | int openmp_print(T *vals, size_t n_elem); 20 | 21 | } 22 | 23 | #if !defined(HAMR_SEPARATE_IMPL) 24 | #include "hamr_openmp_print_impl.h" 25 | #endif 26 | #endif 27 | -------------------------------------------------------------------------------- /doc/rtd/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_cuda/hello_cuda.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "add.h" 9 | #include "write.h" 10 | 11 | int main(int, char **) 12 | { 13 | size_t n_vals = 400; 14 | 15 | // allocate and initialize to 1 on the GPU 16 | hamr::buffer a0(hamr::buffer_allocator::cuda, n_vals, 1.0f); 17 | 18 | // allocate and initialize to 1 on the host 19 | hamr::buffer a1(hamr::buffer_allocator::malloc, n_vals, 1.0f); 20 | 21 | // add the two arrays 22 | hamr::buffer a2 = add(a0, a1); 23 | 24 | // write the result 25 | write(std::cerr, a2); 26 | 27 | return 0; 28 | } 29 | -------------------------------------------------------------------------------- /hamr_config.cmake.in: -------------------------------------------------------------------------------- 1 | include(CMakeFindDependencyMacro) 2 | 3 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") 4 | 5 | set(HAMR_LIB_TYPE STATIC) 6 | if (@BUILD_SHARED_LIBS@) 7 | set(HAMR_LIB_TYPE SHARED) 8 | endif() 9 | 10 | set(HAMR_SEPARATE_IMPL @HAMR_SEPARATE_IMPL@) 11 | set(HAMR_ENABLE_CUDA @HAMR_ENABLE_CUDA@) 12 | set(HAMR_NVHPC_CUDA @HAMR_NVHPC_CUDA@) 13 | set(HAMR_NVCC_CUDA @HAMR_NVCC_CUDA@) 14 | set(HAMR_CLANG_CUDA @HAMR_CLANG_CUDA@) 15 | set(HAMR_NVHPC_CUDA @HAMR_NVHPC_CUDA@) 16 | set(HAMR_ENABLE_HIP @HAMR_ENABLE_HIP@) 17 | set(HAMR_ENABLE_OPENMP @HAMR_ENABLE_HIP@) 18 | set(HAMR_ENABLE_OBJECTS @HAMR_ENABLE_OBJECTS@) 19 | set(HAMR_ENABLE_PAGE_LOCKED_MEMORY @HAMR_ENABLE_PAGE_LOCKED_MEMORY@) 20 | set(HAMR_ENABLE_PYTHON @HAMR_ENABLE_PYTHON@) 21 | set(HAMR_VERBOSE @HAMR_VERBOSE@) 22 | 23 | include(hamr) 24 | -------------------------------------------------------------------------------- 
/doc/rtd/source/hello_cuda/add.h: -------------------------------------------------------------------------------- 1 | #include "add.cuh" 2 | 3 | template 4 | hamr::buffer add(const hamr::buffer &a1, const hamr::buffer &a2) 5 | { 6 | size_t n_vals = a1.size(); 7 | 8 | // get pointers to the input arrays that are safe to use on the GPU 9 | auto [spa1, pa1] = hamr::get_cuda_accessible(a1); 10 | auto [spa2, pa2] = hamr::get_cuda_accessible(a2); 11 | 12 | // allocate the memory for the result on the GPU, and get a pointer to it 13 | hamr::buffer ao(hamr::buffer_allocator::cuda, n_vals, T(0)); 14 | T *pao = ao.data(); 15 | 16 | // launch the kernel to add the arrays 17 | dim3 thread_grid(128); 18 | dim3 block_grid(n_vals/128 + (n_vals % 128 ? 1 : 0)); 19 | add<<>>(pao, pa1, pa2, n_vals); 20 | 21 | return ao; 22 | } 23 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_hip/add.h: -------------------------------------------------------------------------------- 1 | #include "add_kernel.h" 2 | 3 | template 4 | hamr::buffer add(const hamr::buffer &a1, const hamr::buffer &a2) 5 | { 6 | size_t n_vals = a1.size(); 7 | 8 | // get pointers to the input arrays that are safe to use on the GPU 9 | auto [spa1, pa1] = hamr::get_hip_accessible(a1); 10 | auto [spa2, pa2] = hamr::get_hip_accessible(a2); 11 | 12 | // allocate the memory for the result on the GPU, and get a pointer to it 13 | hamr::buffer ao(hamr::buffer_allocator::hip, n_vals, T(0)); 14 | T *pao = ao.data(); 15 | 16 | // launch the kernel to add the arrays 17 | dim3 thread_grid(128); 18 | dim3 block_grid(n_vals/128 + (n_vals % 128 ? 
1 : 0)); 19 | add<<>>(pao, pa1, pa2, n_vals); 20 | 21 | return ao; 22 | } 23 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_openmp/add.h: -------------------------------------------------------------------------------- 1 | template 2 | hamr::buffer add(const hamr::buffer &a1, const hamr::buffer &a2) 3 | { 4 | size_t n_vals = a1.size(); 5 | 6 | // get pointers to the input arrays that are safe to use on the GPU 7 | auto [spa1, pa1] = hamr::get_openmp_accessible(a1); 8 | auto [spa2, pa2] = hamr::get_openmp_accessible(a2); 9 | 10 | // allocate the memory for the result on the GPU, and get a pointer to it 11 | hamr::buffer ao(hamr::buffer_allocator::openmp, n_vals, T(0)); 12 | T *pao = ao.data(); 13 | 14 | // launch the kernel to add the arrays 15 | #pragma omp target teams distribute parallel for is_device_ptr(pao, pa1, pa2) 16 | for (size_t i = 0; i < n_vals; ++i) 17 | { 18 | pao[i] = pa1[i] + pa2[i]; 19 | } 20 | 21 | return ao; 22 | } 23 | -------------------------------------------------------------------------------- /doc/rtd/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /hamr_config.h.in: -------------------------------------------------------------------------------- 1 | #ifndef hamr_config_h 2 | #define hamr_config_h 3 | 4 | #define HAMR_EXPORT __attribute__ ((visibility ("default"))) 5 | #define HAMR_PRIVATE __attribute__ ((visibility ("hidden"))) 6 | 7 | #cmakedefine HAMR_SEPARATE_IMPL 8 | #cmakedefine HAMR_ENABLE_CUDA 9 | #cmakedefine HAMR_NVHPC_CUDA 10 | #cmakedefine HAMR_NVCC_CUDA 11 | #cmakedefine HAMR_CLANG_CUDA 12 | #cmakedefine HAMR_ENABLE_HIP 13 | #cmakedefine HAMR_ENABLE_OPENMP 14 | #define HAMR_OPENMP_LOOP @HAMR_OPENMP_LOOP@ 15 | #cmakedefine HAMR_ENABLE_OBJECTS 16 | #cmakedefine HAMR_ENABLE_PAGE_LOCKED_MEMORY 17 | #cmakedefine HAMR_ENABLE_PYTHON 18 | #cmakedefine HAMR_VERBOSE 19 | 20 | // work around an issue with clang compiling CUDA (clang 17/CUDA 12 May 2023). 21 | // problematic includes can go here, leaving the individual source files 22 | // unmodified. 
23 | #if defined(HAMR_CLANG_CUDA) 24 | #undef __noinline__ 25 | #include 26 | #include 27 | #define __noinline__ __attribute__((noinline)) 28 | #endif 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /hamr_hip_print.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_hip_print.h" 4 | #include "hamr_hip_print_impl.h" 5 | 6 | template int hamr::hip_print(float *vals, size_t n_elem); 7 | template int hamr::hip_print(double *vals, size_t n_elem); 8 | template int hamr::hip_print(char *vals, size_t n_elem); 9 | template int hamr::hip_print(signed char *vals, size_t n_elem); 10 | template int hamr::hip_print(short *vals, size_t n_elem); 11 | template int hamr::hip_print(int *vals, size_t n_elem); 12 | template int hamr::hip_print(long *vals, size_t n_elem); 13 | template int hamr::hip_print(long long *vals, size_t n_elem); 14 | template int hamr::hip_print(unsigned char *vals, size_t n_elem); 15 | template int hamr::hip_print(unsigned short *vals, size_t n_elem); 16 | template int hamr::hip_print(unsigned int *vals, size_t n_elem); 17 | template int hamr::hip_print(unsigned long *vals, size_t n_elem); 18 | template int hamr::hip_print(unsigned long long *vals, size_t n_elem); 19 | -------------------------------------------------------------------------------- /doc/rtd/source/hello_cupy/hello_cupy.py: -------------------------------------------------------------------------------- 1 | from hamr import * 2 | import cupy as cp 3 | import numpy as np 4 | import sys 5 | 6 | 7 | def add(buf_0, buf_1): 8 | """ add 2 arrays on the GPU """ 9 | arr_0 = cp.array(buf_0.get_cuda_accessible()) # share data w/ cupy on GPU 10 | arr_1 = cp.array(buf_1.get_cuda_accessible()) # share data w/ cupy on GPU 11 | arr_2 = arr_0 + arr_1 # add on the GPU 12 | buf_2 = buffer(arr_2) # zero-copy from cupy on GPU 13 | return buf_2 14 | 15 | def write(fh, buf): 16 
| """ print the array on the host """ 17 | arr = np.array(buf.get_host_accessible()) # share data w/ numpy on host 18 | fh.write('%s\n'%(str(arr))) # write to the file on host 19 | 20 | 21 | n_vals = 400 22 | buf_0 = buffer_float(buffer_allocator_cuda, n_vals, 1.0) # allocate on the host 23 | buf_1 = buffer_float(buffer_allocator_malloc, n_vals, 1.0) # allocate on the GPU 24 | 25 | buf_2 = add(buf_0, buf_1) # add the arrays 26 | 27 | write(sys.stdout, buf_2) # write the arrays 28 | -------------------------------------------------------------------------------- /hamr_openmp_print.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_openmp_print.h" 4 | #include "hamr_openmp_print_impl.h" 5 | 6 | template int hamr::openmp_print(float *vals, size_t n_elem); 7 | template int hamr::openmp_print(double *vals, size_t n_elem); 8 | template int hamr::openmp_print(char *vals, size_t n_elem); 9 | template int hamr::openmp_print(signed char *vals, size_t n_elem); 10 | template int hamr::openmp_print(short *vals, size_t n_elem); 11 | template int hamr::openmp_print(int *vals, size_t n_elem); 12 | template int hamr::openmp_print(long *vals, size_t n_elem); 13 | template int hamr::openmp_print(long long *vals, size_t n_elem); 14 | template int hamr::openmp_print(unsigned char *vals, size_t n_elem); 15 | template int hamr::openmp_print(unsigned short *vals, size_t n_elem); 16 | template int hamr::openmp_print(unsigned int *vals, size_t n_elem); 17 | template int hamr::openmp_print(unsigned long *vals, size_t n_elem); 18 | template int hamr::openmp_print(unsigned long long *vals, size_t n_elem); 19 | -------------------------------------------------------------------------------- /hamr_buffer_pointer.h: -------------------------------------------------------------------------------- 1 | #ifndef buffer_pointer_h 2 | #define buffer_pointer_h 3 | 4 | #include 5 | 6 | /// heterogeneous accelerator 
memory resource 7 | namespace hamr 8 | { 9 | 10 | template class buffer; 11 | 12 | /// a shared pointer to an instance of a buffer 13 | template 14 | using p_buffer = std::shared_ptr>; 15 | 16 | /// a shared pointer to an instance of a const buffer 17 | template 18 | using const_p_buffer = std::shared_ptr>; 19 | 20 | /// a helper for explicitly casting to a const buffer pointer. 21 | template 22 | hamr::const_p_buffer const_ptr(const hamr::p_buffer &v) 23 | { 24 | return hamr::const_p_buffer(v); 25 | } 26 | 27 | /// a helper for getting a reference to pointed to hamr::buffer 28 | template 29 | const hamr::buffer &ref_to(const hamr::const_p_buffer &ptr) 30 | { 31 | return *(ptr.get()); 32 | } 33 | 34 | /// a helper for getting a reference to pointed to hamr::buffer 35 | template 36 | hamr::buffer &ref_to(const hamr::p_buffer &ptr) 37 | { 38 | return *(ptr.get()); 39 | } 40 | 41 | } 42 | 43 | #endif 44 | -------------------------------------------------------------------------------- /hamr_host_copy.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_host_copy_h 2 | #define hamr_host_copy_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | #include 7 | 8 | /// heterogeneous accelerator memory resource 9 | namespace hamr 10 | { 11 | 12 | /** Copies an array on the host. 13 | * 14 | * @param[in] dest an array of n elements accessible on the host 15 | * @param[in] src an array of n elements accessible on the host 16 | * @param[in] n_elem the number of elements in the array 17 | * @returns 0 if there were no errors 18 | */ 19 | template 20 | int copy_to_host_from_host(T *dest, const U *src, size_t n_elem); 21 | 22 | /** Copies an array on the host (fast path for arrays of arithmetic types of the 23 | * same type). 
 24 | * 25 | * @param[in] dest an array of n elements accessible on the host 26 | * @param[in] src an array of n elements accessible on the host 27 | * @param[in] n_elem the number of elements in the array 28 | * @returns 0 if there were no errors 29 | */ 30 | template 31 | int copy_to_host_from_host(T *dest, const T *src, size_t n_elem, 32 | typename std::enable_if::value>::type * = nullptr); 33 | 34 | } 35 | 36 | #if !defined(HAMR_SEPARATE_IMPL) 37 | #include "hamr_host_copy_impl.h" 38 | #endif 39 | 40 | #endif 41 | -------------------------------------------------------------------------------- /hamr_python_deleter_impl.h: -------------------------------------------------------------------------------- 1 | #include "hamr_gil_state.h" 2 | #include 3 | #include 4 | 5 | namespace hamr 6 | { 7 | 8 | // -------------------------------------------------------------------------- 9 | template 10 | python_deleter::python_deleter(T *ptr, size_t n, PyObject *obj) 11 | : m_ptr(ptr), m_elem(n), m_object(obj) 12 | { 13 | #if defined(HAMR_VERBOSE) 14 | if (hamr::get_verbose()) 15 | { 16 | std::cerr << "created python_deleter for array of " << n 17 | << " objects of type " << typeid(T).name() << sizeof(T) 18 | << " holding a reference to " << m_object << std::endl; 19 | } 20 | #endif 21 | hamr::gil_state gil; 22 | Py_INCREF(obj); 23 | } 24 | 25 | // -------------------------------------------------------------------------- 26 | template 27 | void python_deleter::operator()(T *ptr) 28 | { 29 | (void)ptr; 30 | assert(ptr == m_ptr); 31 | #if defined(HAMR_VERBOSE) 32 | if (hamr::get_verbose()) 33 | { 34 | std::cerr << "python_deleter deleting array of " << m_elem 35 | << " objects of type " << typeid(T).name() << sizeof(T) 36 | << " release reference to " << m_object << std::endl; 37 | } 38 | #endif 39 | hamr::gil_state gil; 40 | Py_DECREF(m_object); 41 | } 42 | 43 | } 44 | -------------------------------------------------------------------------------- /hamr_buffer_allocator.cxx: 
-------------------------------------------------------------------------------- 1 | #include "hamr_buffer_allocator.h" 2 | 3 | namespace hamr 4 | { 5 | 6 | // ************************************************************************** 7 | const char *get_allocator_name(buffer_allocator alloc) 8 | { 9 | if (alloc == buffer_allocator::cpp) 10 | { 11 | return "cpp"; 12 | } 13 | else if (alloc == buffer_allocator::malloc) 14 | { 15 | return "malloc"; 16 | } 17 | else if (alloc == buffer_allocator::cuda) 18 | { 19 | return "cuda_malloc_allocator"; 20 | } 21 | else if (alloc == buffer_allocator::cuda_host) 22 | { 23 | return "cuda_malloc_host_allocator"; 24 | } 25 | else if (alloc == buffer_allocator::cuda_async) 26 | { 27 | return "cuda_malloc_async_allocator"; 28 | } 29 | else if (alloc == buffer_allocator::cuda_uva) 30 | { 31 | return "cuda_malloc_uva_allocator"; 32 | } 33 | else if (alloc == buffer_allocator::hip) 34 | { 35 | return "hip_malloc_allocator"; 36 | } 37 | else if (alloc == buffer_allocator::hip_uva) 38 | { 39 | return "hip_malloc_uva_allocator"; 40 | } 41 | else if (alloc == buffer_allocator::openmp) 42 | { 43 | return "openmp_allocator"; 44 | } 45 | 46 | return "the allocator name is not known"; 47 | } 48 | 49 | } 50 | -------------------------------------------------------------------------------- /test/test_hamr_buffer_cupy_host.py: -------------------------------------------------------------------------------- 1 | from hamr import * 2 | import cupy as cp 3 | import sys 4 | 5 | stderr = sys.__stderr__ 6 | 7 | stderr.write('TEST: creating a hamr::buffer host ... \n') 8 | buf = buffer_float(buffer_allocator_malloc, 16, 3.1415) 9 | stderr.write('buf = %s\n'%(str(buf))) 10 | stderr.write('TEST: creating a hamr::buffer host ... OK!\n\n') 11 | 12 | stderr.write('TEST: get a handle to the data ... \n') 13 | h = buf.get_cuda_accessible() 14 | stderr.write('TEST: get a handle to the data ... OK!\n\n') 15 | 16 | stderr.write('TEST: share the data with Cupy ... 
\n') 17 | arr = cp.array(h, copy=False) 18 | stderr.write('arr.__cuda_array_interface__ = %s\n'%(arr.__cuda_array_interface__)) 19 | stderr.write('TEST: share the data with Cupy ... OK!\n\n') 20 | 21 | stderr.write('TEST: deleting the hamr::buffer ... \n') 22 | buf = None 23 | stderr.write('TEST: deleting the hamr::buffer ... OK!\n\n') 24 | 25 | stderr.write('TEST: Cupy reads the data ... \n') 26 | stderr.write('arr = %s\n'%(str(arr))) 27 | stderr.write('TEST: Cupy reads the data ... OK!\n\n') 28 | 29 | stderr.write('TEST: Cupy modifies the data ... \n') 30 | arr *= 10000 31 | stderr.write('arr = %s\n'%(str(arr))) 32 | stderr.write('TEST: Cupy modifies the data ... OK!\n\n') 33 | 34 | stderr.write('TEST: deleting the Cupy array ... \n') 35 | arr = None 36 | stderr.write('TEST: deleting the Cupy array ... OK!\n\n') 37 | 38 | sys.exit(0) 39 | -------------------------------------------------------------------------------- /hamr_openmp_device.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_openmp_device_h 2 | #define hamr_openmp_device_h 3 | 4 | #include "hamr_config.h" 5 | 6 | ///@file 7 | 8 | namespace hamr 9 | { 10 | /// gets the device identifier for the first GPU. @returns zero if successful. 11 | inline int HAMR_EXPORT get_openmp_device_identifier(int &dev_id) { dev_id = 0; return 0; } 12 | 13 | /// gets the device identifier for the host. @returns zero if successful. 14 | int HAMR_EXPORT get_openmp_host_identifier(int &dev_id); 15 | 16 | /// gets the currently active OpenMP device. returns zero if successful. 17 | int HAMR_EXPORT get_active_openmp_device(int &dev_id); 18 | 19 | /// sets the active OpenMP device. returns zero if successful. 20 | int HAMR_EXPORT set_active_openmp_device(int dev_id); 21 | 22 | /// gets the device that owns the given pointer. @returns zero if successful. 
 23 | int HAMR_EXPORT get_openmp_device(const void *ptr, int &device_id); 24 | 25 | /** Activate the specified OpenMP device, and restore the previously active 26 | * device when the object is destroyed. 27 | */ 28 | class HAMR_EXPORT activate_openmp_device 29 | { 30 | public: 31 | activate_openmp_device() = delete; 32 | activate_openmp_device(const activate_openmp_device &) = delete; 33 | void operator=(const activate_openmp_device &) = delete; 34 | 35 | activate_openmp_device(int id); 36 | ~activate_openmp_device(); 37 | 38 | private: 39 | int m_device; 40 | }; 41 | 42 | } 43 | #endif 44 | -------------------------------------------------------------------------------- /hamr_cuda_print.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_cuda_print.h" 4 | #include "hamr_cuda_print_impl.h" 5 | 6 | template int hamr::cuda_print(const hamr::stream &strm, float *vals, size_t n_elem); 7 | template int hamr::cuda_print(const hamr::stream &strm, double *vals, size_t n_elem); 8 | template int hamr::cuda_print(const hamr::stream &strm, char *vals, size_t n_elem); 9 | template int hamr::cuda_print(const hamr::stream &strm, signed char *vals, size_t n_elem); 10 | template int hamr::cuda_print(const hamr::stream &strm, short *vals, size_t n_elem); 11 | template int hamr::cuda_print(const hamr::stream &strm, int *vals, size_t n_elem); 12 | template int hamr::cuda_print(const hamr::stream &strm, long *vals, size_t n_elem); 13 | template int hamr::cuda_print(const hamr::stream &strm, long long *vals, size_t n_elem); 14 | template int hamr::cuda_print(const hamr::stream &strm, unsigned char *vals, size_t n_elem); 15 | template int hamr::cuda_print(const hamr::stream &strm, unsigned short *vals, size_t n_elem); 16 | template int hamr::cuda_print(const hamr::stream &strm, unsigned int *vals, size_t n_elem); 17 | template int hamr::cuda_print(const hamr::stream &strm, unsigned long *vals, size_t 
 n_elem); 18 | template int hamr::cuda_print(const hamr::stream &strm, unsigned long long *vals, size_t n_elem); 19 | -------------------------------------------------------------------------------- /hamr_hip_device.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_hip_device_h 2 | #define hamr_hip_device_h 3 | 4 | #include "hamr_config.h" 5 | 6 | ///@file 7 | 8 | namespace hamr 9 | { 10 | /// gets the device identifier for the first GPU. @returns zero if successful. 11 | inline int HAMR_EXPORT get_hip_device_identifier(int &dev_id) { dev_id = 0; return 0; } 12 | 13 | /// gets the device identifier for the host. @returns zero if successful. 14 | inline int HAMR_EXPORT get_hip_host_identifier(int &dev_id) { dev_id = -1; return 0; } 15 | 16 | /// gets the currently active HIP device. returns zero if successful. 17 | int HAMR_EXPORT get_active_hip_device(int &dev_id); 18 | 19 | /// sets the active HIP device. returns zero if successful. 20 | int HAMR_EXPORT set_active_hip_device(int dev_id); 21 | 22 | /// gets the device that owns the given pointer. @returns zero if successful. 23 | int HAMR_EXPORT get_hip_device(const void *ptr, int &device_id); 24 | 25 | 26 | /** Activate the specified HIP device, and restore the previously active 27 | * device when the object is destroyed. 
 28 | */ 29 | class HAMR_EXPORT activate_hip_device 30 | { 31 | public: 32 | activate_hip_device() = delete; 33 | activate_hip_device(const activate_hip_device &) = delete; 34 | void operator=(const activate_hip_device &) = delete; 35 | 36 | activate_hip_device(int id); 37 | ~activate_hip_device(); 38 | 39 | private: 40 | int m_device; 41 | }; 42 | 43 | } 44 | 45 | 46 | 47 | 48 | #endif 49 | -------------------------------------------------------------------------------- /hamr_openmp_print_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_openmp_print_impl_h 2 | #define hamr_openmp_print_impl_h 3 | 4 | #include "hamr_config.h" 5 | #include "hamr_env.h" 6 | #if defined(HAMR_ENABLE_OPENMP) 7 | #include "hamr_openmp_copy.h" 8 | #include "hamr_malloc_allocator.h" 9 | #endif 10 | 11 | #include 12 | 13 | /// heterogeneous accelerator memory resource 14 | namespace hamr 15 | { 16 | // --------------------------------------------------------------------------- 17 | template 18 | int openmp_print(T *vals, size_t n_elem) 19 | { 20 | #if !defined(HAMR_ENABLE_OPENMP) 21 | (void) vals; 22 | (void) n_elem; 23 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 24 | " print_openmp failed because OpenMP is not enabled." 
<< std::endl; 25 | return -1; 26 | #else 27 | 28 | // allocate a temporary on the host 29 | auto sptmp = hamr::malloc_allocator::allocate(n_elem); 30 | T *ptmp = sptmp.get(); 31 | 32 | // move to the host 33 | if (hamr::copy_to_host_from_openmp(ptmp, vals, n_elem)) 34 | { 35 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 36 | " failed to move data to the host" << std::endl; 37 | return -1; 38 | } 39 | 40 | // print 41 | if (n_elem) 42 | { 43 | std::cerr << ptmp[0]; 44 | for (size_t i = 1; i < n_elem; ++i) 45 | { 46 | std::cerr << ", " << ptmp[i]; 47 | } 48 | } 49 | std::cerr << std::endl; 50 | 51 | return 0; 52 | #endif 53 | } 54 | 55 | } 56 | #endif 57 | -------------------------------------------------------------------------------- /hamr_host_copy.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_host_copy.h" 4 | #include "hamr_host_copy_impl.h" 5 | 6 | #define hamr_host_copy_instantiate_(T, U) \ 7 | template int hamr::copy_to_host_from_host(T *dest, const U *src, size_t n_elem); 8 | 9 | #define hamr_host_copy_instantiate(T) \ 10 | template int hamr::copy_to_host_from_host(T *dest, const T *src, size_t n_elem, void *); \ 11 | hamr_host_copy_instantiate_(T, float) \ 12 | hamr_host_copy_instantiate_(T, double) \ 13 | hamr_host_copy_instantiate_(T, char) \ 14 | hamr_host_copy_instantiate_(T, signed char) \ 15 | hamr_host_copy_instantiate_(T, short) \ 16 | hamr_host_copy_instantiate_(T, int) \ 17 | hamr_host_copy_instantiate_(T, long) \ 18 | hamr_host_copy_instantiate_(T, long long) \ 19 | hamr_host_copy_instantiate_(T, unsigned char) \ 20 | hamr_host_copy_instantiate_(T, unsigned short) \ 21 | hamr_host_copy_instantiate_(T, unsigned int) \ 22 | hamr_host_copy_instantiate_(T, unsigned long) \ 23 | hamr_host_copy_instantiate_(T, unsigned long long) 24 | 25 | hamr_host_copy_instantiate(float) 26 | hamr_host_copy_instantiate(double) 27 | 
 hamr_host_copy_instantiate(char) 28 | hamr_host_copy_instantiate(signed char) 29 | hamr_host_copy_instantiate(short) 30 | hamr_host_copy_instantiate(int) 31 | hamr_host_copy_instantiate(long) 32 | hamr_host_copy_instantiate(long long) 33 | hamr_host_copy_instantiate(unsigned char) 34 | hamr_host_copy_instantiate(unsigned short) 35 | hamr_host_copy_instantiate(unsigned int) 36 | hamr_host_copy_instantiate(unsigned long) 37 | hamr_host_copy_instantiate(unsigned long long) 38 | -------------------------------------------------------------------------------- /hamr_python_deleter.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_python_deleter_h 2 | #define hamr_python_deleter_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | 7 | namespace hamr 8 | { 9 | 10 | /// a deleter for memory managed from within Python 11 | /** This class manages an array allocated by a Python code. In the functor's 12 | * constructor a reference to a user-provided Python object is stolen. When the 13 | * functor is invoked, a reference to this Python object is released. It is up 14 | * to the Python object to free the memory. One may use a PyCapsule to 15 | * implement custom delete methods if they are needed. 16 | */ 17 | template 18 | class HAMR_EXPORT python_deleter 19 | { 20 | public: 21 | /** constructs the deleter. A reference to obj is stolen by this constructor. 22 | * @param[in] ptr a pointer to shared data 23 | * @param[in] n_elem the number of elements of type T shared 24 | * @param[in] obj a PyObject whose reference count will be decremented when 25 | * the data shared from Python is no longer needed. 26 | */ 27 | python_deleter(T *ptr, size_t n_elem, PyObject *obj); 28 | 29 | /** deletes the array 30 | * @param[in] ptr the pointer to the array to delete. must be the same as 31 | * that passed during construction. 
32 | */ 33 | void operator()(T *ptr); 34 | 35 | private: 36 | T *m_ptr; 37 | size_t m_elem; 38 | PyObject *m_object; 39 | }; 40 | 41 | } 42 | 43 | #if !defined(HAMR_SEPARATE_IMPL) 44 | #include "hamr_python_deleter_impl.h" 45 | #endif 46 | 47 | #endif 48 | -------------------------------------------------------------------------------- /.github/workflows/build_and_test_cuda.yml: -------------------------------------------------------------------------------- 1 | name: CUDA-HAMR 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build_and_test: 13 | runs-on: ubuntu-22.04 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: install_deps 19 | run: | 20 | export DEBIAN_FRONTEND="noninteractive" 21 | export TZ="America/Los_Angeles" 22 | sudo apt-get update -qq 23 | sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python3 nvidia-cuda-toolkit 24 | python3 -mvenv py3k_testing 25 | source py3k_testing/bin/activate 26 | python3 -mpip install numpy 27 | 28 | # build for CUDA 29 | - name: build_cuda 30 | run: | 31 | source py3k_testing/bin/activate 32 | mkdir build_cuda 33 | cd build_cuda 34 | cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DHAMR_ENABLE_CUDA=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-cuda .. 35 | make -j2 install 36 | cd .. 37 | 38 | # build for CUDA 39 | - name: build_cuda_separate_impl 40 | run: | 41 | source py3k_testing/bin/activate 42 | mkdir build_cuda_sep 43 | cd build_cuda_sep 44 | cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DHAMR_ENABLE_CUDA=ON -DHAMR_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-cuda-sep .. 45 | make -j2 install 46 | cd .. 
47 | -------------------------------------------------------------------------------- /hamr_openmp_device.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_openmp_device.h" 2 | 3 | #include 4 | #include 5 | 6 | namespace hamr 7 | { 8 | // ************************************************************************** 9 | int get_openmp_host_identifier(int &dev_id) 10 | { 11 | dev_id = omp_get_initial_device(); 12 | return 0; 13 | } 14 | 15 | // ************************************************************************** 16 | int get_active_openmp_device(int &dev_id) 17 | { 18 | dev_id = omp_get_default_device(); 19 | return 0; 20 | } 21 | 22 | // ************************************************************************** 23 | int set_active_openmp_device(int dev_id) 24 | { 25 | omp_set_default_device(dev_id); 26 | return 0; 27 | } 28 | 29 | // ************************************************************************** 30 | int HAMR_EXPORT get_openmp_device(const void *ptr, int &device_id) 31 | { 32 | (void)ptr; 33 | device_id = 0; 34 | return -1; 35 | } 36 | 37 | // -------------------------------------------------------------------------- 38 | activate_openmp_device::activate_openmp_device(int new_dev) : m_device(-1) 39 | { 40 | int cur_dev = -1; 41 | if (!get_active_openmp_device(cur_dev) && (cur_dev != new_dev) && 42 | !set_active_openmp_device(new_dev)) 43 | { 44 | m_device = cur_dev; 45 | } 46 | } 47 | 48 | // -------------------------------------------------------------------------- 49 | activate_openmp_device::~activate_openmp_device() 50 | { 51 | if (m_device >= 0) 52 | { 53 | set_active_openmp_device(m_device); 54 | } 55 | } 56 | 57 | } 58 | -------------------------------------------------------------------------------- /hamr_host_copy_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_host_copy_impl_h 2 | #define hamr_host_copy_impl_h 3 | 4 | #include 
"hamr_config.h" 5 | #include "hamr_env.h" 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | /// heterogeneous accelerator memory resource 13 | namespace hamr 14 | { 15 | 16 | // -------------------------------------------------------------------------- 17 | template 18 | int copy_to_host_from_host(T *dest, const U *src, size_t n_elem) 19 | { 20 | for (size_t i = 0; i < n_elem; ++i) 21 | { 22 | dest[i] = static_cast(src[i]); 23 | } 24 | 25 | #if defined(HAMR_VERBOSE) 26 | if (hamr::get_verbose()) 27 | { 28 | std::cerr << "hamr::copy_to_host_from_host " << n_elem 29 | << " from " << typeid(U).name() << sizeof(U) << " to " 30 | << typeid(T).name() << sizeof(T) << std::endl; 31 | } 32 | #endif 33 | 34 | return 0; 35 | } 36 | 37 | // -------------------------------------------------------------------------- 38 | template 39 | int copy_to_host_from_host(T *dest, const T *src, size_t n_elem, 40 | typename std::enable_if::value>::type *) 41 | { 42 | // copy src to gpu 43 | size_t n_bytes = n_elem*sizeof(T); 44 | memcpy(dest, src, n_bytes); 45 | 46 | #if defined(HAMR_VERBOSE) 47 | if (hamr::get_verbose()) 48 | { 49 | std::cerr << "hamr::copy_to_host_from_host same " << n_elem 50 | << " " << typeid(T).name() << sizeof(T) << std::endl; 51 | } 52 | #endif 53 | 54 | return 0; 55 | } 56 | 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /hamr_python_deleter.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_python_deleter.h" 4 | #include "hamr_python_deleter_impl.h" 5 | 6 | template class hamr::python_deleter; 7 | template class hamr::python_deleter; 8 | template class hamr::python_deleter; 9 | template class hamr::python_deleter; 10 | template class hamr::python_deleter; 11 | template class hamr::python_deleter; 12 | template class hamr::python_deleter; 13 | template class hamr::python_deleter; 14 | template class 
hamr::python_deleter; 15 | template class hamr::python_deleter; 16 | template class hamr::python_deleter; 17 | template class hamr::python_deleter; 18 | template class hamr::python_deleter; 19 | 20 | #include "hamr_buffer.h" 21 | #include "hamr_buffer_impl.h" 22 | 23 | #define hamr_buffer_instantiate_python(T) \ 24 | template hamr::buffer::buffer(allocator alloc, const hamr::stream &strm, transfer sync, size_t size, int owner, T *ptr, hamr::python_deleter df); 25 | 26 | hamr_buffer_instantiate_python(float) 27 | hamr_buffer_instantiate_python(double) 28 | hamr_buffer_instantiate_python(char) 29 | hamr_buffer_instantiate_python(signed char) 30 | hamr_buffer_instantiate_python(short) 31 | hamr_buffer_instantiate_python(int) 32 | hamr_buffer_instantiate_python(long) 33 | hamr_buffer_instantiate_python(long long) 34 | hamr_buffer_instantiate_python(unsigned char) 35 | hamr_buffer_instantiate_python(unsigned short) 36 | hamr_buffer_instantiate_python(unsigned int) 37 | hamr_buffer_instantiate_python(unsigned long) 38 | hamr_buffer_instantiate_python(unsigned long long) 39 | -------------------------------------------------------------------------------- /test/test_hamr_buffer_numpy_cuda.py: -------------------------------------------------------------------------------- 1 | from hamr import * 2 | import numpy as np 3 | import sys 4 | 5 | stderr = sys.__stderr__ 6 | 7 | n_elem = 256 8 | init_val = 3.1415 9 | mod_val = 10000 10 | res_val = init_val*mod_val 11 | 12 | # send data from C++ to Python 13 | stderr.write('TEST: C++ --> Python\n' \ 14 | '=======================\n') 15 | 16 | stderr.write('TEST: creating a hamr::buffer w. CUDA ... \n') 17 | buf = buffer_float(buffer_allocator_cuda, 16, 3.1415) 18 | stderr.write('buf = %s\n'%(str(buf))) 19 | stderr.write('TEST: creating a hamr::buffer w. CUDA ... OK!\n\n') 20 | 21 | stderr.write('TEST: get a handle to the data ... \n') 22 | h = buf.get_host_accessible() 23 | stderr.write('TEST: get a handle to the data ... 
OK!\n\n') 24 | 25 | stderr.write('TEST: share the data with Numpy ... \n') 26 | arr = np.array(h, copy=False) 27 | stderr.write('arr.__array_interface__ = %s\n'%(arr.__array_interface__)) 28 | stderr.write('TEST: share the data with Numpy ... OK!\n\n') 29 | 30 | stderr.write('TEST: deleting the hamr::buffer ... \n') 31 | buf = None 32 | stderr.write('TEST: deleting the hamr::buffer ... OK!\n\n') 33 | 34 | stderr.write('TEST: Numpy reads the data ... \n') 35 | stderr.write('arr = %s\n'%(str(arr))) 36 | stderr.write('TEST: Numpy reads the data ... OK!\n\n') 37 | 38 | stderr.write('TEST: Numpy modifies the data ... \n') 39 | arr *= 10000 40 | stderr.write('arr = %s\n'%(str(arr))) 41 | stderr.write('TEST: Numpy modifies the data ... OK!\n\n') 42 | 43 | stderr.write('TEST: Verify the result ... \n') 44 | if not np.allclose(arr, res_val): 45 | stderr.write('ERROR: TEST failed!\n') 46 | sys.exit(-1) 47 | stderr.write('TEST: Verify the result ... OK\n\n') 48 | 49 | stderr.write('TEST: deleting the Numpy array ... \n') 50 | arr = None 51 | stderr.write('TEST: deleting the Numpy array ... 
OK!\n\n') 52 | 53 | sys.exit(0) 54 | -------------------------------------------------------------------------------- /hamr_buffer_handle.i: -------------------------------------------------------------------------------- 1 | %{ 2 | #include "hamr_config.h" 3 | #include "hamr_buffer_handle.h" 4 | #include "hamr_gil_state.h" 5 | %} 6 | 7 | /*************************************************************************** 8 | * buffer handle 9 | **************************************************************************/ 10 | %ignore hamr::buffer_handle::buffer_handle(buffer_handle &&); 11 | %ignore hamr::buffer_handle::operator=; 12 | 13 | %include "hamr_buffer_handle.h" 14 | 15 | %extend hamr::buffer_handle 16 | { 17 | PyObject *__str__() 18 | { 19 | hamr::gil_state gil; 20 | std::ostringstream oss; 21 | self->to_stream(oss); 22 | return PyUnicode_FromString(oss.str().c_str()); 23 | } 24 | 25 | %pythoncode 26 | { 27 | @property 28 | def __array_interface__(self): 29 | return self.get_numpy_array_interface() 30 | 31 | @property 32 | def __cuda_array_interface__(self): 33 | return self.get_cuda_array_interface() 34 | } 35 | } 36 | 37 | /* named buffer_handles */ 38 | %template(buffer_handle_float) hamr::buffer_handle; 39 | %template(buffer_handle_double) hamr::buffer_handle; 40 | %template(buffer_handle_char) hamr::buffer_handle; 41 | %template(buffer_handle_short) hamr::buffer_handle; 42 | %template(buffer_handle_int) hamr::buffer_handle; 43 | %template(buffer_handle_long) hamr::buffer_handle; 44 | %template(buffer_handle_long_long) hamr::buffer_handle; 45 | %template(buffer_handle_unsigned_char) hamr::buffer_handle; 46 | %template(buffer_handle_unsigned_short) hamr::buffer_handle; 47 | %template(buffer_handle_unsigned_int) hamr::buffer_handle; 48 | %template(buffer_handle_unsigned_long) hamr::buffer_handle; 49 | %template(buffer_handle_unsigned_long_long) hamr::buffer_handle; 50 | -------------------------------------------------------------------------------- 
/hamr_hip_print_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_hip_print_impl_h 2 | #define hamr_hip_print_impl_h 3 | 4 | #include "hamr_config.h" 5 | #include "hamr_env.h" 6 | 7 | #if defined(HAMR_ENABLE_HIP) 8 | #include "hamr_hip_kernels.h" 9 | #include "hamr_hip_launch.h" 10 | #include 11 | #endif 12 | 13 | #include 14 | 15 | /// heterogeneous accelerator memory resource 16 | namespace hamr 17 | { 18 | 19 | /** prints an array on the GPU 20 | * @param[in] vals an array of n elements accessible in HIP 21 | * @param[in] n_elem the length of the array 22 | * @returns 0 if there were no errors 23 | */ 24 | template 25 | int hip_print(T *vals, size_t n_elem) 26 | { 27 | #if !defined(HAMR_ENABLE_HIP) 28 | (void) vals; 29 | (void) n_elem; 30 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 31 | " print_hip failed because HIP is not enabled." << std::endl; 32 | return -1; 33 | #else 34 | 35 | // get launch parameters 36 | int device_id = -1; 37 | dim3 block_grid; 38 | int n_blocks = 0; 39 | dim3 thread_grid = 0; 40 | if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, 41 | n_blocks, thread_grid)) 42 | { 43 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 44 | " Failed to determine launch properties." << std::endl; 45 | return -1; 46 | } 47 | 48 | // invoke the print kernel 49 | hipError_t ierr = hipSuccess; 50 | hamr::hip_kernels::print<<>>(vals, n_elem); 51 | if ((ierr = hipGetLastError()) != hipSuccess) 52 | { 53 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 54 | " Failed to launch the print kernel. 
" 55 | << hipGetErrorString(ierr) << std::endl; 56 | return -1; 57 | } 58 | 59 | return 0; 60 | #endif 61 | } 62 | 63 | } 64 | #endif 65 | -------------------------------------------------------------------------------- /hamr_copier_traits.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_copier_traits_h 2 | #define hamr_copier_traits_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | 7 | namespace hamr 8 | { 9 | /// @name type trait that enables object copy 10 | ///@{ 11 | template ::value || !std::is_arithmetic::value)> struct use_object_copier : std::false_type {}; 12 | template struct use_object_copier : std::true_type {}; 13 | template using use_object_copier_t = typename std::enable_if::value>::type; 14 | ///@} 15 | 16 | 17 | /// @name type trait that enables POD copy from different types 18 | ///@{ 19 | #if defined(HAMR_ENABLE_OBJECTS) 20 | template ::value)> struct use_cons_copier : std::false_type {}; 21 | template struct use_cons_copier : std::true_type {}; 22 | template using use_cons_copier_t = typename std::enable_if::value>::type; 23 | #else 24 | template ::value && std::is_arithmetic::value)> struct use_cons_copier : std::false_type {}; 25 | template struct use_cons_copier : std::true_type {}; 26 | template using use_cons_copier_t = typename std::enable_if::value>::type; 27 | #endif 28 | ///@} 29 | 30 | /// @name type trait that enables POD copy from the same types 31 | ///@{ 32 | template ::value && std::is_arithmetic::value)> struct use_bytes_copier : std::false_type {}; 33 | template struct use_bytes_copier : std::true_type {}; 34 | template using use_bytes_copier_t = typename std::enable_if::value>::type; 35 | ///@} 36 | 37 | } 38 | 39 | #endif 40 | -------------------------------------------------------------------------------- /hamr_cuda_print_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_cuda_print_impl_h 2 | #define 
hamr_cuda_print_impl_h 3 | 4 | #include "hamr_config.h" 5 | #include "hamr_env.h" 6 | #include "hamr_stream.h" 7 | #if defined(HAMR_ENABLE_CUDA) 8 | #include "hamr_cuda_kernels.h" 9 | #include "hamr_cuda_launch.h" 10 | #include 11 | #include 12 | #endif 13 | 14 | #include 15 | 16 | /// heterogeneous accelerator memory resource 17 | namespace hamr 18 | { 19 | 20 | /** prints an array on the GPU 21 | * @param[in] vals an array of n elements accessible in CUDA 22 | * @param[in] n_elem the length of the array 23 | * @returns 0 if there were no errors 24 | */ 25 | template 26 | int cuda_print(const hamr::stream &strm, T *vals, size_t n_elem) 27 | { 28 | #if !defined(HAMR_ENABLE_CUDA) 29 | (void) vals; 30 | (void) n_elem; 31 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 32 | " print_cuda failed because CUDA is not enabled." << std::endl; 33 | return -1; 34 | #else 35 | 36 | // get launch parameters 37 | int device_id = -1; 38 | dim3 block_grid; 39 | int n_blocks = 0; 40 | dim3 thread_grid = 0; 41 | if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid, 42 | n_blocks, thread_grid)) 43 | { 44 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 45 | " Failed to determine launch properties." << std::endl; 46 | return -1; 47 | } 48 | 49 | // invoke the print kernel 50 | cudaError_t ierr = cudaSuccess; 51 | hamr::cuda_kernels::print<<>>(vals, n_elem); 52 | if ((ierr = cudaGetLastError()) != cudaSuccess) 53 | { 54 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 55 | " Failed to launch the print kernel. 
" 56 | << cudaGetErrorString(ierr) << std::endl; 57 | return -1; 58 | } 59 | 60 | return 0; 61 | #endif 62 | } 63 | 64 | } 65 | #endif 66 | -------------------------------------------------------------------------------- /.github/workflows/build_and_test_hip.yml: -------------------------------------------------------------------------------- 1 | name: HIP-HAMR 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build_and_test: 13 | runs-on: ubuntu-20.04 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: install_deps 19 | run: | 20 | export DEBIAN_FRONTEND="noninteractive" 21 | export TZ="America/Los_Angeles" 22 | sudo apt-get update -qq 23 | sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python-dev 24 | wget https://repo.radeon.com/amdgpu-install/21.50.2/ubuntu/focal/amdgpu-install_21.50.2.50002-1_all.deb 25 | sudo apt-get install -qq -y ./amdgpu-install_21.50.2.50002-1_all.deb 26 | sudo amdgpu-install --usecase=rocm,openclsdk,hiplibsdk --no-dkms 27 | python3 -mvenv py3k_testing 28 | source py3k_testing/bin/activate 29 | python3 -mpip install numpy 30 | 31 | # build for HIP 32 | - name: build_hip 33 | run: | 34 | source py3k_testing/bin/activate 35 | mkdir build_hip 36 | cd build_hip 37 | cmake -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_HIP=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-hip .. 38 | make -j2 install 39 | cd .. 40 | 41 | # build for HIP 42 | - name: build_hip_separate_impl 43 | run: | 44 | source py3k_testing/bin/activate 45 | mkdir build_hip_sep 46 | cd build_hip_sep 47 | cmake -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_HIP=ON -DHAMR_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-hip-sep .. 
48 | make -j2 install 49 | cd .. 50 | 51 | -------------------------------------------------------------------------------- /.github/workflows/build_and_test_amd_openmp.yml: -------------------------------------------------------------------------------- 1 | name: AMD-OpenMP 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build_and_test: 13 | runs-on: ubuntu-22.04 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: install_deps 19 | run: | 20 | export DEBIAN_FRONTEND="noninteractive" 21 | export TZ="America/Los_Angeles" 22 | sudo apt-get update -qq 23 | sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python3 24 | wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/jammy/amdgpu-install_5.4.50403-1_all.deb 25 | sudo apt-get install ./amdgpu-install_5.4.50403-1_all.deb 26 | sudo amdgpu-install --usecase=rocm,openclsdk,hiplibsdk --no-dkms 27 | python3 -mvenv py3k_testing 28 | source py3k_testing/bin/activate 29 | python3 -mpip install numpy 30 | 31 | # build for OpenMP 32 | - name: build_openmp 33 | run: | 34 | source py3k_testing/bin/activate 35 | mkdir build_amd_openmp 36 | cd build_amd_openmp 37 | cmake -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DHAMR_OPENMP_ARCH=gfx1030 -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_OPENMP=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-amd-omp .. 38 | make -j2 install 39 | cd .. 40 | 41 | # build for OpenMP 42 | - name: build_openmp_separate_impl 43 | run: | 44 | source py3k_testing/bin/activate 45 | mkdir build_amd_openmp_sep 46 | cd build_amd_openmp_sep 47 | cmake -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DHAMR_OPENMP_ARCH=gfx1030 -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_OPENMP=ON -DHAMR_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-amd-omp-sep .. 48 | make -j2 install 49 | cd .. 
50 | 51 | -------------------------------------------------------------------------------- /hamr_stream.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_stream.h" 2 | 3 | #include 4 | 5 | namespace hamr 6 | { 7 | 8 | // -------------------------------------------------------------------------- 9 | int stream::synchronize() const 10 | { 11 | #if defined(HAMR_ENABLE_CUDA) 12 | if (const cudaStream_t *cs = std::get_if<1>(&m_stream)) 13 | { 14 | cudaStreamSynchronize(*cs); 15 | } 16 | #endif 17 | #if defined(HAMR_ENABLE_HIP) 18 | if (const hipStream_t *hs = std::get_if<2>(&m_stream)) 19 | { 20 | hipStreamSynchronize(*hs); 21 | } 22 | #endif 23 | return 0; 24 | } 25 | 26 | // -------------------------------------------------------------------------- 27 | stream::operator bool() const 28 | { 29 | if (std::get_if<1>(&m_stream)) 30 | { 31 | return true; 32 | } 33 | else if (std::get_if<2>(&m_stream)) 34 | { 35 | return true; 36 | } 37 | return false; 38 | } 39 | 40 | // -------------------------------------------------------------------------- 41 | size_t stream::get_stream() 42 | { 43 | #if defined(HAMR_ENABLE_CUDA) 44 | if (const cudaStream_t *cs = std::get_if<1>(&m_stream)) 45 | { 46 | return (size_t)*cs; 47 | } 48 | #endif 49 | #if defined(HAMR_ENABLE_HIP) 50 | if (const hipStream_t *hs = std::get_if<2>(&m_stream)) 51 | { 52 | return (size_t)*hs; 53 | } 54 | #endif 55 | return 2; 56 | } 57 | 58 | // -------------------------------------------------------------------------- 59 | void stream::print() const 60 | { 61 | #if defined(HAMR_ENABLE_CUDA) 62 | if (const cudaStream_t *cs = std::get_if<1>(&m_stream)) 63 | { 64 | std::cerr << "cudaStream_t m_stream = " << *cs << std::endl; 65 | return; 66 | } 67 | #endif 68 | #if defined(HAMR_ENABLE_HIP) 69 | if (const hipStream_t *hs = std::get_if<2>(&m_stream)) 70 | { 71 | std::cerr << "hipStream_t m_stream = " << *hs << std::endl; 72 | return; 73 | } 74 | #endif 75 | 
std::cerr << "empty" << std::endl; 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /hamr_cuda_device.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_cuda_device_h 2 | #define hamr_cuda_device_h 3 | 4 | #include "hamr_config.h" 5 | 6 | ///@file 7 | 8 | namespace hamr 9 | { 10 | /// gets the device identifier for the first GPU. @returns zero if successful. 11 | inline int HAMR_EXPORT get_cuda_device_identifier(int &dev_id) { dev_id = 0; return 0; } 12 | 13 | /// gets the device identifier for the host. @returns zero if successful. 14 | inline int HAMR_EXPORT get_cuda_host_identifier(int &dev_id) { dev_id = -1; return 0; } 15 | 16 | /// gets the currently atcive CUDA device. @returns zero if successful. 17 | int HAMR_EXPORT get_active_cuda_device(int &dev_id); 18 | 19 | /// sets the active CUDA device. returns zero if successful. 20 | int HAMR_EXPORT set_active_cuda_device(int dev_id); 21 | 22 | /// gets the device that owns the given pointer. @returns zero if successful. 23 | int HAMR_EXPORT get_cuda_device(const void *ptr, int &device_id); 24 | 25 | /** Activate the specified CUDA device, and restore the previously active 26 | * device when the object is destroyed. 27 | */ 28 | class HAMR_EXPORT activate_cuda_device 29 | { 30 | public: 31 | activate_cuda_device() = delete; 32 | activate_cuda_device(const activate_cuda_device &) = delete; 33 | void operator=(const activate_cuda_device &) = delete; 34 | 35 | activate_cuda_device(int id); 36 | ~activate_cuda_device(); 37 | 38 | private: 39 | int m_device; 40 | }; 41 | 42 | 43 | /** Activate peer to peer memory access between two devices, and deactivate when 44 | * the object goes out of scope. 45 | */ 46 | class access_cuda_peer 47 | { 48 | public: 49 | access_cuda_peer() : m_dest_device(-1), m_src_device(-1), m_symetric(false) {} 50 | ~access_cuda_peer() { disable(); } 51 | 52 | /// enable peer to peer access. 
the dest device must active. 53 | int enable(int dest_device, int src_device, bool symetric); 54 | 55 | /// disable peer to peer access. 56 | int disable(); 57 | 58 | private: 59 | int m_dest_device; 60 | int m_src_device; 61 | int m_symetric; 62 | }; 63 | 64 | 65 | } 66 | #endif 67 | -------------------------------------------------------------------------------- /hamr_new_allocator.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_new_allocator_h 2 | #define hamr_new_allocator_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | #include 7 | 8 | namespace hamr 9 | { 10 | 11 | /// a deleter for arrays allocated with new 12 | template 13 | class HAMR_EXPORT new_deleter 14 | { 15 | public: 16 | /** constructs the deleter 17 | * @param[in] ptr the pointer to the array to delete 18 | * @param[in] n the number of elements in the array 19 | */ 20 | new_deleter(T *ptr, size_t n); 21 | 22 | /** deletes the array 23 | * @param[in] ptr the pointer to the array to delete. must be the same as 24 | * that passed during construction. 25 | */ 26 | void operator()(T *ptr); 27 | 28 | private: 29 | T *m_ptr; 30 | size_t m_elem; 31 | }; 32 | 33 | 34 | 35 | 36 | 37 | 38 | /// a class for allocating arrays with new 39 | template 40 | struct HAMR_EXPORT new_allocator 41 | { 42 | /** allocate an array of n elements. 43 | * @param[in] n the number of elements to allocate 44 | * @returns a shared pointer to the array that holds a deleter for the memory 45 | */ 46 | static std::shared_ptr allocate(size_t n) HAMR_EXPORT; 47 | 48 | /** allocate an array of n elements. 49 | * @param[in] n the number of elements to allocate 50 | * @param[in] val a value to initialize the elements to 51 | * @returns a shared pointer to the array that holds a deleter for the memory 52 | */ 53 | static std::shared_ptr allocate(size_t n, const T &val) HAMR_EXPORT; 54 | 55 | /** allocate an array of n elements. 
56 | * @param[in] n the number of elements to allocate 57 | * @param[in] vals an array of n values to initialize the elements with 58 | * @returns a shared pointer to the array that holds a deleter for the memory 59 | */ 60 | template 61 | static std::shared_ptr allocate(size_t n, const U *vals) HAMR_EXPORT; 62 | }; 63 | 64 | } 65 | 66 | #if !defined(HAMR_SEPARATE_IMPL) 67 | #include "hamr_new_allocator_impl.h" 68 | #endif 69 | 70 | #endif 71 | -------------------------------------------------------------------------------- /.github/workflows/build_and_test_host.yml: -------------------------------------------------------------------------------- 1 | name: host-HAMR 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | workflow_dispatch: 10 | 11 | jobs: 12 | build_and_test: 13 | runs-on: ubuntu-20.04 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | 18 | - name: install_deps 19 | run: | 20 | export DEBIAN_FRONTEND="noninteractive" 21 | export TZ="America/Los_Angeles" 22 | sudo apt-get update -qq 23 | sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python-dev 24 | python3 -mvenv py3k_testing 25 | source py3k_testing/bin/activate 26 | python3 -mpip install numpy 27 | 28 | # build for host only 29 | - name: build_host 30 | run: | 31 | source py3k_testing/bin/activate 32 | mkdir build_host 33 | cd build_host 34 | cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-host .. 35 | make -j2 install 36 | cd .. 37 | 38 | # test the host build 39 | - name: test_host 40 | run: | 41 | source py3k_testing/bin/activate 42 | cd build_host 43 | tmp_llp=$LD_LIBRARY_PATH 44 | tmp_pp=$PYTHONPATH 45 | source ./bin/hamr_python_env.sh 46 | ctest --output-on-failure 47 | export LD_LIBRARY_PATH=$tmp_llp PYTHONPATH=$tmp_pp 48 | cd .. 
49 | 50 | # build for host only 51 | - name: build_host_separate_impl 52 | run: | 53 | source py3k_testing/bin/activate 54 | mkdir build_host_sep 55 | cd build_host_sep 56 | cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DHAMR_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-host-sep .. 57 | make -j2 install 58 | cd .. 59 | 60 | # test the host build 61 | - name: test_host_separate_impl 62 | run: | 63 | source py3k_testing/bin/activate 64 | cd build_host_sep 65 | tmp_llp=$LD_LIBRARY_PATH 66 | tmp_pp=$PYTHONPATH 67 | source ./bin/hamr_python_env.sh 68 | ctest --output-on-failure 69 | export LD_LIBRARY_PATH=$tmp_llp PYTHONPATH=$tmp_pp 70 | cd .. 71 | -------------------------------------------------------------------------------- /python/hamr_py.i: -------------------------------------------------------------------------------- 1 | %define DOCSTR 2 | "HAMR - Heterogeneous Accelerator Memory Resource. A library for autmated 3 | memory management on systems with heterogeneous accellerators." 4 | %enddef 5 | %module(docstring=DOCSTR) hamr 6 | %feature("autodoc", "3"); 7 | %{ 8 | #define SWIG_FILE_WITH_INIT 9 | 10 | #include 11 | 12 | #include "hamr_config.h" 13 | #include "hamr_buffer.h" 14 | #include "hamr_buffer_allocator.h" 15 | #include "hamr_buffer_handle.h" 16 | #include "hamr_python_deleter.h" 17 | #include "hamr_stream.h" 18 | 19 | #include 20 | #include 21 | 22 | /* disable some warnings that are present in SWIG generated code. 
*/ 23 | #if __GNUC__ > 8 24 | #pragma GCC diagnostic ignored "-Wcast-function-type" 25 | #endif 26 | #pragma GCC diagnostic ignored "-Wunused-parameter" 27 | #pragma GCC diagnostic ignored "-Wmissing-field-initializers" 28 | #pragma GCC diagnostic ignored "-Wdeprecated-declarations" 29 | #if defined(__CUDACC__) 30 | #pragma nv_diag_suppress = set_but_not_used 31 | #endif 32 | %} 33 | 34 | /* SWIG doesn't understand compiler attributes */ 35 | #define __attribute__(x) 36 | 37 | /* enable STL classes */ 38 | %include "shared_ptr.i" 39 | 40 | /*************************************************************************** 41 | * expose the build configuration 42 | **************************************************************************/ 43 | %include "hamr_config.h" 44 | 45 | /*************************************************************************** 46 | * buffer allocator enumerations 47 | **************************************************************************/ 48 | %include "hamr_buffer_allocator.i" 49 | 50 | /*************************************************************************** 51 | * buffer transfer mode enumerations 52 | **************************************************************************/ 53 | %include "hamr_buffer_transfer.i" 54 | 55 | /*************************************************************************** 56 | * stream 57 | **************************************************************************/ 58 | %include "hamr_stream.i" 59 | 60 | /*************************************************************************** 61 | * buffer_handle 62 | **************************************************************************/ 63 | %include "hamr_buffer_handle.i" 64 | 65 | /*************************************************************************** 66 | * buffer 67 | **************************************************************************/ 68 | %include "hamr_buffer.i" 69 | 
-------------------------------------------------------------------------------- /test/test_hamr_multi_gpu_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include "hamr_buffer.h" 2 | #include "hamr_buffer_util.h" 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | template 10 | void print(const hamr::buffer &buf) 11 | { 12 | auto [spbuf, pbuf] = hamr::get_host_accessible(buf); 13 | std::cerr << pbuf[0]; 14 | for (size_t i = 1; i < buf.size(); ++i) 15 | std::cerr << ", "<< pbuf[i]; 16 | std::cerr << std::endl; 17 | } 18 | 19 | 20 | int main(int argc, char **argv) 21 | { 22 | (void) argc; 23 | (void) argv; 24 | 25 | // get the number of GPUs 26 | int n_dev = 0; 27 | if (cudaGetDeviceCount(&n_dev) != cudaSuccess) 28 | { 29 | std::cerr << "ERROR: failed to get number of devices" << std::endl; 30 | return -1; 31 | } 32 | 33 | if (n_dev < 2) 34 | { 35 | std::cerr << "Can't run the test with " << n_dev << " CUDA devices" << std::endl; 36 | return 0; 37 | } 38 | 39 | // allocate some data on the host 40 | size_t n_elem = 1000; 41 | 42 | using T = int; 43 | T val = 31415; 44 | 45 | hamr::buffer *src = new hamr::buffer(hamr::buffer_allocator::malloc, n_elem, val); 46 | 47 | if (n_elem < 33) 48 | print(*src); 49 | 50 | // move to each GPU 51 | for (int i = 0; i < n_dev; ++i) 52 | { 53 | std::cerr << " ==== move to device " << i << " ==== " << std::endl; 54 | 55 | // move to GPU i 56 | if (cudaSetDevice(i) != cudaSuccess) 57 | { 58 | std::cerr << "ERROR: failed to set the active device to " << i << std::endl; 59 | return -1; 60 | } 61 | 62 | hamr::buffer *dest = new hamr::buffer(hamr::buffer_allocator::cuda, *src); 63 | 64 | if (n_elem < 33) 65 | print(*dest); 66 | 67 | // update the source 68 | delete src; 69 | src = dest; 70 | } 71 | 72 | // move back to the host 73 | std::cerr << " ==== move to host ==== " << std::endl; 74 | 75 | hamr::buffer end(hamr::buffer_allocator::malloc, *src); 76 | 77 | if (n_elem < 33) 78 | 
print(end); 79 | 80 | // check for 31415 81 | std::cerr << " ==== validate ==== " << std::endl; 82 | 83 | auto [spsrc, psrc] = hamr::get_host_accessible(*src); 84 | 85 | for (size_t i = 0; i < n_elem; ++i) 86 | { 87 | if (psrc[i] != val) 88 | { 89 | std::cerr << "ERROR: psrc[ " << i << "] == " << psrc[i] 90 | << " != " << val << std::endl; 91 | return -1; 92 | } 93 | } 94 | 95 | std::cerr << "All values verified to be " << val << std::endl; 96 | 97 | return 0; 98 | } 99 | -------------------------------------------------------------------------------- /test/test_hamr_multi_gpu_hip.cpp: -------------------------------------------------------------------------------- 1 | #include "hamr_buffer.h" 2 | #include "hamr_buffer_util.h" 3 | 4 | #include 5 | 6 | 7 | #include 8 | 9 | template 10 | void print(const hamr::buffer &buf) 11 | { 12 | auto [spbuf, pbuf] = hamr::get_host_accessible(buf); 13 | 14 | std::cerr << pbuf[0]; 15 | for (size_t i = 1; i < buf.size(); ++i) 16 | std::cerr << ", "<< pbuf[i]; 17 | std::cerr << std::endl; 18 | } 19 | 20 | 21 | int main(int argc, char **argv) 22 | { 23 | (void) argc; 24 | (void) argv; 25 | 26 | // get the number of GPUs 27 | int n_dev = 0; 28 | if (hipGetDeviceCount(&n_dev) != hipSuccess) 29 | { 30 | std::cerr << "ERROR: failed to get number of devices" << std::endl; 31 | return -1; 32 | } 33 | 34 | if (n_dev < 2) 35 | { 36 | std::cerr << "Can't run the test with " << n_dev << " HIP devices" << std::endl; 37 | return 0; 38 | } 39 | 40 | // allocate some data on the host 41 | size_t n_elem = 1000; 42 | 43 | using T = int; 44 | T val = 31415; 45 | 46 | hamr::buffer *src = new hamr::buffer(hamr::buffer_allocator::malloc, n_elem, val); 47 | 48 | if (n_elem < 33) 49 | print(*src); 50 | 51 | // move to each GPU 52 | for (int i = 0; i < n_dev; ++i) 53 | { 54 | std::cerr << " ==== move to device " << i << " ==== " << std::endl; 55 | 56 | // move to GPU i 57 | if (hipSetDevice(i) != hipSuccess) 58 | { 59 | std::cerr << "ERROR: failed to set 
the active device to " << i << std::endl; 60 | return -1; 61 | } 62 | 63 | hamr::buffer *dest = new hamr::buffer(hamr::buffer_allocator::hip, *src); 64 | 65 | if (n_elem < 33) 66 | print(*dest); 67 | 68 | // update the source 69 | delete src; 70 | src = dest; 71 | } 72 | 73 | // move back to the host 74 | std::cerr << " ==== move to host ==== " << std::endl; 75 | 76 | hamr::buffer end(hamr::buffer_allocator::malloc, *src); 77 | 78 | if (n_elem < 33) 79 | print(end); 80 | 81 | // check for 31415 82 | std::cerr << " ==== validate ==== " << std::endl; 83 | 84 | auto [spsrc, psrc] = hamr::get_host_accessible(*src); 85 | 86 | for (size_t i = 0; i < n_elem; ++i) 87 | { 88 | if (psrc[i] != val) 89 | { 90 | std::cerr << "ERROR: psrc[ " << i << "] == " << psrc[i] 91 | << " != " << val << std::endl; 92 | return -1; 93 | } 94 | } 95 | 96 | std::cerr << "All values verified to be " << val << std::endl; 97 | 98 | return 0; 99 | } 100 | -------------------------------------------------------------------------------- /hamr_hip_device.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_hip_device.h" 2 | 3 | #include 4 | 5 | #include 6 | 7 | 8 | namespace hamr 9 | { 10 | 11 | // ************************************************************************** 12 | int get_hip_device(const void *ptr, int &device_id) 13 | { 14 | hipError_t ierr = hipSuccess; 15 | hipPointerAttribute_t ptrAtts; 16 | ierr = hipPointerGetAttributes(&ptrAtts, ptr); 17 | 18 | // TODO -- HIP doesn;t yet have this feature of CUDA 19 | // these types of pointers are NOT accessible on the GPU 20 | // hipErrorInValue occurs when the pointer is unknown to HIP, as is 21 | // the case with pointers allocated by malloc or new. 
22 | /*if ((ierr == hipErrorInvalidValue) || 23 | ((ierr == hipSuccess) && ((ptrAtts.type == hipMemoryTypeHost) || 24 | (ptrAtts.type == hipMemoryTypeUnregistered)))) 25 | { 26 | // this is host backed memory not associate with a GPU 27 | device_id = -1; 28 | } 29 | else*/ if (ierr != hipSuccess) 30 | { 31 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 32 | " Failed to get pointer attributes for " << ptr << std::endl; 33 | return -1; 34 | } 35 | else 36 | { 37 | device_id = ptrAtts.device; 38 | } 39 | 40 | return 0; 41 | } 42 | 43 | // ************************************************************************** 44 | int get_active_hip_device(int &dev_id) 45 | { 46 | hipError_t ierr = hipSuccess; 47 | if ((ierr = hipGetDevice(&dev_id)) != hipSuccess) 48 | { 49 | std::cerr << "Failed to get the active HIP device. " 50 | << hipGetErrorString(ierr) << std::endl; 51 | return -1; 52 | } 53 | 54 | return 0; 55 | } 56 | 57 | // ************************************************************************** 58 | int set_active_hip_device(int dev_id) 59 | { 60 | hipError_t ierr = hipSuccess; 61 | if ((ierr = hipSetDevice(dev_id)) != hipSuccess) 62 | { 63 | std::cerr << "Failed to set the active HIP device. 
" 64 | << hipGetErrorString(ierr) << std::endl; 65 | return -1; 66 | } 67 | 68 | return 0; 69 | } 70 | 71 | 72 | // -------------------------------------------------------------------------- 73 | activate_hip_device::activate_hip_device(int new_dev) : m_device(-1) 74 | { 75 | int cur_dev = -1; 76 | if (!get_active_hip_device(cur_dev) && (cur_dev != new_dev) && 77 | !set_active_hip_device(new_dev)) 78 | { 79 | m_device = cur_dev; 80 | } 81 | } 82 | 83 | // -------------------------------------------------------------------------- 84 | activate_hip_device::~activate_hip_device() 85 | { 86 | if (m_device >= 0) 87 | { 88 | set_active_hip_device(m_device); 89 | } 90 | } 91 | 92 | } 93 | -------------------------------------------------------------------------------- /hamr_new_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_new_allocator.h" 4 | #include "hamr_new_allocator_impl.h" 5 | 6 | template class hamr::new_deleter; 7 | template class hamr::new_deleter; 8 | template class hamr::new_deleter; 9 | template class hamr::new_deleter; 10 | template class hamr::new_deleter; 11 | template class hamr::new_deleter; 12 | template class hamr::new_deleter; 13 | template class hamr::new_deleter; 14 | template class hamr::new_deleter; 15 | template class hamr::new_deleter; 16 | template class hamr::new_deleter; 17 | template class hamr::new_deleter; 18 | template class hamr::new_deleter; 19 | 20 | #define hamr_new_allocator_instantiate_members(_T) \ 21 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const float *vals); \ 22 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const double *vals); \ 23 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const char *vals); \ 24 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const signed char *vals); \ 25 | template std::shared_ptr<_T> 
hamr::new_allocator<_T>::allocate(size_t n, const short *vals); \ 26 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const int *vals); \ 27 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const long *vals); \ 28 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const long long *vals); \ 29 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned char *vals); \ 30 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned short *vals); \ 31 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned int *vals); \ 32 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned long *vals); \ 33 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned long long *vals); 34 | 35 | 36 | #define hamr_new_allocator_instantiate(_T) \ 37 | template struct hamr::new_allocator<_T>; \ 38 | hamr_new_allocator_instantiate_members(_T) 39 | 40 | hamr_new_allocator_instantiate(float) 41 | hamr_new_allocator_instantiate(double) 42 | hamr_new_allocator_instantiate(char) 43 | hamr_new_allocator_instantiate(signed char) 44 | hamr_new_allocator_instantiate(short) 45 | hamr_new_allocator_instantiate(int) 46 | hamr_new_allocator_instantiate(long) 47 | hamr_new_allocator_instantiate(long long) 48 | hamr_new_allocator_instantiate(unsigned char) 49 | hamr_new_allocator_instantiate(unsigned short) 50 | hamr_new_allocator_instantiate(unsigned int) 51 | hamr_new_allocator_instantiate(unsigned long) 52 | hamr_new_allocator_instantiate(unsigned long long) 53 | -------------------------------------------------------------------------------- /doc/rtd/conf.py: -------------------------------------------------------------------------------- 1 | import subprocess, os 2 | 3 | # Configuration file for the Sphinx documentation builder. 
4 | # 5 | # This file only contains a selection of the most common options. For a full 6 | # list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | # import os 16 | # import sys 17 | # sys.path.insert(0, os.path.abspath('.')) 18 | 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = 'HAMR' 23 | copyright = "2022, Burlen Loring" 24 | author = "Burlen Loring" 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | try: 29 | odir = os.environ['READTHEDOCS_OUTPUT'] 30 | except: 31 | os.environ['READTHEDOCS_OUTPUT'] = '_build' 32 | odir = os.environ['READTHEDOCS_OUTPUT'] 33 | 34 | if not os.path.exists(odir + '/html'): 35 | os.makedirs(odir + '/html') 36 | 37 | subprocess.call('doxygen --version', shell=True) 38 | subprocess.call('doxygen', shell=True) 39 | 40 | # Add any Sphinx extension module names here, as strings. They can be 41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 42 | # ones. 43 | 44 | # pip install sphinxcontrib-bibtex breathe 45 | extensions = ['sphinxcontrib.bibtex', 'breathe'] 46 | 47 | bibtex_bibfiles = ['bibliography.bib'] 48 | 49 | # Configuring Breathe 50 | breathe_projects = { 51 | "HAMR": "_build/xml" 52 | } 53 | breathe_default_project = "HAMR" 54 | 55 | # Add any paths that contain templates here, relative to this directory. 56 | templates_path = ['_templates'] 57 | 58 | # The master toctree document. 
59 | master_doc = 'index' 60 | 61 | # List of patterns, relative to source directory, that match files and 62 | # directories to ignore when looking for source files. 63 | # This pattern also affects html_static_path and html_extra_path. 64 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 65 | 66 | 67 | # -- Options for HTML output ------------------------------------------------- 68 | 69 | # The theme to use for HTML and HTML Help pages. See the documentation for 70 | # a list of builtin themes. 71 | # 72 | html_theme = 'sphinx_rtd_theme' 73 | 74 | # Add any paths that contain custom static files (such as style sheets) here, 75 | # relative to this directory. They are copied after the builtin static files, 76 | # so a file named "default.css" will overwrite the builtin "default.css". 77 | html_static_path = ['_static'] 78 | 79 | html_css_files = [ 80 | 'theme_overrides.css' # overrides for wide tables in RTD theme 81 | ] 82 | 83 | numfig = True 84 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### HAMR 2 | HAMR is a library defining an accelerator technology agnostic memory model that 3 | bridges between accelerator technologies (CUDA, HIP, ROCm, OpenMP, Kokkos, etc) 4 | and traditional CPUs in heterogeneous computing environments. HAMR is light 5 | weight and implemented in modern C++. HAMR includes Python integration that 6 | enables zero-copy data transfer between C++ and Python technologies such as Numba 7 | and Cupy. 8 | 9 | ### Citing 10 | If you've used HAMR in your application please cite us. 11 | 12 | [![DOI](https://zenodo.org/badge/429528113.svg)](https://zenodo.org/badge/latestdoi/429528113) 13 | 14 | ### Source Code 15 | The source code can be obtained at the [HAMR github repository](https://github.com/LBL-EESA/HAMR). 
16 | 17 | ### Documentation 18 | The [HAMR User's Guide](https://hamr.readthedocs.io/en/latest/) documents 19 | compiling and use of HAMR and contains simple examples. 20 | 21 | The [HAMR Doxygen site](https://hamr.readthedocs.io/en/latest/doxygen/index.html) documents the APIs. Most users will 22 | want to start with the [hamr::buffer](https://hamr.readthedocs.io/en/latest/doxygen/classhamr_1_1buffer.html), a 23 | container that has capabilities similar to std::vector and can provide access 24 | to data in different accelerator execution environments. 25 | 26 | ### Regression Testing and CI 27 | ![CPU-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_host.yml/badge.svg) 28 | ![CUDA-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_cuda.yml/badge.svg) 29 | ![HIP-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_hip.yml/badge.svg) 30 | ![AMD-OpenMP-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_amd_openmp.yml/badge.svg) 31 | 32 | ### License 33 | HAMR's [license](LICENSE) is a BSD license with an ADDED paragraph at the end that makes it easy for us to 34 | accept improvements. See [license](LICENSE) for more information. 35 | 36 | ## Copyright Notice 37 | HAMR - Heterogeneous Accelerator Memory Resource (HAMR) 38 | Copyright (c) 2022, The Regents of the University of California, through 39 | Lawrence Berkeley National Laboratory (subject to receipt of any 40 | required approvals from the U.S. Dept. of Energy). All rights reserved. 41 | 42 | If you have questions about your rights to use or distribute this software, 43 | please contact Berkeley Lab's Intellectual Property Office at 44 | IPO@lbl.gov. 45 | 46 | NOTICE. This Software was developed under funding from the U.S. Department 47 | of Energy and the U.S. Government consequently retains certain rights. As 48 | such, the U.S. 
Government has been granted for itself and others acting on 49 | its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the 50 | Software to reproduce, distribute copies to the public, prepare derivative 51 | works, and perform publicly and display publicly, and to permit others to do so. 52 | -------------------------------------------------------------------------------- /hamr_openmp_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_openmp_allocator.h" 4 | #include "hamr_openmp_allocator_impl.h" 5 | 6 | template class hamr::openmp_deleter; 7 | template class hamr::openmp_deleter; 8 | template class hamr::openmp_deleter; 9 | template class hamr::openmp_deleter; 10 | template class hamr::openmp_deleter; 11 | template class hamr::openmp_deleter; 12 | template class hamr::openmp_deleter; 13 | template class hamr::openmp_deleter; 14 | template class hamr::openmp_deleter; 15 | template class hamr::openmp_deleter; 16 | template class hamr::openmp_deleter; 17 | template class hamr::openmp_deleter; 18 | template class hamr::openmp_deleter; 19 | 20 | #define hamr_openmp_allocator_instantiate_members(_T) \ 21 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const float *vals); \ 22 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const double *vals); \ 23 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const char *vals); \ 24 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const signed char *vals); \ 25 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const short *vals); \ 26 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const int *vals); \ 27 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const long *vals); \ 28 | template std::shared_ptr<_T> 
hamr::openmp_allocator<_T>::allocate(size_t n, const long long *vals); \ 29 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned char *vals); \ 30 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned short *vals); \ 31 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned int *vals); \ 32 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned long *vals); \ 33 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned long long *vals); 34 | 35 | #define hamr_openmp_allocator_instantiate(_T) \ 36 | template struct hamr::openmp_allocator<_T>; \ 37 | hamr_openmp_allocator_instantiate_members(_T) 38 | 39 | hamr_openmp_allocator_instantiate(float) 40 | hamr_openmp_allocator_instantiate(double) 41 | hamr_openmp_allocator_instantiate(char) 42 | hamr_openmp_allocator_instantiate(signed char) 43 | hamr_openmp_allocator_instantiate(short) 44 | hamr_openmp_allocator_instantiate(int) 45 | hamr_openmp_allocator_instantiate(long) 46 | hamr_openmp_allocator_instantiate(long long) 47 | hamr_openmp_allocator_instantiate(unsigned char) 48 | hamr_openmp_allocator_instantiate(unsigned short) 49 | hamr_openmp_allocator_instantiate(unsigned int) 50 | hamr_openmp_allocator_instantiate(unsigned long) 51 | hamr_openmp_allocator_instantiate(unsigned long long) 52 | -------------------------------------------------------------------------------- /hamr_malloc_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_malloc_allocator.h" 4 | #include "hamr_malloc_allocator_impl.h" 5 | 6 | template class hamr::malloc_deleter; 7 | template class hamr::malloc_deleter; 8 | template class hamr::malloc_deleter; 9 | template class hamr::malloc_deleter; 10 | template class hamr::malloc_deleter; 11 | template class 
hamr::malloc_deleter<int>; 12 | template class hamr::malloc_deleter<long>; 13 | template class hamr::malloc_deleter<long long>; 14 | template class hamr::malloc_deleter<unsigned char>; 15 | template class hamr::malloc_deleter<unsigned short>; 16 | template class hamr::malloc_deleter<unsigned int>; 17 | template class hamr::malloc_deleter<unsigned long>; 18 | template class hamr::malloc_deleter<unsigned long long>;
hamr_malloc_allocator_instantiate(double) 42 | hamr_malloc_allocator_instantiate(char) 43 | hamr_malloc_allocator_instantiate(signed char) 44 | hamr_malloc_allocator_instantiate(short) 45 | hamr_malloc_allocator_instantiate(int) 46 | hamr_malloc_allocator_instantiate(long) 47 | hamr_malloc_allocator_instantiate(long long) 48 | hamr_malloc_allocator_instantiate(unsigned char) 49 | hamr_malloc_allocator_instantiate(unsigned short) 50 | hamr_malloc_allocator_instantiate(unsigned int) 51 | hamr_malloc_allocator_instantiate(unsigned long) 52 | hamr_malloc_allocator_instantiate(unsigned long long) 53 | -------------------------------------------------------------------------------- /test/test_hamr_buffer_cupy_cuda.py: -------------------------------------------------------------------------------- 1 | from hamr import * 2 | import cupy as cp 3 | import sys 4 | 5 | stderr = sys.__stderr__ 6 | 7 | n_elem = 256 8 | init_val = 3.1415 9 | mod_val = 10000 10 | res_val = init_val*mod_val 11 | 12 | # send data from C++ to Python 13 | stderr.write('TEST 1 : C++ --> Python\n' \ 14 | '=======================\n') 15 | 16 | stderr.write('TEST 1: creating a hamr::buffer w. CUDA ... \n') 17 | buf = buffer_float(buffer_allocator_cuda, n_elem, init_val) 18 | stderr.write('buf = %s\n'%(str(buf))) 19 | stderr.write('TEST 1: creating a hamr::buffer w. CUDA ... OK!\n\n') 20 | 21 | stderr.write('TEST 1: get a handle to the data ... \n') 22 | h = buf.get_cuda_accessible() 23 | stderr.write('TEST 1: get a handle to the data ... OK!\n\n') 24 | 25 | stderr.write('TEST 1: share the data with Cupy ... \n') 26 | arr = cp.array(h, copy=False) 27 | stderr.write('arr.__cuda_array_interface__ = %s\n'%(arr.__cuda_array_interface__)) 28 | stderr.write('TEST 1: share the data with Cupy ... OK!\n\n') 29 | 30 | stderr.write('TEST 1: deleting the hamr::buffer ... \n') 31 | buf = None 32 | h = None 33 | stderr.write('TEST 1: deleting the hamr::buffer ... 
OK!\n\n') 34 | 35 | stderr.write('TEST 1: Cupy modifies the data ... \n') 36 | arr *= mod_val 37 | stderr.write('arr = %s\n'%(str(arr))) 38 | stderr.write('TEST 1: Cupy modifies the data ... OK!\n\n') 39 | 40 | stderr.write('TEST 1: Verify the result ... \n') 41 | if not cp.allclose(arr, res_val): 42 | stderr.write('ERROR: TEST 1 failed!\n') 43 | sys.exit(-1) 44 | stderr.write('TEST 1: Verify the result ... OK\n\n') 45 | 46 | stderr.write('TEST 1: deleting the Cupy array ... \n') 47 | arr = None 48 | stderr.write('TEST 1: deleting the Cupy array ... OK!\n\n') 49 | 50 | 51 | 52 | # send data from Python to C++ 53 | stderr.write('TEST 2 : Python --> C++\n' \ 54 | '=======================\n') 55 | 56 | stderr.write('TEST 2: creating a Cupy array ... \n') 57 | arr = cp.full((n_elem), init_val, dtype='float32') 58 | stderr.write('arr.__cuda_array_interface__ = %s\n'%(arr.__cuda_array_interface__)) 59 | #stderr.write('arr = %s\n'%(str(arr))) 60 | stderr.write('TEST 2: creating a Cupy array ... OK\n\n') 61 | 62 | stderr.write('TEST 2: share the data with hamr::buffer ... \n') 63 | buf = buffer(arr) 64 | stderr.write('buf = %s\n'%(str(buf))) 65 | stderr.write('TEST 2: share the data with hamr::buffer ... OK\n\n') 66 | 67 | stderr.write('TEST 2: Cupy modifies the data ... \n') 68 | arr *= mod_val 69 | #stderr.write('arr = %s\n'%(str(arr))) 70 | stderr.write('TEST 2: Cupy modifies the data ... OK!\n\n') 71 | 72 | stderr.write('TEST 2: deleting the Cupy array ... \n') 73 | arr = None 74 | stderr.write('TEST 2: deleting the Cupy array ... OK!\n\n') 75 | 76 | stderr.write('TEST 2: display the modified hamr::buffer ... \n') 77 | stderr.write('buf = %s\n'%(str(buf))) 78 | stderr.write('TEST 2: display the modified hamr::buffer ... OK\n\n') 79 | 80 | stderr.write('TEST 2: deleting the hamr::buffer ... \n') 81 | buf = None 82 | stderr.write('TEST 2: deleting the hamr::buffer ... 
OK!\n\n') 83 | 84 | sys.exit(0) 85 | -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_policy(SET CMP0078 NEW) 2 | cmake_policy(SET CMP0086 NEW) 3 | 4 | find_package(Python3 COMPONENTS Interpreter Development REQUIRED) 5 | message("++ Executable: ${Python3_EXECUTABLE}") 6 | 7 | set(HAMR_PYTHON_SITE 8 | "${CMAKE_INSTALL_LIBDIR}/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages/" 9 | CACHE STRING "Where Python modules are compiled and installed.") 10 | 11 | set(HAMR_PYTHON_DIR "${HAMR_PYTHON_SITE}/hamr/" 12 | CACHE STRING "Where HAMR Python bindings are compiled and installed") 13 | 14 | message(STATUS "HAMR: Python modules will be installed at \"${HAMR_PYTHON_DIR}\"") 15 | 16 | find_package(SWIG COMPONENTS python) 17 | include(UseSWIG) 18 | 19 | set(swig_deps 20 | ../hamr_buffer_allocator.i 21 | ../hamr_buffer_handle.i 22 | ../hamr_buffer.i) 23 | 24 | set_property(SOURCE hamr_py.i PROPERTY CPLUSPLUS ON) 25 | set_property(SOURCE hamr_py.i PROPERTY DEPENDS ${swig_deps}) 26 | set_property(SOURCE hamr_py.i PROPERTY SWIG_MODULE_NAME hamr_py) 27 | 28 | list(APPEND hamr_py_sources hamr_py.i) 29 | if (HAMR_SEPARATE_IMPL) 30 | list(APPEND hamr_py_sources 31 | ../hamr_python_deleter.cxx 32 | ) 33 | endif() 34 | 35 | swig_add_library(hamr_py 36 | TYPE MODULE LANGUAGE python 37 | SOURCES ${hamr_py_sources} 38 | OUTPUT_DIR "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}" 39 | OUTFILE_DIR "${CMAKE_CURRENT_BINARY_DIR}") 40 | 41 | target_link_libraries(hamr_py ${Python3_LIBRARIES} hamr) 42 | 43 | target_include_directories(hamr_py 44 | PRIVATE "${Python3_INCLUDE_DIRS}" 45 | "${CMAKE_CURRENT_SOURCE_DIR}" 46 | "${CMAKE_CURRENT_SOURCE_DIR}/.." 47 | "${CMAKE_CURRENBT_BINARY_DIR}" 48 | "${CMAKE_CURRENBT_BINARY_DIR}/.." 
49 | ) 50 | 51 | set_property(TARGET hamr_py 52 | PROPERTY SWIG_USE_TARGET_INCLUDE_DIRECTORIES ON) 53 | 54 | set_target_properties(hamr_py PROPERTIES 55 | LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}") 56 | 57 | if (HAMR_ENABLE_CUDA AND NOT HAMR_NVHPC_CUDA) 58 | set_source_files_properties( 59 | "${CMAKE_CURRENT_BINARY_DIR}/hamr_pyPYTHON_wrap.cxx" 60 | PROPERTIES LANGUAGE CUDA) 61 | 62 | set_target_properties(hamr_py PROPERTIES 63 | CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}") 64 | endif() 65 | 66 | install(TARGETS hamr_py DESTINATION ${HAMR_PYTHON_DIR}) 67 | install(FILES ${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}/hamr_py.py 68 | DESTINATION ${HAMR_PYTHON_DIR}) 69 | 70 | if (APPLE) 71 | set_target_properties(hamr_py PROPERTIES INSTALL_RPATH "@loader_path/./") 72 | elseif(UNIX) 73 | set_target_properties(hamr_py PROPERTIES INSTALL_RPATH "\$ORIGIN/") 74 | endif() 75 | 76 | configure_file(hamr.py "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}/__init__.py" 77 | COPYONLY) 78 | 79 | install(FILES "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}/__init__.py" 80 | DESTINATION "${HAMR_PYTHON_DIR}") 81 | 82 | # capture python path for use in automated CI 83 | file(CONFIGURE OUTPUT "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}/hamr_python_env.sh" 84 | CONTENT 85 | [=[ 86 | #!/bin/bash 87 | export PYTHONPATH=@CMAKE_BINARY_DIR@/@HAMR_PYTHON_SITE@:$PYTHONPATH 88 | ]=] 89 | @ONLY) 90 | -------------------------------------------------------------------------------- /test/test_hamr_buffer_numpy_host.py: -------------------------------------------------------------------------------- 1 | from hamr import * 2 | import numpy as np 3 | import sys 4 | 5 | stderr = sys.__stderr__ 6 | 7 | n_elem = 256 8 | init_val = 3.1415 9 | mod_val = 10000 10 | res_val = init_val*mod_val 11 | 12 | # send data from C++ to Python 13 | stderr.write('TEST 1 : C++ --> Python\n' \ 14 | '=======================\n') 15 | 16 | stderr.write('TEST 1: creating a hamr::buffer host ... 
\n') 17 | buf = buffer_float(buffer_allocator_malloc, n_elem, init_val) 18 | stderr.write('buf = %s\n'%(str(buf))) 19 | stderr.write('TEST 1: creating a hamr::buffer host ... OK!\n\n') 20 | 21 | stderr.write('TEST 1: get a handle to the data ... \n') 22 | h = buf.get_host_accessible() 23 | stderr.write('TEST 1: get a handle to the data ... OK!\n\n') 24 | 25 | stderr.write('TEST 1: share the data with Numpy ... \n') 26 | arr = np.array(h, copy=False) 27 | stderr.write('arr.__array_interface__ = %s\n'%(arr.__array_interface__)) 28 | stderr.write('TEST 1: share the data with Numpy ... OK!\n\n') 29 | 30 | stderr.write('TEST 1: deleting the hamr::buffer ... \n') 31 | buf = None 32 | h = None 33 | stderr.write('TEST 1: deleting the hamr::buffer ... OK!\n\n') 34 | 35 | stderr.write('TEST 1: Numpy reads the data ... \n') 36 | stderr.write('arr = %s\n'%(str(arr))) 37 | stderr.write('TEST 1: Numpy reads the data ... OK!\n\n') 38 | 39 | stderr.write('TEST 1: Numpy modifies the data ... \n') 40 | arr *= mod_val 41 | stderr.write('arr = %s\n'%(str(arr))) 42 | stderr.write('TEST 1: Numpy modifies the data ... OK!\n\n') 43 | 44 | stderr.write('TEST 1: Verify the result ... \n') 45 | if not np.allclose(arr, res_val): 46 | stderr.write('ERROR: TEST 1 failed!\n') 47 | sys.exit(-1) 48 | stderr.write('TEST 1: Verify the result ... OK\n\n') 49 | 50 | stderr.write('TEST 1: deleting the Numpy array ... \n') 51 | arr = None 52 | stderr.write('TEST 1: deleting the Numpy array ... OK!\n\n') 53 | 54 | 55 | 56 | # send data from Python to C++ 57 | stderr.write('TEST 2 : Python --> C++\n' \ 58 | '=======================\n') 59 | 60 | stderr.write('TEST 2: creating a Numpy array ... \n') 61 | arr = np.full((n_elem), init_val, dtype='float32') 62 | stderr.write('arr.__array_interface__ = %s\n'%(arr.__array_interface__)) 63 | #stderr.write('arr = %s\n'%(str(arr))) 64 | stderr.write('TEST 2: creating a Numpy array ... OK\n\n') 65 | 66 | stderr.write('TEST 2: share the data with hamr::buffer ... 
\n') 67 | buf = buffer(arr) 68 | stderr.write('buf = %s\n'%(str(buf))) 69 | stderr.write('TEST 2: share the data with hamr::buffer ... OK\n\n') 70 | 71 | stderr.write('TEST 2: Numpy modifies the data ... \n') 72 | arr *= mod_val 73 | #stderr.write('arr = %s\n'%(str(arr))) 74 | stderr.write('TEST 2: Numpy modifies the data ... OK!\n\n') 75 | 76 | stderr.write('TEST 2: deleting the Numpy array ... \n') 77 | arr = None 78 | stderr.write('TEST 2: deleting the Numpy array ... OK!\n\n') 79 | 80 | stderr.write('TEST 2: display the modified hamr::buffer ... \n') 81 | stderr.write('buf = %s\n'%(str(buf))) 82 | stderr.write('TEST 2: display the modified hamr::buffer ... OK\n\n') 83 | 84 | stderr.write('TEST 2: deleting the hamr::buffer ... \n') 85 | buf = None 86 | stderr.write('TEST 2: deleting the hamr::buffer ... OK!\n\n') 87 | 88 | sys.exit(0) 89 | -------------------------------------------------------------------------------- /hamr_hip_malloc_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_hip_malloc_allocator.h" 4 | #include "hamr_hip_malloc_allocator_impl.h" 5 | 6 | template class hamr::hip_malloc_deleter; 7 | template class hamr::hip_malloc_deleter; 8 | template class hamr::hip_malloc_deleter; 9 | template class hamr::hip_malloc_deleter; 10 | template class hamr::hip_malloc_deleter; 11 | template class hamr::hip_malloc_deleter; 12 | template class hamr::hip_malloc_deleter; 13 | template class hamr::hip_malloc_deleter; 14 | template class hamr::hip_malloc_deleter; 15 | template class hamr::hip_malloc_deleter; 16 | template class hamr::hip_malloc_deleter; 17 | template class hamr::hip_malloc_deleter; 18 | template class hamr::hip_malloc_deleter; 19 | 20 | #define hamr_hip_malloc_allocator_instantiate_members(_T) \ 21 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const float *vals, bool hipVals); \ 22 | template std::shared_ptr<_T> 
hamr::hip_malloc_allocator<_T>::allocate(size_t n, const double *vals, bool hipVals); \ 23 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const char *vals, bool hipVals); \ 24 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const signed char *vals, bool hipVals); \ 25 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const short *vals, bool hipVals); \ 26 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const int *vals, bool hipVals); \ 27 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const long *vals, bool hipVals); \ 28 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const long long *vals, bool hipVals); \ 29 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned char *vals, bool hipVals); \ 30 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned short *vals, bool hipVals); \ 31 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned int *vals, bool hipVals); \ 32 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned long *vals, bool hipVals); \ 33 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned long long *vals, bool hipVals); \ 34 | 35 | #define hamr_hip_malloc_allocator_instantiate(_T) \ 36 | template struct hamr::hip_malloc_allocator<_T>; \ 37 | hamr_hip_malloc_allocator_instantiate_members(_T) 38 | 39 | hamr_hip_malloc_allocator_instantiate(float) 40 | hamr_hip_malloc_allocator_instantiate(double) 41 | hamr_hip_malloc_allocator_instantiate(char) 42 | hamr_hip_malloc_allocator_instantiate(signed char) 43 | hamr_hip_malloc_allocator_instantiate(short) 44 | hamr_hip_malloc_allocator_instantiate(int) 45 | hamr_hip_malloc_allocator_instantiate(long) 
46 | hamr_hip_malloc_allocator_instantiate(long long) 47 | hamr_hip_malloc_allocator_instantiate(unsigned char) 48 | hamr_hip_malloc_allocator_instantiate(unsigned short) 49 | hamr_hip_malloc_allocator_instantiate(unsigned int) 50 | hamr_hip_malloc_allocator_instantiate(unsigned long) 51 | hamr_hip_malloc_allocator_instantiate(unsigned long long) 52 | -------------------------------------------------------------------------------- /hamr_new_allocator_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_new_allocator_impl_h 2 | #define hamr_new_allocator_impl_h 3 | 4 | //#include "hamr_new_allocator.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | namespace hamr 14 | { 15 | 16 | // -------------------------------------------------------------------------- 17 | template 18 | new_deleter::new_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n) 19 | { 20 | #if defined(HAMR_VERBOSE) 21 | if (hamr::get_verbose()) 22 | { 23 | std::cerr << "created new_deleter for array of " << n 24 | << " objects of type " << typeid(T).name() << std::endl; 25 | } 26 | #endif 27 | } 28 | 29 | // -------------------------------------------------------------------------- 30 | template 31 | void new_deleter::operator()(T *ptr) 32 | { 33 | assert(ptr == m_ptr); 34 | 35 | #if defined(HAMR_VERBOSE) 36 | if (hamr::get_verbose()) 37 | { 38 | std::cerr << "new_deleter deleting array of " << m_elem 39 | << " objects of type " << typeid(T).name() << std::endl; 40 | } 41 | #endif 42 | 43 | delete [] ptr; 44 | } 45 | 46 | 47 | 48 | 49 | 50 | 51 | // -------------------------------------------------------------------------- 52 | template 53 | std::shared_ptr new_allocator::allocate(size_t n) 54 | { 55 | #if defined(HAMR_VERBOSE) 56 | if (hamr::get_verbose()) 57 | { 58 | std::cerr << "new_allocator allocating array of " << n 59 | << " objects of type " << typeid(T).name() << std::endl; 60 | } 61 | #endif 62 
| 63 | // allocate 64 | T *ptr = new T[n]; 65 | 66 | // package 67 | return std::shared_ptr(ptr, new_deleter(ptr, n)); 68 | } 69 | 70 | // -------------------------------------------------------------------------- 71 | template 72 | std::shared_ptr new_allocator::allocate(size_t n, const T &val) 73 | { 74 | #if defined(HAMR_VERBOSE) 75 | if (hamr::get_verbose()) 76 | { 77 | std::cerr << "new_allocator allocating array of " << n 78 | << " objects of type " << typeid(T).name() << " initialized" 79 | << std::endl; 80 | } 81 | #endif 82 | 83 | // allocate 84 | T *ptr = (T*)new unsigned char[n*sizeof(T)]; 85 | 86 | // construct 87 | for (size_t i = 0; i < n; ++i) 88 | new (&ptr[i]) T(val); 89 | 90 | // package 91 | return std::shared_ptr(ptr, new_deleter(ptr, n)); 92 | } 93 | 94 | // -------------------------------------------------------------------------- 95 | template 96 | template 97 | std::shared_ptr new_allocator::allocate(size_t n, const U *vals) 98 | { 99 | #if defined(HAMR_VERBOSE) 100 | if (hamr::get_verbose()) 101 | { 102 | std::cerr << "new_allocator allocating array of " << n 103 | << " objects of type " << typeid(T).name() << " initialized" 104 | << std::endl; 105 | } 106 | #endif 107 | 108 | // allocate 109 | T *ptr = (T*)new unsigned char[n*sizeof(T)]; 110 | 111 | // construct 112 | for (size_t i = 0; i < n; ++i) 113 | new (&ptr[i]) T(vals[i]); 114 | 115 | // package 116 | return std::shared_ptr(ptr, new_deleter(ptr, n)); 117 | } 118 | 119 | }; 120 | 121 | #endif 122 | -------------------------------------------------------------------------------- /hamr_cuda_malloc_host_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_cuda_malloc_host_allocator.h" 4 | #include "hamr_cuda_malloc_host_allocator_impl.h" 5 | 6 | template class hamr::cuda_malloc_host_deleter; 7 | template class hamr::cuda_malloc_host_deleter; 8 | template class hamr::cuda_malloc_host_deleter; 9 
template class hamr::cuda_malloc_host_deleter<signed char>; 10 | template class hamr::cuda_malloc_host_deleter<short>; 11 | template class hamr::cuda_malloc_host_deleter<int>; 12 | template class hamr::cuda_malloc_host_deleter<long>; 13 | template class hamr::cuda_malloc_host_deleter<long long>; 14 | template class hamr::cuda_malloc_host_deleter<unsigned char>; 15 | template class hamr::cuda_malloc_host_deleter<unsigned short>; 16 | template class hamr::cuda_malloc_host_deleter<unsigned int>; 17 | template class hamr::cuda_malloc_host_deleter<unsigned long>; 18 | template class hamr::cuda_malloc_host_deleter<unsigned long long>;
template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const unsigned long long *vals); 34 | 35 | 36 | #define hamr_cuda_malloc_host_allocator_instantiate(_T) \ 37 | template struct hamr::cuda_malloc_host_allocator<_T>; \ 38 | hamr_cuda_malloc_host_allocator_instantiate_members(_T) 39 | 40 | hamr_cuda_malloc_host_allocator_instantiate(float) 41 | hamr_cuda_malloc_host_allocator_instantiate(double) 42 | hamr_cuda_malloc_host_allocator_instantiate(char) 43 | hamr_cuda_malloc_host_allocator_instantiate(signed char) 44 | hamr_cuda_malloc_host_allocator_instantiate(short) 45 | hamr_cuda_malloc_host_allocator_instantiate(int) 46 | hamr_cuda_malloc_host_allocator_instantiate(long) 47 | hamr_cuda_malloc_host_allocator_instantiate(long long) 48 | hamr_cuda_malloc_host_allocator_instantiate(unsigned char) 49 | hamr_cuda_malloc_host_allocator_instantiate(unsigned short) 50 | hamr_cuda_malloc_host_allocator_instantiate(unsigned int) 51 | hamr_cuda_malloc_host_allocator_instantiate(unsigned long) 52 | hamr_cuda_malloc_host_allocator_instantiate(unsigned long long) 53 | -------------------------------------------------------------------------------- /test/test_hamr_pipeline_cuda_openmp.cpp: -------------------------------------------------------------------------------- 1 | #include "hamr_buffer.h" 2 | #include "hamr_buffer_util.h" 3 | 4 | #include 5 | 6 | using hamr::buffer; 7 | using allocator = hamr::buffer_allocator; 8 | 9 | // with LLVM Clang CUDA and OpenMP need to be compiled in separate 10 | // translation units. 
11 | 12 | // 13 | // CUDA kernels 14 | // 15 | template 16 | buffer initialize_cuda(size_t n_vals, const T &val) HAMR_EXPORT; 17 | 18 | template 19 | buffer add_cuda(const buffer &a1, const buffer &a2) HAMR_EXPORT; 20 | 21 | template 22 | buffer multiply_scalar_cuda(const buffer &ai, const U &val) HAMR_EXPORT; 23 | 24 | // 25 | // OpenMP kernels 26 | // 27 | template 28 | buffer initialize_openmp(size_t n_vals, const T &val) HAMR_EXPORT; 29 | 30 | template 31 | buffer add_openmp(const buffer &a1, const buffer &a2) HAMR_EXPORT; 32 | 33 | template 34 | buffer multiply_scalar_openmp(const buffer &ai, const U &val) HAMR_EXPORT; 35 | 36 | 37 | 38 | // ************************************************************************** 39 | template 40 | int compare_int(const buffer &ain, int val) 41 | { 42 | size_t n_vals = ain.size(); 43 | std::cerr << "comparing array with " << n_vals << " elements to " << val << std::endl; 44 | 45 | buffer ai(ain.get_allocator(), n_vals); 46 | ain.get(ai); 47 | 48 | auto [spai, pai] = hamr::get_host_accessible(ai); 49 | 50 | if (n_vals < 33) 51 | { 52 | ai.print(); 53 | } 54 | 55 | for (size_t i = 0; i < n_vals; ++i) 56 | { 57 | if (pai[i] != val) 58 | { 59 | std::cerr << "ERROR: pai[" << i << "] = " 60 | << pai[i] << " != " << val << std::endl; 61 | return -1; 62 | } 63 | } 64 | 65 | std::cerr << "all elements are equal to " << val << std::endl; 66 | 67 | return 0; 68 | } 69 | 70 | 71 | 72 | int main(int, char **) 73 | { 74 | size_t n_vals = 100000; 75 | 76 | buffer ao0(allocator::cuda, n_vals, 1.0f); // = 1 (CUDA) 77 | buffer ao1 = multiply_scalar_cuda(ao0, 2.0f); // = 2 (CUDA) 78 | ao0.free(); 79 | 80 | buffer ao2 = initialize_openmp(n_vals, 2.0); // = 2 (OpenMP) 81 | buffer ao3 = add_openmp(ao2, ao1); // = 4 (OpenMP w/ CUDA data) 82 | ao1.free(); 83 | ao2.free(); 84 | 85 | buffer ao4 = multiply_scalar_cuda(ao3, 1000.0); // = 4000 (CUDA w/ OpenMP data) 86 | ao3.free(); 87 | 88 | buffer ao5(allocator::malloc, n_vals, 3.0f); // = 3 (host) 89 
| buffer ao6 = multiply_scalar_cuda(ao5, 100.0f); // = 300 (CUDA) 90 | ao5.free(); 91 | 92 | buffer ao7(allocator::openmp, n_vals); // = uninit (OpenMP) 93 | ao7.set(ao6); // = 300 (CUDA to OpenMP) 94 | ao6.free(); 95 | 96 | buffer ao8(allocator::cuda, n_vals); // = uninit (CUDA) 97 | ao8.set(ao7); // = 300 (OpenMP to CUDA) 98 | ao7.free(); 99 | 100 | buffer ao9 = add_cuda(ao4, ao8); // = 4300 (CUDA) 101 | ao4.free(); 102 | ao8.free(); 103 | 104 | return compare_int(ao8, 4300); 105 | } 106 | -------------------------------------------------------------------------------- /hamr_device.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_device_h 2 | #define hamr_device_h 3 | 4 | #include "hamr_config.h" 5 | #if defined(HAMR_ENABLE_CUDA) 6 | #include "hamr_cuda_device.h" 7 | #elif defined(HAMR_ENABLE_HIP) 8 | #include "hamr_hip_device.h" 9 | #elif defined(HAMR_ENABLE_OPENMP) 10 | #include "hamr_openmp_device.h" 11 | #endif 12 | 13 | ///@file 14 | 15 | namespace hamr 16 | { 17 | /// gets the device identifier for the first GPU. @returns zero if successful. 18 | inline int HAMR_EXPORT get_device_identifier(int &dev_id) 19 | { 20 | #if defined(HAMR_ENABLE_CUDA) 21 | return get_cuda_device_identifier(dev_id); 22 | #elif defined(HAMR_ENABLE_HIP) 23 | return get_hip_device_identifier(dev_id); 24 | #elif defined(HAMR_ENABLE_OPENMP) 25 | return get_openmp_device_identifier(dev_id); 26 | #else 27 | dev_id = -1; 28 | return 0; 29 | #endif 30 | } 31 | 32 | /// gets the device identifier for the host. @returns zero if successful. 
33 | inline int HAMR_EXPORT get_host_identifier(int &dev_id) 34 | { 35 | #if defined(HAMR_ENABLE_CUDA) 36 | return get_cuda_host_identifier(dev_id); 37 | #elif defined(HAMR_ENABLE_HIP) 38 | return get_hip_host_identifier(dev_id); 39 | #elif defined(HAMR_ENABLE_OPENMP) 40 | return get_openmp_host_identifier(dev_id); 41 | #else 42 | dev_id = -1; 43 | return 0; 44 | #endif 45 | } 46 | 47 | /// gets the currently atcive device. @returns zero if successful. 48 | inline int HAMR_EXPORT get_active_device(int &dev_id) 49 | { 50 | #if defined(HAMR_ENABLE_CUDA) 51 | return get_active_cuda_device(dev_id); 52 | #elif defined(HAMR_ENABLE_HIP) 53 | return get_active_hip_device(dev_id); 54 | #elif defined(HAMR_ENABLE_OPENMP) 55 | return get_active_openmp_device(dev_id); 56 | #else 57 | dev_id = -1; 58 | return 0; 59 | #endif 60 | } 61 | 62 | /// sets the active device. returns zero if successful. 63 | inline int HAMR_EXPORT set_active_device(int dev_id) 64 | { 65 | #if defined(HAMR_ENABLE_CUDA) 66 | return set_active_cuda_device(dev_id); 67 | #elif defined(HAMR_ENABLE_HIP) 68 | return set_active_hip_device(dev_id); 69 | #elif defined(HAMR_ENABLE_OPENMP) 70 | return set_active_openmp_device(dev_id); 71 | #else 72 | return 0; 73 | #endif 74 | } 75 | 76 | /// gets the device that owns the given pointer. @returns zero if successful. 
77 | inline int HAMR_EXPORT get_device(const void *ptr, int &device_id) 78 | { 79 | #if defined(HAMR_ENABLE_CUDA) 80 | return get_cuda_device(ptr, device_id); 81 | #elif defined(HAMR_ENABLE_HIP) 82 | return get_hip_device(ptr, device_id); 83 | #elif defined(HAMR_ENABLE_OPENMP) 84 | return get_openmp_device(ptr, device_id); 85 | #else 86 | device_id = -1; 87 | return 0; 88 | #endif 89 | } 90 | 91 | #if defined(HAMR_ENABLE_CUDA) 92 | using activate_device = activate_cuda_device; 93 | #elif defined(HAMR_ENABLE_HIP) 94 | using activate_device = activate_hip_device; 95 | #elif defined(HAMR_ENABLE_OPENMP) 96 | using activate_device = activate_openmp_device; 97 | #else 98 | /** Activate the specified device, and restore the previously active 99 | * device when the object is destroyed. 100 | */ 101 | class HAMR_EXPORT activate_device 102 | { 103 | public: 104 | activate_device() = delete; 105 | activate_device(const activate_device &) = delete; 106 | void operator=(const activate_device &) = delete; 107 | activate_device(int) {} 108 | ~activate_device() {} 109 | }; 110 | #endif 111 | 112 | } 113 | #endif 114 | -------------------------------------------------------------------------------- /cmake/hamr_omp_offload.cmake: -------------------------------------------------------------------------------- 1 | # Get the OpenMP device offload flags for the current C++ compiler. 2 | # 3 | # TARGET 4 | # names the target for offloading (optional). 5 | # 6 | # ARCH 7 | # names the architcure to compiler for (optional). 8 | # 9 | # ADD_FLAGS 10 | # additional flags that may be needed (optional). 11 | # 12 | # RESULT 13 | # the flags are stored in this variable. 
14 | # 15 | function(get_offload_compile_flags) 16 | set(opts "") 17 | set(nvpo ARCH TARGET ADD_FLAGS RESULT) 18 | set(mvo) 19 | cmake_parse_arguments(PARSE_ARGV 0 OMP_DO "${opts}" "${nvpo}" "${mvo}") 20 | set(tmp) 21 | if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") 22 | set(tmp "-fopenmp --offload-new-driver") 23 | if (OMP_DO_TARGET) 24 | set(tmp "${tmp} -fopenmp-targets=${OMP_DO_TARGET}") 25 | endif() 26 | if (OMP_DO_ARCH) 27 | set(tmp "${tmp} --offload-arch=${OMP_DO_ARCH}") 28 | endif() 29 | elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM") 30 | set(tmp "-qopenmp") 31 | if (OMP_DO_TARGET) 32 | set(tmp "${tmp} -fopenmp-targets=${OMP_DO_TARGET}") 33 | endif() 34 | elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") 35 | set(tmp "-fopenmp") 36 | if (OMP_DO_TARGET) 37 | set(tmp "${tmp} -foffload=${OMP_DO_TARGET}") 38 | endif() 39 | if (OMP_DO_ARCH) 40 | set(tmp "${tmp} --offload-options=${OMP_DO_TARGET}=-march=${OMP_DO_ARCH}") 41 | endif() 42 | elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "NVHPC") 43 | set(tmp "-mp=gpu") 44 | if (OMP_DO_ARCH) 45 | set(tmp "${tmp} -gpu=${OMP_DO_ARCH}") 46 | endif() 47 | endif() 48 | if (OMP_DO_ADD_FLAGS) 49 | set(tmp "${tmp} ${OMP_DO_ADD_FLAGS}") 50 | endif() 51 | if ("${tmp}" STREQUAL "") 52 | message(WARNING "OpenMP offload compiler flags not known for ${CMAKE_CXX_COMPILER_ID}") 53 | else() 54 | message(STATUS "OpenMP offload compiler flags for ${CMAKE_CXX_COMPILER_ID} are ${tmp}") 55 | endif() 56 | set(${OMP_DO_RESULT} ${tmp} PARENT_SCOPE) 57 | endfunction() 58 | 59 | # Get the OpenMP device offload flags for the current C++ compiler. 60 | # 61 | # TARGET 62 | # names the target for offloading (optional). 63 | # 64 | # ARCH 65 | # names the architcure to compiler for (optional). 66 | # 67 | # ADD_FLAGS 68 | # additional flags that may be needed (optional). 69 | # 70 | # RESULT 71 | # the flags are stored in this variable. 
72 | # 73 | function(get_offload_link_flags) 74 | set(opts "") 75 | set(nvpo ARCH TARGET ADD_FLAGS RESULT) 76 | set(mvo) 77 | cmake_parse_arguments(PARSE_ARGV 0 OMP_DO "${opts}" "${nvpo}" "${mvo}") 78 | set(tmp) 79 | if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") 80 | list(APPEND tmp -fopenmp --offload-new-driver) 81 | if (OMP_DO_TARGET) 82 | list(APPEND tmp -fopenmp-targets=${OMP_DO_TARGET}) 83 | endif() 84 | if (OMP_DO_ARCH) 85 | list(APPEND tmp --offload-arch=${OMP_DO_ARCH}) 86 | endif() 87 | elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM") 88 | elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") 89 | elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "NVHPC") 90 | endif() 91 | if (OMP_DO_ADD_FLAGS) 92 | set(tmp "${tmp} ${OMP_DO_ADD_FLAGS}") 93 | endif() 94 | message(STATUS "OpenMP offload linker flags for ${CMAKE_CXX_COMPILER_ID} are ${tmp}") 95 | set(${OMP_DO_RESULT} ${tmp} PARENT_SCOPE) 96 | endfunction() 97 | -------------------------------------------------------------------------------- /hamr_stream.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_stream_h 2 | #define hamr_stream_h 3 | 4 | ///@file 5 | 6 | #include "hamr_config.h" 7 | 8 | #include 9 | #include 10 | 11 | #if defined(HAMR_ENABLE_CUDA) 12 | #include 13 | #else 14 | using cudaStream_t = void*; 15 | #endif 16 | #if defined(HAMR_ENABLE_HIP) 17 | #include 18 | #else 19 | using hipStream_t = void*; 20 | #endif 21 | 22 | namespace hamr 23 | { 24 | 25 | /// A wrapper around technology specific streams. 26 | /** Streams are used to enable and order concurrent operations on accelerator 27 | * devices. The default stream used in hamr is a stream-per-thread where 28 | * available. However, note that libraries built seperately will likely use 29 | * the default blocking stream and if so explicit specification of the stream 30 | * when calling into those libraries is necessary. Note that hamr passes stream 31 | * correctly when interfacing with Python. 
In most cases the hamr API's 32 | * requiring a ::stream can be passed the technology specific stream due to 33 | * implicit conversion operators implemented here. 34 | */ 35 | class HAMR_EXPORT stream 36 | { 37 | public: 38 | /// constructs a default stream 39 | stream() : 40 | #if defined(HAMR_ENABLE_CUDA) 41 | m_stream(std::in_place_index<1>, cudaStreamPerThread) 42 | #elif defined(HAMR_ENABLE_HIP) 43 | m_stream(std::in_place_index<2>, hipStreamPerThread) 44 | #else 45 | m_stream(std::in_place_index<0>, '\0') 46 | #endif 47 | {} 48 | 49 | stream(const stream &) = default; 50 | stream(stream &&) = default; 51 | 52 | stream &operator=(const stream &) = default; 53 | stream &operator=(stream &&) = default; 54 | 55 | #if defined(HAMR_ENABLE_CUDA) 56 | /// convert to a CUDA stream 57 | operator cudaStream_t () const { return this->get_cuda_stream(); } 58 | 59 | /// assign a CUDA stream 60 | stream &operator=(cudaStream_t strm) 61 | { 62 | m_stream = strm; 63 | return *this; 64 | } 65 | 66 | /// Constructs or converts from a CUDA stream 67 | stream(const cudaStream_t &strm) : m_stream(std::in_place_index<1>, strm) {} 68 | 69 | /// Accesses the CUDA stream. 70 | cudaStream_t get_cuda_stream() const 71 | { 72 | const cudaStream_t *cs; 73 | if ((cs = std::get_if<1>(&m_stream))) 74 | return *cs; 75 | return 0; // default stream 76 | } 77 | #endif 78 | #if defined(HAMR_ENABLE_HIP) 79 | /// convert to a HIP stream 80 | operator hipStream_t () const { return this->get_hip_stream(); } 81 | 82 | /// assign a HIP stream 83 | stream &operator=(hipStream_t strm) 84 | { 85 | m_stream = strm; 86 | return *this; 87 | } 88 | 89 | /// Constructs or converts from a HIP stream 90 | stream(hipStream_t &strm) : m_stream(std::in_place_index<2>, strm) {} 91 | 92 | /// Accesses the HIP stream. 
93 | hipStream_t get_hip_stream() const 94 | { 95 | const hipStream_t *hs; 96 | if ((hs = std::get_if<2>(&m_stream))) 97 | return *hs; 98 | return 0; // default stream 99 | } 100 | #endif 101 | 102 | /// synchronize the stream 103 | int synchronize() const; 104 | 105 | /// evaluates true if a stream has been set 106 | operator bool() const; 107 | 108 | /// sends the value of the stream to std::cerr 109 | void print() const; 110 | 111 | /// convert the technology specific stream to an integer 112 | size_t get_stream(); 113 | 114 | private: 115 | std::variant m_stream; 116 | }; 117 | } 118 | #endif 119 | -------------------------------------------------------------------------------- /hamr_stream_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_stream_h 2 | #define hamr_stream_h 3 | 4 | ///@file 5 | 6 | #include "hamr_config.h" 7 | 8 | #include 9 | #include 10 | 11 | #if defined(HAMR_ENABLE_CUDA) 12 | #include 13 | #else 14 | using cudaStream_t = void*; 15 | #endif 16 | #if defined(HAMR_ENABLE_HIP) 17 | #include 18 | #else 19 | using hipStream_t = void*; 20 | #endif 21 | 22 | namespace hamr 23 | { 24 | 25 | /// A wrapper around technology specific streams. 26 | /** Streams are used to enable and order concurrent operations on accelerator 27 | * devices. The default stream used in hamr is a stream-per-thread where 28 | * available. However, note that libraries built seperately will likely use 29 | * the default blocking stream and if so explicit specification of the stream 30 | * when calling into those libraries is necessary. Note that hamr passes stream 31 | * correctly when interfacing with Python. In most cases the hamr API's 32 | * requiring a ::stream can be passed the technology specific stream due to 33 | * implicit conversion operators implemented here. 
34 | */ 35 | class HAMR_EXPORT stream 36 | { 37 | public: 38 | /// constructs a default stream 39 | stream() : 40 | #if defined(HAMR_ENABLE_CUDA) 41 | m_stream(std::in_place_index<1>, cudaStreamPerThread) 42 | #elif defined(HAMR_ENABLE_HIP) 43 | m_stream(std::in_place_index<2>, hipStreamPerThread) 44 | #else 45 | m_stream(std::in_place_index<0>, '\0') 46 | #endif 47 | {} 48 | 49 | stream(const stream &) = default; 50 | stream(stream &&) = default; 51 | 52 | stream &operator=(const stream &) = default; 53 | stream &operator=(stream &&) = default; 54 | 55 | #if defined(HAMR_ENABLE_CUDA) 56 | /// convert to a CUDA stream 57 | operator cudaStream_t () const { return this->get_cuda_stream(); } 58 | 59 | /// assign a CUDA stream 60 | stream &operator=(cudaStream_t strm) 61 | { 62 | m_stream = strm; 63 | return *this; 64 | } 65 | 66 | /// Constructs or converts from a CUDA stream 67 | stream(const cudaStream_t &strm) : m_stream(std::in_place_index<1>, strm) {} 68 | 69 | /// Accesses the CUDA stream. 70 | cudaStream_t get_cuda_stream() const 71 | { 72 | const cudaStream_t *cs; 73 | if ((cs = std::get_if<1>(&m_stream))) 74 | return *cs; 75 | return 0; // default stream 76 | } 77 | #endif 78 | #if defined(HAMR_ENABLE_HIP) 79 | /// convert to a HIP stream 80 | operator hipStream_t () const { return this->get_hip_stream(); } 81 | 82 | /// assign a HIP stream 83 | stream &operator=(hipStream_t strm) 84 | { 85 | m_stream = strm; 86 | return *this; 87 | } 88 | 89 | /// Constructs or converts from a HIP stream 90 | stream(hipStream_t &strm) : m_stream(std::in_place_index<2>, strm) {} 91 | 92 | /// Accesses the HIP stream. 
93 | hipStream_t get_hip_stream() const 94 | { 95 | const hipStream_t *hs; 96 | if ((hs = std::get_if<2>(&m_stream))) 97 | return *hs; 98 | return 0; // default stream 99 | } 100 | #endif 101 | 102 | /// synchronize the stream 103 | int synchronize() const; 104 | 105 | /// evaluates true if a stream has been set 106 | operator bool() const; 107 | 108 | /// sends the value of the stream to std::cerr 109 | void print() const; 110 | 111 | /// convert the technology specific stream to an integer 112 | size_t get_stream(); 113 | 114 | private: 115 | std::variant m_stream; 116 | }; 117 | } 118 | #endif 119 | -------------------------------------------------------------------------------- /hamr_cuda_malloc_uva_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_cuda_malloc_uva_allocator.h" 4 | #include "hamr_cuda_malloc_uva_allocator_impl.h" 5 | 6 | template class hamr::cuda_malloc_uva_deleter; 7 | template class hamr::cuda_malloc_uva_deleter; 8 | template class hamr::cuda_malloc_uva_deleter; 9 | template class hamr::cuda_malloc_uva_deleter; 10 | template class hamr::cuda_malloc_uva_deleter; 11 | template class hamr::cuda_malloc_uva_deleter; 12 | template class hamr::cuda_malloc_uva_deleter; 13 | template class hamr::cuda_malloc_uva_deleter; 14 | template class hamr::cuda_malloc_uva_deleter; 15 | template class hamr::cuda_malloc_uva_deleter; 16 | template class hamr::cuda_malloc_uva_deleter; 17 | template class hamr::cuda_malloc_uva_deleter; 18 | template class hamr::cuda_malloc_uva_deleter; 19 | 20 | #define hamr_cuda_malloc_uva_allocator_instantiate_members(_T) \ 21 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const float *vals, bool cudaVals); \ 22 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const double *vals, bool cudaVals); \ 23 | template 
std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const char *vals, bool cudaVals); \ 24 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const signed char *vals, bool cudaVals); \ 25 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const short *vals, bool cudaVals); \ 26 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const int *vals, bool cudaVals); \ 27 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long *vals, bool cudaVals); \ 28 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long long *vals, bool cudaVals); \ 29 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned char *vals, bool cudaVals); \ 30 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned short *vals, bool cudaVals); \ 31 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned int *vals, bool cudaVals); \ 32 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long *vals, bool cudaVals); \ 33 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long long *vals, bool cudaVals); 34 | 35 | #define hamr_cuda_malloc_uva_allocator_instantiate(_T) \ 36 | template struct hamr::cuda_malloc_uva_allocator<_T>; \ 37 | hamr_cuda_malloc_uva_allocator_instantiate_members(_T) 38 | 39 | hamr_cuda_malloc_uva_allocator_instantiate(float) 40 | hamr_cuda_malloc_uva_allocator_instantiate(double) 41 | hamr_cuda_malloc_uva_allocator_instantiate(char) 
42 | hamr_cuda_malloc_uva_allocator_instantiate(signed char) 43 | hamr_cuda_malloc_uva_allocator_instantiate(short) 44 | hamr_cuda_malloc_uva_allocator_instantiate(int) 45 | hamr_cuda_malloc_uva_allocator_instantiate(long) 46 | hamr_cuda_malloc_uva_allocator_instantiate(long long) 47 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned char) 48 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned short) 49 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned int) 50 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned long) 51 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned long long) 52 | -------------------------------------------------------------------------------- /hamr_cuda_malloc_async_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_cuda_malloc_async_allocator.h" 4 | #include "hamr_cuda_malloc_async_allocator_impl.h" 5 | 6 | template class hamr::cuda_malloc_async_deleter; 7 | template class hamr::cuda_malloc_async_deleter; 8 | template class hamr::cuda_malloc_async_deleter; 9 | template class hamr::cuda_malloc_async_deleter; 10 | template class hamr::cuda_malloc_async_deleter; 11 | template class hamr::cuda_malloc_async_deleter; 12 | template class hamr::cuda_malloc_async_deleter; 13 | template class hamr::cuda_malloc_async_deleter; 14 | template class hamr::cuda_malloc_async_deleter; 15 | template class hamr::cuda_malloc_async_deleter; 16 | template class hamr::cuda_malloc_async_deleter; 17 | template class hamr::cuda_malloc_async_deleter; 18 | template class hamr::cuda_malloc_async_deleter; 19 | 20 | #define hamr_cuda_malloc_async_allocator_instantiate_members(_T) \ 21 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const float *vals, bool cudaVals); \ 22 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const double *vals, bool 
cudaVals); \ 23 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const char *vals, bool cudaVals); \ 24 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const signed char *vals, bool cudaVals); \ 25 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const short *vals, bool cudaVals); \ 26 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const int *vals, bool cudaVals); \ 27 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long *vals, bool cudaVals); \ 28 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long long *vals, bool cudaVals); \ 29 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned char *vals, bool cudaVals); \ 30 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned short *vals, bool cudaVals); \ 31 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned int *vals, bool cudaVals); \ 32 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long *vals, bool cudaVals); \ 33 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long long *vals, bool cudaVals); 34 | 35 | #define hamr_cuda_malloc_async_allocator_instantiate(_T) \ 36 | template struct hamr::cuda_malloc_async_allocator<_T>; \ 37 | hamr_cuda_malloc_async_allocator_instantiate_members(_T) 38 | 39 | hamr_cuda_malloc_async_allocator_instantiate(float) 40 | 
hamr_cuda_malloc_async_allocator_instantiate(double) 41 | hamr_cuda_malloc_async_allocator_instantiate(char) 42 | hamr_cuda_malloc_async_allocator_instantiate(signed char) 43 | hamr_cuda_malloc_async_allocator_instantiate(short) 44 | hamr_cuda_malloc_async_allocator_instantiate(int) 45 | hamr_cuda_malloc_async_allocator_instantiate(long) 46 | hamr_cuda_malloc_async_allocator_instantiate(long long) 47 | hamr_cuda_malloc_async_allocator_instantiate(unsigned char) 48 | hamr_cuda_malloc_async_allocator_instantiate(unsigned short) 49 | hamr_cuda_malloc_async_allocator_instantiate(unsigned int) 50 | hamr_cuda_malloc_async_allocator_instantiate(unsigned long) 51 | hamr_cuda_malloc_async_allocator_instantiate(unsigned long long) 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | ****************************************************************************** 2 | *** Copyright Notice *** 3 | ****************************************************************************** 4 | HAMR - Heterogeneous Accelerator Memory Resource (HAMR) 5 | Copyright (c) 2022, The Regents of the University of California, through 6 | Lawrence Berkeley National Laboratory (subject to receipt of any 7 | required approvals from the U.S. Dept. of Energy). All rights reserved. 8 | 9 | If you have questions about your rights to use or distribute this software, 10 | please contact Berkeley Lab's Intellectual Property Office at 11 | IPO@lbl.gov. 12 | 13 | NOTICE. This Software was developed under funding from the U.S. Department 14 | of Energy and the U.S. Government consequently retains certain rights. As 15 | such, the U.S. 
Government has been granted for itself and others acting on 16 | its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the 17 | Software to reproduce, distribute copies to the public, prepare derivative 18 | works, and perform publicly and display publicly, and to permit others to do so. 19 | 20 | 21 | ****************************************************************************** 22 | *** License Agreement *** 23 | ****************************************************************************** 24 | HAMR - Heterogeneous Accelerator Memory Resource (HAMR) 25 | Copyright (c) 2022, The Regents of the University of California, through 26 | Lawrence Berkeley National Laboratory (subject to receipt of any 27 | required approvals from the U.S. Dept. of Energy). All rights reserved. 28 | 29 | Redistribution and use in source and binary forms, with or without 30 | modification, are permitted provided that the following conditions are met: 31 | 32 | (1) Redistributions of source code must retain the above copyright notice, 33 | this list of conditions and the following disclaimer. 34 | 35 | (2) Redistributions in binary form must reproduce the above copyright 36 | notice, this list of conditions and the following disclaimer in the 37 | documentation and/or other materials provided with the distribution. 38 | 39 | (3) Neither the name of the University of California, Lawrence Berkeley 40 | National Laboratory, U.S. Dept. of Energy nor the names of its contributors 41 | may be used to endorse or promote products derived from this software 42 | without specific prior written permission. 43 | 44 | 45 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 46 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 47 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 48 | ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 49 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 50 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 51 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 52 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 53 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 54 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 55 | POSSIBILITY OF SUCH DAMAGE. 56 | 57 | You are under no obligation whatsoever to provide any bug fixes, patches, 58 | or upgrades to the features, functionality or performance of the source 59 | code ("Enhancements") to anyone; however, if you choose to make your 60 | Enhancements available either publicly, or directly to Lawrence Berkeley 61 | National Laboratory, without imposing a separate written license agreement 62 | for such Enhancements, then you hereby grant the following license: a 63 | non-exclusive, royalty-free perpetual license to install, use, modify, 64 | prepare derivative works, incorporate into other computer software, 65 | distribute, and sublicense such enhancements or derivative works thereof, 66 | in binary and source code form. 
67 | -------------------------------------------------------------------------------- /test/test_hamr_pipeline_cuda_openmp_mp.cpp: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | #include "hamr_buffer.h" 3 | #include "hamr_buffer_util.h" 4 | 5 | #include 6 | 7 | using hamr::buffer; 8 | using allocator = hamr::buffer_allocator; 9 | 10 | // ************************************************************************** 11 | template 12 | hamr::buffer initialize_openmp(size_t n_vals, const T &val) 13 | { 14 | // allocate the memory 15 | hamr::buffer ao(allocator::openmp, n_vals); 16 | T *pao = ao.data(); 17 | 18 | // initialize using openmp 19 | 20 | #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao) map(to: val) 21 | for (size_t i = 0; i < n_vals; ++i) 22 | { 23 | pao[i] = val; 24 | } 25 | 26 | // print the results 27 | std::cerr << "initialized to an array of " << n_vals << " to " << val << std::endl; 28 | 29 | if (n_vals < 33) 30 | { 31 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 32 | ao.print(); 33 | std::cerr << std::endl; 34 | } 35 | 36 | return ao; 37 | } 38 | 39 | 40 | 41 | 42 | // ************************************************************************** 43 | template 44 | hamr::buffer add_openmp(const hamr::buffer &a1, const hamr::buffer &a2) 45 | { 46 | size_t n_vals = a1.size(); 47 | 48 | // get the inputs 49 | auto spa1 = a1.get_openmp_accessible(); 50 | auto pa1 = spa1.get(); 51 | 52 | auto spa2 = a2.get_openmp_accessible(); 53 | auto pa2 = spa2.get(); 54 | 55 | // allocate the memory 56 | hamr::buffer ao(allocator::openmp, n_vals, T(0)); 57 | T *pao = ao.data(); 58 | 59 | // do the calculation 60 | #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pa1, pa2) 61 | for (size_t i = 0; i < n_vals; ++i) 62 | { 63 | pao[i] = pa1[i] + pa2[i]; 64 | } 65 | 66 | // print the results 67 | std::cerr << "added " << n_vals << " array " << typeid(T).name() << sizeof(T) 68 
| << " to array " << typeid(U).name() << sizeof(U) << std::endl; 69 | 70 | if (n_vals < 33) 71 | { 72 | std::cerr << "a1 = "; a1.print(); std::cerr << std::endl; 73 | std::cerr << "a2 = "; a2.print(); std::cerr << std::endl; 74 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 75 | } 76 | 77 | return ao; 78 | } 79 | 80 | 81 | // ************************************************************************** 82 | template 83 | hamr::buffer multiply_scalar_openmp(const hamr::buffer &ai, const U &val) 84 | { 85 | size_t n_vals = ai.size(); 86 | 87 | // get the inputs 88 | auto spai = ai.get_openmp_accessible(); 89 | auto pai = spai.get(); 90 | 91 | // allocate the memory 92 | hamr::buffer ao(allocator::openmp, n_vals, T(0)); 93 | T *pao = ao.data(); 94 | 95 | // do the calculation 96 | #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pai) map(to: val) 97 | for (size_t i = 0; i < n_vals; ++i) 98 | { 99 | pao[i] = val * pai[i]; 100 | } 101 | 102 | // print the results 103 | std::cerr << "multiply_scalar " << val << " " << typeid(U).name() << sizeof(U) 104 | << " by " << n_vals << " array " << typeid(T).name() << sizeof(T) << std::endl; 105 | 106 | if (n_vals < 33) 107 | { 108 | std::cerr << "ai = "; ai.print(); std::cerr << std::endl; 109 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 110 | } 111 | 112 | return ao; 113 | } 114 | 115 | #define instantiate_openmp_kernels_(T,U) \ 116 | template buffer add_openmp(const buffer &a1, const buffer &a2); \ 117 | template buffer multiply_scalar_openmp(const buffer &ai, const U &val); 118 | 119 | #define instantiate_openmp_kernels(T) \ 120 | template buffer initialize_openmp(size_t n_vals, const T &val); \ 121 | instantiate_openmp_kernels_(T, float) \ 122 | instantiate_openmp_kernels_(T, double) 123 | 124 | instantiate_openmp_kernels(double) 125 | instantiate_openmp_kernels(float) 126 | 127 | -------------------------------------------------------------------------------- /hamr_hip_launch.h: 
-------------------------------------------------------------------------------- 1 | #ifndef hamr_hip_launch_h 2 | #define hamr_hip_launch_h 3 | 4 | /// @file 5 | 6 | #include "hamr_config.h" 7 | 8 | #include 9 | 10 | #include 11 | 12 | 13 | /// heterogeneous accelerator memory resource 14 | namespace hamr 15 | { 16 | 17 | /** A flat array is broken into blocks of number of threads where each adjacent 18 | * thread accesses adjacent memory locations. To accomplish this we might need 19 | * a large number of blocks. If the number of blocks exceeds the max block 20 | * dimension in the first and or second block grid dimension then we need to 21 | * use a 2d or 3d block grid. 22 | * 23 | * ::partition_thread_blocks - decides on a partitioning of the data based on 24 | * warps_per_block parameter. The resulting decomposition will be either 1,2, 25 | * or 3D as needed to accommodate the number of fixed sized blocks. It can 26 | * happen that max grid dimensions are hit, in which case you'll need to 27 | * increase the number of warps per block. 28 | * 29 | * ::thread_id_to_array_index - given a thread and block id gets the 30 | * array index to update. _this may be out of bounds so be sure 31 | * to validate before using it. 32 | * 33 | * ::index_is_valid - test an index for validity. 34 | */ 35 | /// @name CUDA indexing scheme 36 | ///@{ 37 | 38 | /// query properties for the named CUDA device. 
retruns non-zero on error 39 | HAMR_EXPORT 40 | int get_launch_props(int device_id, 41 | int *block_grid_max, int &warp_size, 42 | int &max_warps_per_block); 43 | 44 | 45 | /** convert a CUDA index into a flat array index using the partitioning scheme 46 | * defined in partition_thread_blocks 47 | */ 48 | inline 49 | __device__ 50 | unsigned long thread_id_to_array_index() 51 | { 52 | return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x 53 | + blockIdx.z * gridDim.x * gridDim.y); 54 | } 55 | 56 | /// bounds check the flat index 57 | inline 58 | __device__ 59 | int index_is_valid(unsigned long index, unsigned long max_index) 60 | { 61 | return index < max_index; 62 | } 63 | 64 | /** Calculate CUDA launch parameters for an arbitrarily large flat array. 65 | * 66 | * @param[in] device_id the CUDA device to use. Default values for 67 | * warps_per_block and block_grid_max are determined by 68 | * querying the capabilities of the device. If -1 is 69 | * passed then the currently active device is used. 70 | * @param[in] array_size the length of the array being processed 71 | * @param[in] warps_per_block number of warps to use per block (your choice). 72 | * Using a larger number here will result in fewer 73 | * blocks being processed concurrently. 74 | * 75 | * @param[out] block_grid block dimension kernel launch control 76 | * @param[out] n_blocks number of blocks 77 | * @param[out] thread_grid thread dimension kernel launch control 78 | * 79 | * @returns zero if successful and non-zero if an error occurred 80 | */ 81 | HAMR_EXPORT 82 | int partition_thread_blocks(int device_id, size_t array_size, 83 | int warps_per_block, dim3 &block_grid, int &n_blocks, 84 | dim3 &thread_grid); 85 | 86 | /** Calculate CUDA launch parameters for an arbitrarily large flat array. See 87 | * ::get_launch_props for determining the correct values for warp_size and 88 | * block_grid_max. 
89 | * 90 | * @param[in] array_size The length of the array being processed 91 | * @param[in] warp_size The number of threads per warp supported on the device 92 | * @param[in] warps_per_block The number of warps to use per block (your choice) 93 | * @param[in] block_grid_max The maximum number of blocks, in 3-dimensions, 94 | * supported by the device 95 | * @param[out] block_grid The block grid dimension kernel launch control parameter 96 | * @param[out] n_blocks The total number of blocks that will be launched 97 | * @param[out] thread_grid The thread grid dimension kernel launch control parameter 98 | * 99 | * @returns zero if successful and non-zero if an error occurred 100 | */ 101 | HAMR_EXPORT 102 | int partition_thread_blocks(size_t array_size, 103 | int warps_per_block, int warp_size, int *block_grid_max, 104 | dim3 &block_grid, int &n_blocks, dim3 &thread_grid); 105 | } 106 | 107 | ///@} 108 | #endif 109 | -------------------------------------------------------------------------------- /hamr_cuda_launch.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_cuda_launch_h 2 | #define hamr_cuda_launch_h 3 | 4 | /// @file 5 | 6 | #include "hamr_config.h" 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | /// heterogeneous accelerator memory resource 14 | namespace hamr 15 | { 16 | 17 | /** A flat array is broken into blocks of number of threads where each adjacent 18 | * thread accesses adjacent memory locations. To accomplish this we might need 19 | * a large number of blocks. If the number of blocks exceeds the max block 20 | * dimension in the first and or second block grid dimension then we need to 21 | * use a 2d or 3d block grid. 22 | * 23 | * ::partition_thread_blocks - decides on a partitioning of the data based on 24 | * warps_per_block parameter. The resulting decomposition will be either 1,2, 25 | * or 3D as needed to accommodate the number of fixed sized blocks. 
It can 26 | * happen that max grid dimensions are hit, in which case you'll need to 27 | * increase the number of warps per block. 28 | * 29 | * ::thread_id_to_array_index - given a thread and block id gets the 30 | * array index to update. _this may be out of bounds so be sure 31 | * to validate before using it. 32 | * 33 | * ::index_is_valid - test an index for validity. 34 | */ 35 | /// @name CUDA indexing scheme 36 | ///@{ 37 | 38 | /// query properties for the named CUDA device. retruns non-zero on error 39 | HAMR_EXPORT 40 | int get_launch_props(int device_id, 41 | int *block_grid_max, int &warp_size, 42 | int &max_warps_per_block); 43 | 44 | 45 | /** convert a CUDA index into a flat array index using the partitioning scheme 46 | * defined in partition_thread_blocks 47 | */ 48 | inline 49 | __device__ 50 | unsigned long thread_id_to_array_index() 51 | { 52 | return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x 53 | + blockIdx.z * gridDim.x * gridDim.y); 54 | } 55 | 56 | /// bounds check the flat index 57 | inline 58 | __device__ 59 | int index_is_valid(unsigned long index, unsigned long max_index) 60 | { 61 | return index < max_index; 62 | } 63 | 64 | /** Calculate CUDA launch parameters for an arbitrarily large flat array. 65 | * 66 | * @param[in] device_id the CUDA device to use. Default values for 67 | * warps_per_block and block_grid_max are determined by 68 | * querying the capabilities of the device. If -1 is 69 | * passed then the currently active device is used. 70 | * @param[in] array_size the length of the array being processed 71 | * @param[in] warps_per_block number of warps to use per block (your choice). 72 | * Using a larger number here will result in fewer 73 | * blocks being processed concurrently. 
74 | * 75 | * @param[out] block_grid block dimension kernel launch control 76 | * @param[out] n_blocks number of blocks 77 | * @param[out] thread_grid thread dimension kernel launch control 78 | * 79 | * @returns zero if successful and non-zero if an error occurred 80 | */ 81 | HAMR_EXPORT 82 | int partition_thread_blocks(int device_id, size_t array_size, 83 | int warps_per_block, dim3 &block_grid, int &n_blocks, 84 | dim3 &thread_grid); 85 | 86 | /** Calculate CUDA launch parameters for an arbitrarily large flat array. See 87 | * ::get_launch_props for determining the correct values for warp_size and 88 | * block_grid_max. 89 | * 90 | * @param[in] array_size The length of the array being processed 91 | * @param[in] warp_size The number of threads per warp supported on the device 92 | * @param[in] warps_per_block The number of warps to use per block (your choice) 93 | * @param[in] block_grid_max The maximum number of blocks, in 3-dimensions, 94 | * supported by the device 95 | * @param[out] block_grid The block grid dimension kernel launch control parameter 96 | * @param[out] n_blocks The total number of blocks that will be launched 97 | * @param[out] thread_grid The thread grid dimension kernel launch control parameter 98 | * 99 | * @returns zero if successful and non-zero if an error occurred 100 | */ 101 | HAMR_EXPORT 102 | int partition_thread_blocks(size_t array_size, 103 | int warps_per_block, int warp_size, int *block_grid_max, 104 | dim3 &block_grid, int &n_blocks, dim3 &thread_grid); 105 | } 106 | 107 | ///@} 108 | #endif 109 | -------------------------------------------------------------------------------- /hamr_buffer_allocator.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_buffer_allocator_h 2 | #define hamr_buffer_allocator_h 3 | 4 | ///@file 5 | 6 | #include "hamr_config.h" 7 | #include 8 | 9 | namespace hamr 10 | { 11 | 12 | /// allocator types that may be used with hamr::buffer 13 | enum class 
buffer_allocator 14 | { 15 | same = -2, ///< propagate the current allocator 16 | none = -1, ///< no allocator specified 17 | cpp = 0, ///< allocates memory with new 18 | malloc = 1, ///< allocates memory with malloc 19 | cuda = 2, ///< allocates memory with cudaMalloc 20 | cuda_async = 3,///< allocates memory with cudaMallocAsync 21 | cuda_uva = 4, ///< allocates memory with cudaMallocManaged 22 | cuda_host = 5, ///< allocates memory with cudaMallocHost 23 | hip = 6, ///< allocates memory with hipMalloc 24 | hip_uva = 7, ///< allocates memory with hipMallocManaged 25 | openmp = 8 ///< allocates memory with OpenMP device offload API 26 | }; 27 | 28 | /// return the human readable name of the allocator 29 | HAMR_EXPORT 30 | const char *get_allocator_name(buffer_allocator alloc); 31 | 32 | /// @returns true if the allocator creates host accessible memory 33 | inline 34 | HAMR_EXPORT 35 | int host_accessible(buffer_allocator alloc) 36 | { 37 | return (alloc == buffer_allocator::cpp) || 38 | (alloc == buffer_allocator::malloc) || 39 | (alloc == buffer_allocator::cuda_uva) || 40 | (alloc == buffer_allocator::cuda_host) || 41 | (alloc == buffer_allocator::hip_uva); 42 | } 43 | 44 | /// @returns true if the allocator creates CUDA accessible memory 45 | inline 46 | HAMR_EXPORT 47 | int cuda_accessible(buffer_allocator alloc) 48 | { 49 | return (alloc == buffer_allocator::cuda) || 50 | (alloc == buffer_allocator::cuda_async) || 51 | (alloc == buffer_allocator::cuda_uva) || 52 | (alloc == buffer_allocator::hip) || 53 | (alloc == buffer_allocator::hip_uva) || 54 | (alloc == buffer_allocator::openmp); 55 | } 56 | 57 | /// @returns true if the allocator creates HIP accessible memory 58 | inline 59 | HAMR_EXPORT 60 | int hip_accessible(buffer_allocator alloc) 61 | { 62 | return (alloc == buffer_allocator::cuda) || 63 | (alloc == buffer_allocator::cuda_async) || 64 | (alloc == buffer_allocator::cuda_uva) || 65 | (alloc == buffer_allocator::hip) || 66 | (alloc == 
buffer_allocator::hip_uva); 67 | } 68 | 69 | /// @returns true if the allocator creates OPENMP accessible memory 70 | inline 71 | HAMR_EXPORT 72 | int openmp_accessible(buffer_allocator alloc) 73 | { 74 | return (alloc == buffer_allocator::cuda) || 75 | (alloc == buffer_allocator::cuda_async) || 76 | (alloc == buffer_allocator::cuda_uva) || 77 | (alloc == buffer_allocator::openmp); 78 | } 79 | 80 | /// asserts that the passed value is one of the known allocators 81 | inline 82 | HAMR_EXPORT 83 | void assert_valid_allocator(buffer_allocator alloc) 84 | { 85 | (void) alloc; 86 | assert((alloc == buffer_allocator::cpp) 87 | || (alloc == buffer_allocator::malloc) 88 | #if defined(HAMR_ENABLE_CUDA) 89 | || (alloc == buffer_allocator::cuda) 90 | || (alloc == buffer_allocator::cuda_async) 91 | || (alloc == buffer_allocator::cuda_uva) 92 | || (alloc == buffer_allocator::cuda_host) 93 | #endif 94 | #if defined(HAMR_ENABLE_HIP) 95 | || (alloc == buffer_allocator::hip) 96 | || (alloc == buffer_allocator::hip_uva) 97 | #endif 98 | #if defined(HAMR_ENABLE_OPENMP) 99 | || (alloc == buffer_allocator::openmp) 100 | #endif 101 | ); 102 | } 103 | 104 | /// get the allocator type most suitable for the current build configuration. 105 | inline HAMR_EXPORT buffer_allocator get_device_allocator() 106 | { 107 | #if defined(HAMR_ENABLE_CUDA) 108 | return buffer_allocator::cuda_async; 109 | #elif defined(HAMR_ENABLE_HIP) 110 | return buffer_allocator::hip; 111 | #elif defined(HAMR_ENABLE_OPENMP) 112 | return buffer_allocator::openmp; 113 | #else 114 | return buffer_allocator::malloc; 115 | #endif 116 | } 117 | 118 | /// get the allocator type most suitable for the current build configuration. 
119 | inline HAMR_EXPORT buffer_allocator get_host_allocator() 120 | { 121 | #if defined(HAMR_ENABLE_CUDA) 122 | return buffer_allocator::cuda_host; 123 | #elif defined(HAMR_ENABLE_HIP) 124 | return buffer_allocator::malloc; 125 | #elif defined(HAMR_ENABLE_OPENMP) 126 | return buffer_allocator::malloc; 127 | #else 128 | return buffer_allocator::malloc; 129 | #endif 130 | } 131 | 132 | } 133 | 134 | #endif 135 | -------------------------------------------------------------------------------- /hamr_malloc_allocator.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_malloc_allocator_h 2 | #define hamr_malloc_allocator_h 3 | 4 | #include "hamr_config.h" 5 | 6 | #include 7 | #include 8 | 9 | namespace hamr 10 | { 11 | /// a deleter for arrays allocated with malloc 12 | template 13 | class malloc_deleter {}; 14 | 15 | /// a deleter for arrays allocated with malloc, specialized for objects 16 | template 17 | class HAMR_EXPORT malloc_deleter::value>::type> 18 | { 19 | public: 20 | /** constructs the deleter 21 | * @param[in] ptr the pointer to the array to delete 22 | * @param[in] n the number of elements in the array 23 | */ 24 | malloc_deleter(T *ptr, size_t n); 25 | 26 | /** deletes the array 27 | * @param[in] ptr the pointer to the array to delete. must be the same as 28 | * that passed during construction. 29 | */ 30 | void operator()(T *ptr); 31 | 32 | private: 33 | T *m_ptr; 34 | size_t m_elem; 35 | }; 36 | 37 | 38 | 39 | 40 | 41 | 42 | /// a deleter for arrays allocated with malloc, specialized for numbers 43 | template 44 | class HAMR_EXPORT malloc_deleter::value>::type> 45 | { 46 | public: 47 | /** constructs the deleter 48 | * @param[in] ptr the pointer to the array to delete 49 | * @param[in] n the number of elements in the array 50 | */ 51 | malloc_deleter(T *ptr, size_t n); 52 | 53 | /** deletes the array 54 | * @param[in] ptr the pointer to the array to delete. 
must be the same as 55 | * that passed during construction. 56 | */ 57 | void operator()(T *ptr); 58 | 59 | private: 60 | T *m_ptr; 61 | size_t m_elem; 62 | }; 63 | 64 | 65 | 66 | 67 | 68 | 69 | /// a class for allocating arrays with malloc 70 | template 71 | struct malloc_allocator {}; 72 | 73 | /// a class for allocating arrays with malloc, specialized for objects 74 | template 75 | struct HAMR_EXPORT malloc_allocator::value>::type> 76 | { 77 | /** allocate an array of n elements. 78 | * @param[in] n the number of elements to allocate 79 | * @returns a shared pointer to the array that holds a deleter for the memory 80 | */ 81 | static std::shared_ptr allocate(size_t n) HAMR_EXPORT; 82 | 83 | /** allocate an array of n elements. 84 | * @param[in] n the number of elements to allocate 85 | * @param[in] val a value to initialize the elements to 86 | * @returns a shared pointer to the array that holds a deleter for the memory 87 | */ 88 | 89 | static std::shared_ptr allocate(size_t n, const T &val) HAMR_EXPORT; 90 | 91 | /** allocate an array of n elements. 92 | * @param[in] n the number of elements to allocate 93 | * @param[in] vals an array of n elements to initialize the elements with 94 | * @returns a shared pointer to the array that holds a deleter for the memory 95 | */ 96 | template 97 | static std::shared_ptr allocate(size_t n, const U *vals) HAMR_EXPORT; 98 | }; 99 | 100 | 101 | 102 | 103 | 104 | /// a class for allocating arrays with malloc, specialized for numbers 105 | template 106 | struct HAMR_EXPORT malloc_allocator::value>::type> 107 | { 108 | /** allocate an array of n elements. 109 | * @param[in] n the number of elements to allocate 110 | * @returns a shared pointer to the array that holds a deleter for the memory 111 | */ 112 | static std::shared_ptr allocate(size_t n) HAMR_EXPORT; 113 | 114 | /** allocate an array of n elements. 
115 | * @param[in] n the number of elements to allocate 116 | * @param[in] val a value to initialize the elements to 117 | * @returns a shared pointer to the array that holds a deleter for the memory 118 | */ 119 | static std::shared_ptr allocate(size_t n, const T &val) HAMR_EXPORT; 120 | 121 | /** allocate an array of n elements. 122 | * @param[in] n the number of elements to allocate 123 | * @param[in] vals an array of n elements to initialize the elements with 124 | * @returns a shared pointer to the array that holds a deleter for the memory 125 | */ 126 | template 127 | static std::shared_ptr allocate(size_t n, const U *vals) HAMR_EXPORT; 128 | }; 129 | 130 | } 131 | 132 | #if !defined(HAMR_SEPARATE_IMPL) 133 | #include "hamr_malloc_allocator_impl.h" 134 | #endif 135 | 136 | #endif 137 | -------------------------------------------------------------------------------- /hamr_openmp_allocator.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_openmp_allocator_h 2 | #define hamr_openmp_allocator_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | #include 7 | 8 | namespace hamr 9 | { 10 | /// a deleter for arrays allocated with OpenMP 11 | template 12 | class openmp_deleter {}; 13 | 14 | /// a deleter for arrays allocated with OpenMP, specialized for objects 15 | template 16 | class HAMR_EXPORT openmp_deleter::value>::type> 17 | { 18 | public: 19 | /** constructs the deleter 20 | * @param[in] ptr the pointer to the array to delete 21 | * @param[in] n the number of elements in the array 22 | */ 23 | openmp_deleter(T *ptr, size_t n, int dev); 24 | 25 | /** deletes the array 26 | * @param[in] ptr the pointer to the array to delete. must be the same as 27 | * that passed during construction. 
28 | */ 29 | void operator()(T *ptr); 30 | 31 | private: 32 | T *m_ptr; 33 | size_t m_elem; 34 | int m_dev; 35 | }; 36 | 37 | 38 | 39 | 40 | 41 | 42 | /// a deleter for arrays allocated with OpenMP, specialized for numbers 43 | template 44 | class HAMR_EXPORT openmp_deleter::value>::type> 45 | { 46 | public: 47 | /** constructs the deleter 48 | * @param[in] ptr the pointer to the array to delete 49 | * @param[in] n the number of elements in the array 50 | */ 51 | openmp_deleter(T *ptr, size_t n, int dev); 52 | 53 | /** deletes the array 54 | * @param[in] ptr the pointer to the array to delete. must be the same as 55 | * that passed during construction. 56 | */ 57 | void operator()(T *ptr); 58 | 59 | private: 60 | T *m_ptr; 61 | size_t m_elem; 62 | int m_dev; 63 | }; 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | /// a class for allocating arrays with OpenMP 72 | template 73 | struct openmp_allocator {}; 74 | 75 | /// a class for allocating arrays with OpenMP, specialized for objects 76 | template 77 | struct HAMR_EXPORT openmp_allocator::value>::type> 78 | { 79 | /** allocate an array of n elements. 80 | * @param[in] n the number of elements to allocate 81 | * @returns a shared pointer to the array that holds a deleter for the memory 82 | */ 83 | static std::shared_ptr allocate(size_t n) HAMR_EXPORT; 84 | 85 | /** allocate an array of n elements. 86 | * @param[in] n the number of elements to allocate 87 | * @param[in] val a value to initialize the elements to 88 | * @returns a shared pointer to the array that holds a deleter for the memory 89 | */ 90 | 91 | static std::shared_ptr allocate(size_t n, const T &val) HAMR_EXPORT; 92 | 93 | /** allocate an array of n elements. 
94 | * @param[in] n the number of elements to allocate 95 | * @param[in] vals an array of n elements to initialize the elements with 96 | * @returns a shared pointer to the array that holds a deleter for the memory 97 | */ 98 | template 99 | static std::shared_ptr allocate(size_t n, const U *vals) HAMR_EXPORT; 100 | }; 101 | 102 | 103 | 104 | 105 | 106 | 107 | /// a class for allocating arrays with OpenMP, specialized for numbers 108 | template 109 | struct HAMR_EXPORT openmp_allocator::value>::type> 110 | { 111 | /** allocate an array of n elements. 112 | * @param[in] n the number of elements to allocate 113 | * @returns a shared pointer to the array that holds a deleter for the memory 114 | */ 115 | static std::shared_ptr allocate(size_t n) HAMR_EXPORT; 116 | 117 | /** allocate an array of n elements. 118 | * @param[in] n the number of elements to allocate 119 | * @param[in] val a value to initialize the elements to 120 | * @returns a shared pointer to the array that holds a deleter for the memory 121 | */ 122 | static std::shared_ptr allocate(size_t n, const T &val) HAMR_EXPORT; 123 | 124 | /** allocate an array of n elements. 
125 | * @param[in] n the number of elements to allocate 126 | * @param[in] vals an array of n elements to initialize the elements with 127 | * @returns a shared pointer to the array that holds a deleter for the memory 128 | */ 129 | template 130 | static std::shared_ptr allocate(size_t n, const U *vals) HAMR_EXPORT; 131 | }; 132 | 133 | } 134 | 135 | #if !defined(HAMR_SEPARATE_IMPL) 136 | #include "hamr_openmp_allocator_impl.h" 137 | #endif 138 | 139 | #endif 140 | -------------------------------------------------------------------------------- /hamr_hip_malloc_allocator.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_hip_malloc_allocator_h 2 | #define hamr_hip_malloc_allocator_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | #include 7 | 8 | namespace hamr 9 | { 10 | 11 | /// a deleter for arrays allocated with hip_malloc 12 | template 13 | class hip_malloc_deleter {}; 14 | 15 | /// a deleter for arrays allocated with hip_malloc, specialized for objects 16 | template 17 | class HAMR_EXPORT hip_malloc_deleter::value>::type> 18 | { 19 | public: 20 | /** constructs the deleter 21 | * @param[in] ptr the pointer to the array to delete 22 | * @param[in] n the number of elements in the array 23 | */ 24 | hip_malloc_deleter(T *ptr, size_t n); 25 | 26 | /** deletes the array 27 | * @param[in] ptr the pointer to the array to delete. must be the same as 28 | * that passed during construction. 
29 | */ 30 | void operator()(T *ptr); 31 | 32 | private: 33 | T *m_ptr; 34 | size_t m_elem; 35 | }; 36 | 37 | 38 | 39 | 40 | 41 | 42 | /// a deleter for arrays allocated with hip_malloc, specialized for numbers 43 | template 44 | class HAMR_EXPORT hip_malloc_deleter::value>::type> 45 | { 46 | public: 47 | /** constructs the deleter 48 | * @param[in] ptr the pointer to the array to delete 49 | * @param[in] n the number of elements in the array 50 | */ 51 | hip_malloc_deleter(T *ptr, size_t n); 52 | 53 | /** deletes the array 54 | * @param[in] ptr the pointer to the array to delete. must be the same as 55 | * that passed during construction. 56 | */ 57 | void operator()(T *ptr); 58 | 59 | private: 60 | T *m_ptr; 61 | size_t m_elem; 62 | }; 63 | 64 | 65 | 66 | 67 | 68 | 69 | /// a class for allocating arrays with hip_malloc 70 | template 71 | struct hip_malloc_allocator {}; 72 | 73 | /// a class for allocating arrays with hip_malloc, specialized for objects 74 | template 75 | struct HAMR_EXPORT hip_malloc_allocator::value>::type> 76 | { 77 | /** allocate an array of n elements. 78 | * @param[in] n the number of elements to allocate 79 | * @returns a shared pointer to the array that holds a deleter for the 80 | * memory 81 | */ 82 | static std::shared_ptr allocate(size_t n); 83 | 84 | /** allocate an array of n elements. 85 | * @param[in] n the number of elements to allocate 86 | * @param[in] val a value to initialize the elements to 87 | * @returns a shared pointer to the array that holds a deleter for the 88 | * memory 89 | */ 90 | static std::shared_ptr allocate(size_t n, const T &val); 91 | 92 | /** allocate an array of n elements. 
93 | * @param[in] n the number of elements to allocate 94 | * @param[in] vals an array of values to initialize the elements with 95 | * @param[in] hipVals a flag set to true if vals are accessible by codes 96 | * running in HIP 97 | * @returns a shared pointer to the array that holds a deleter for the 98 | * memory 99 | */ 100 | template 101 | static std::shared_ptr allocate(size_t n, const U *vals, bool hipVals = false); 102 | }; 103 | 104 | 105 | 106 | 107 | 108 | /// a class for allocating arrays with hip_malloc, specialized for numbers 109 | template 110 | struct HAMR_EXPORT hip_malloc_allocator::value>::type> 111 | { 112 | /** allocate an array of n elements. 113 | * @param[in] n the number of elements to allocate 114 | * @returns a shared pointer to the array that holds a deleter for the 115 | * memory 116 | */ 117 | static std::shared_ptr allocate(size_t n); 118 | 119 | /** allocate an array of n elements. 120 | * @param[in] n the number of elements to allocate 121 | * @param[in] val a value to initialize the elements to 122 | * @returns a shared pointer to the array that holds a deleter for the 123 | * memory 124 | */ 125 | static std::shared_ptr allocate(size_t n, const T &val); 126 | 127 | /** allocate an array of n elements. 
128 | * @param[in] n the number of elements to allocate 129 | * @param[in] vals an array of values to initialize the elements with 130 | * @param[in] hipVals a flag set to true if vals are accessible by codes 131 | * running in HIP 132 | * @returns a shared pointer to the array that holds a 133 | * deleter for the memory 134 | */ 135 | template 136 | static std::shared_ptr allocate(size_t n, const U *vals, bool hipVals = false); 137 | }; 138 | 139 | } 140 | 141 | #if !defined(HAMR_SEPARATE_IMPL) 142 | #include "hamr_hip_malloc_allocator_impl.h" 143 | #endif 144 | 145 | #endif 146 | -------------------------------------------------------------------------------- /hamr_cuda_malloc_allocator.cxx: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | 3 | #include "hamr_cuda_malloc_allocator.h" 4 | #include "hamr_cuda_malloc_allocator_impl.h" 5 | 6 | template class hamr::cuda_malloc_deleter; 7 | template class hamr::cuda_malloc_deleter; 8 | template class hamr::cuda_malloc_deleter; 9 | template class hamr::cuda_malloc_deleter; 10 | template class hamr::cuda_malloc_deleter; 11 | template class hamr::cuda_malloc_deleter; 12 | template class hamr::cuda_malloc_deleter; 13 | template class hamr::cuda_malloc_deleter; 14 | template class hamr::cuda_malloc_deleter; 15 | template class hamr::cuda_malloc_deleter; 16 | template class hamr::cuda_malloc_deleter; 17 | template class hamr::cuda_malloc_deleter; 18 | template class hamr::cuda_malloc_deleter; 19 | 20 | #define hamr_cuda_malloc_allocator_instantiate_members(_T) \ 21 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const float *vals, bool cudaVals); \ 22 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const double *vals, bool cudaVals); \ 23 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const char *vals, bool cudaVals); \ 24 | template std::shared_ptr<_T> 
hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const signed char *vals, bool cudaVals); \ 25 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const short *vals, bool cudaVals); \ 26 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const int *vals, bool cudaVals); \ 27 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const long *vals, bool cudaVals); \ 28 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const long long *vals, bool cudaVals); \ 29 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned char *vals, bool cudaVals); \ 30 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned short *vals, bool cudaVals); \ 31 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned int *vals, bool cudaVals); \ 32 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned long *vals, bool cudaVals); \ 33 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned long long *vals, bool cudaVals); \ 34 | \ 35 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const float *vals, bool cudaVals); \ 36 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const double *vals, bool cudaVals); \ 37 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const char *vals, bool cudaVals); \ 38 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const signed char *vals, bool cudaVals); \ 39 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const short *vals, bool cudaVals); \ 40 | template 
std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const int *vals, bool cudaVals); \ 41 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long *vals, bool cudaVals); \ 42 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long long *vals, bool cudaVals); \ 43 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned char *vals, bool cudaVals); \ 44 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned short *vals, bool cudaVals); \ 45 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned int *vals, bool cudaVals); \ 46 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long *vals, bool cudaVals); \ 47 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long long *vals, bool cudaVals); 48 | 49 | #define hamr_cuda_malloc_allocator_instantiate(_T) \ 50 | template struct hamr::cuda_malloc_allocator<_T>; \ 51 | hamr_cuda_malloc_allocator_instantiate_members(_T) 52 | 53 | hamr_cuda_malloc_allocator_instantiate(float) 54 | hamr_cuda_malloc_allocator_instantiate(double) 55 | hamr_cuda_malloc_allocator_instantiate(char) 56 | hamr_cuda_malloc_allocator_instantiate(signed char) 57 | hamr_cuda_malloc_allocator_instantiate(short) 58 | hamr_cuda_malloc_allocator_instantiate(int) 59 | hamr_cuda_malloc_allocator_instantiate(long) 60 | hamr_cuda_malloc_allocator_instantiate(long long) 61 | hamr_cuda_malloc_allocator_instantiate(unsigned char) 62 | hamr_cuda_malloc_allocator_instantiate(unsigned short) 63 | hamr_cuda_malloc_allocator_instantiate(unsigned int) 64 | hamr_cuda_malloc_allocator_instantiate(unsigned long) 65 
| hamr_cuda_malloc_allocator_instantiate(unsigned long long) 66 | -------------------------------------------------------------------------------- /hamr_cuda_malloc_host_allocator.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_cuda_malloc_host_allocator_h 2 | #define hamr_cuda_malloc_host_allocator_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | #include 7 | 8 | namespace hamr 9 | { 10 | /// a deleter for arrays allocated with cudaMallocHost 11 | template 12 | class cuda_malloc_host_deleter {}; 13 | 14 | /// a deleter for arrays allocated with cudaMallocHost, specialized for objects 15 | template 16 | class HAMR_EXPORT cuda_malloc_host_deleter::value>::type> 17 | { 18 | public: 19 | /** constructs the deleter 20 | * @param[in] ptr the pointer to the array to delete 21 | * @param[in] n the number of elements in the array 22 | */ 23 | cuda_malloc_host_deleter(T *ptr, size_t n); 24 | 25 | /** deletes the array 26 | * @param[in] ptr the pointer to the array to delete. must be the same as 27 | * that passed during construction. 28 | */ 29 | void operator()(T *ptr); 30 | 31 | private: 32 | T *m_ptr; 33 | size_t m_elem; 34 | }; 35 | 36 | 37 | 38 | 39 | 40 | 41 | /// a deleter for arrays allocated with cudaMallocHost, specialized for numbers 42 | template 43 | class HAMR_EXPORT cuda_malloc_host_deleter::value>::type> 44 | { 45 | public: 46 | /** constructs the deleter 47 | * @param[in] ptr the pointer to the array to delete 48 | * @param[in] n the number of elements in the array 49 | */ 50 | cuda_malloc_host_deleter(T *ptr, size_t n); 51 | 52 | /** deletes the array 53 | * @param[in] ptr the pointer to the array to delete. must be the same as 54 | * that passed during construction. 55 | */ 56 | void operator()(T *ptr); 57 | 58 | private: 59 | T *m_ptr; 60 | size_t m_elem; 61 | }; 62 | 63 | 64 | 65 | 66 | 67 | 68 | /** A class for allocating arrays with cudaMallocHost. 
Use this allocator for 69 | * host accessible memory when you want to overlap data movement and computation 70 | * with CUDA. 71 | */ 72 | template 73 | struct cuda_malloc_host_allocator {}; 74 | 75 | /** a class for allocating arrays with cudaMallocHost, specialized for objects 76 | * Use this allocator for host accessible memory when you want to overlap data movement and computation 77 | * with CUDA 78 | */ 79 | template 80 | struct HAMR_EXPORT cuda_malloc_host_allocator::value>::type> 81 | { 82 | /** allocate an array of n elements. 83 | * @param[in] n the number of elements to allocate 84 | * @returns a shared pointer to the array that holds a deleter for the memory 85 | */ 86 | static std::shared_ptr allocate(size_t n) HAMR_EXPORT; 87 | 88 | /** allocate an array of n elements. 89 | * @param[in] n the number of elements to allocate 90 | * @param[in] val a value to initialize the elements to 91 | * @returns a shared pointer to the array that holds a deleter for the memory 92 | */ 93 | static std::shared_ptr allocate(size_t n, const T &val) HAMR_EXPORT; 94 | 95 | /** allocate an array of n elements. 96 | * @param[in] n the number of elements to allocate 97 | * @param[in] vals an array of n elements to initialize the elements with 98 | * @returns a shared pointer to the array that holds a deleter for the memory 99 | */ 100 | template 101 | static std::shared_ptr allocate(size_t n, const U *vals) HAMR_EXPORT; 102 | }; 103 | 104 | 105 | 106 | 107 | 108 | /** a class for allocating arrays with cudaMallocHost, specialized for numbers. 109 | * Use this allocator for host accessible memory when you want to overlap data 110 | * movement and computation with CUDA 111 | */ 112 | template 113 | struct HAMR_EXPORT cuda_malloc_host_allocator::value>::type> 114 | { 115 | /** allocate an array of n elements. 
116 | * @param[in] n the number of elements to allocate 117 | * @returns a shared pointer to the array that holds a deleter for the memory 118 | */ 119 | static std::shared_ptr allocate(size_t n) HAMR_EXPORT; 120 | 121 | /** allocate an array of n elements. 122 | * @param[in] n the number of elements to allocate 123 | * @param[in] val a value to initialize the elements to 124 | * @returns a shared pointer to the array that holds a deleter for the memory 125 | */ 126 | static std::shared_ptr allocate(size_t n, const T &val) HAMR_EXPORT; 127 | 128 | /** allocate an array of n elements. 129 | * @param[in] n the number of elements to allocate 130 | * @param[in] vals an array of n elements to initialize the elements with 131 | * @returns a shared pointer to the array that holds a deleter for the memory 132 | */ 133 | template 134 | static std::shared_ptr allocate(size_t n, const U *vals) HAMR_EXPORT; 135 | }; 136 | 137 | } 138 | 139 | #if !defined(HAMR_SEPARATE_IMPL) 140 | #include "hamr_cuda_malloc_host_allocator_impl.h" 141 | #endif 142 | 143 | #endif 144 | -------------------------------------------------------------------------------- /hamr_hip_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_hip_kernels_h 2 | #define hamr_hip_kernels_h 3 | 4 | #include "hamr_config.h" 5 | #include "hamr_env.h" 6 | #include "hamr_hip_launch.h" 7 | 8 | namespace hamr 9 | { 10 | 11 | namespace hip_kernels 12 | { 13 | 14 | /// helpers to get the printf code given a POD type 15 | template struct printf_tt {}; 16 | 17 | #define declare_printf_tt(cpp_t, print_t, code, len)\ 18 | /** printf code wrapper for cpp_t */ \ 19 | template <> struct printf_tt \ 20 | { \ 21 | /** cast from cpp_t to print_t */ \ 22 | __device__ \ 23 | static print_t get_value(cpp_t v) \ 24 | { return v; } \ 25 | \ 26 | /** returns the printf code for cpp_t */ \ 27 | __device__ \ 28 | static const char *get_code() \ 29 | { return code; } \ 30 | \ 31 | /** 
copies the printf code */ \ 32 | __device__ \ 33 | static void copy_code(char *dest) \ 34 | { \ 35 | for (int i = 0; i < len; ++i) \ 36 | dest[i] = get_code()[i]; \ 37 | } \ 38 | \ 39 | /** returns the length of the printf code */ \ 40 | __device__ \ 41 | static int get_code_len() \ 42 | { return len; } \ 43 | }; 44 | 45 | declare_printf_tt(char, int, "%d", 2) 46 | declare_printf_tt(signed char, int, "%d", 2) 47 | declare_printf_tt(unsigned char, unsigned int, "%u", 2) 48 | declare_printf_tt(short, short, "%hd", 3) 49 | declare_printf_tt(unsigned short, unsigned short, "%hu", 3) 50 | declare_printf_tt(int, int, "%d", 2) 51 | declare_printf_tt(unsigned int, unsigned int, "%u", 2) 52 | declare_printf_tt(long, long, "%ld", 3) 53 | declare_printf_tt(unsigned long, unsigned long, "%lu", 3) 54 | declare_printf_tt(long long, long long, "%lld", 4) 55 | declare_printf_tt(unsigned long long, unsigned long long, "%llu", 4) 56 | declare_printf_tt(float, float, "%g", 2) 57 | declare_printf_tt(double, double, "%g", 2) 58 | 59 | 60 | /// send an array to the stderr stream on the GPU using HIP 61 | template 62 | __global__ 63 | void print(const T *vals, size_t n_elem) 64 | { 65 | unsigned long i = hamr::thread_id_to_array_index(); 66 | 67 | if (i >= n_elem) 68 | return; 69 | 70 | int cl = printf_tt::get_code_len(); 71 | char fmt[] = "vals[%lu] = XXXXXXXXX"; // <-- 20 72 | printf_tt::copy_code(fmt + 12); 73 | fmt[12 + cl] = '\n'; 74 | fmt[13 + cl] = '\0'; 75 | 76 | printf(fmt, i, printf_tt::get_value(vals[i])); 77 | } 78 | 79 | /// copy an array on the GPU using HIP 80 | template 81 | __global__ 82 | void copy(T *dest, const U *src, size_t n_elem) 83 | { 84 | unsigned long i = hamr::thread_id_to_array_index(); 85 | 86 | if (i >= n_elem) 87 | return; 88 | 89 | dest[i] = static_cast(src[i]); 90 | } 91 | 92 | /// default construct on the GPU 93 | template 94 | __global__ 95 | void construct(T *dest, size_t n_elem) 96 | { 97 | unsigned long i = hamr::thread_id_to_array_index(); 98 | 99 
| if (i >= n_elem) 100 | return; 101 | 102 | new (&dest[i]) T(); 103 | } 104 | 105 | /// copy construct on the GPU 106 | template 107 | __global__ 108 | void construct(T *dest, size_t n_elem, U val) 109 | { 110 | unsigned long i = hamr::thread_id_to_array_index(); 111 | 112 | if (i >= n_elem) 113 | return; 114 | 115 | new (&dest[i]) T(val); 116 | } 117 | 118 | /// copy construct on the GPU 119 | template 120 | __global__ 121 | void construct(T *dest, size_t n_elem, const U *vals) 122 | { 123 | unsigned long i = hamr::thread_id_to_array_index(); 124 | 125 | if (i >= n_elem) 126 | return; 127 | 128 | new (&dest[i]) T(vals[i]); 129 | } 130 | 131 | /// destruct on the GPU 132 | template 133 | __global__ 134 | void destruct(T *dest, size_t n_elem) 135 | { 136 | unsigned long i = hamr::thread_id_to_array_index(); 137 | 138 | if (i >= n_elem) 139 | return; 140 | 141 | dest[i].~T(); 142 | } 143 | 144 | /// initialize an array on the GPU 145 | template 146 | __global__ 147 | void fill(T *dest, size_t n_elem, U val) 148 | { 149 | unsigned long i = hamr::thread_id_to_array_index(); 150 | 151 | if (i >= n_elem) 152 | return; 153 | 154 | dest[i] = val; 155 | } 156 | 157 | /// initialize an array on the GPU 158 | template 159 | __global__ 160 | void fill(T *dest, size_t n_elem, const U *vals) 161 | { 162 | unsigned long i = hamr::thread_id_to_array_index(); 163 | 164 | if (i >= n_elem) 165 | return; 166 | 167 | dest[i] = vals[i]; 168 | } 169 | 170 | } 171 | 172 | } 173 | 174 | #endif 175 | -------------------------------------------------------------------------------- /hamr_cuda_kernels.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_cuda_kernels_h 2 | #define hamr_cuda_kernels_h 3 | 4 | #include "hamr_config.h" 5 | #include "hamr_env.h" 6 | #include "hamr_cuda_launch.h" 7 | 8 | namespace hamr 9 | { 10 | 11 | namespace cuda_kernels 12 | { 13 | 14 | /// helpers to get the printf code given a POD type 15 | template struct 
printf_tt {}; 16 | 17 | #define declare_printf_tt(cpp_t, print_t, code, len)\ 18 | /** printf code wrapper for cpp_t */ \ 19 | template <> struct printf_tt \ 20 | { \ 21 | /** cast from cpp_t to print_t */ \ 22 | __device__ \ 23 | static print_t get_value(cpp_t v) \ 24 | { return v; } \ 25 | \ 26 | /** returns the printf code for cpp_t */ \ 27 | __device__ \ 28 | static const char *get_code() \ 29 | { return code; } \ 30 | \ 31 | /** copies the printf code */ \ 32 | __device__ \ 33 | static void copy_code(char *dest) \ 34 | { \ 35 | for (int i = 0; i < len; ++i) \ 36 | dest[i] = get_code()[i]; \ 37 | } \ 38 | \ 39 | /** returns the length of the printf code */ \ 40 | __device__ \ 41 | static int get_code_len() \ 42 | { return len; } \ 43 | }; 44 | 45 | declare_printf_tt(char, int, "%d", 2) 46 | declare_printf_tt(signed char, int, "%d", 2) 47 | declare_printf_tt(unsigned char, unsigned int, "%u", 2) 48 | declare_printf_tt(short, short, "%hd", 3) 49 | declare_printf_tt(unsigned short, unsigned short, "%hu", 3) 50 | declare_printf_tt(int, int, "%d", 2) 51 | declare_printf_tt(unsigned int, unsigned int, "%u", 2) 52 | declare_printf_tt(long, long, "%ld", 3) 53 | declare_printf_tt(unsigned long, unsigned long, "%lu", 3) 54 | declare_printf_tt(long long, long long, "%lld", 4) 55 | declare_printf_tt(unsigned long long, unsigned long long, "%llu", 4) 56 | declare_printf_tt(float, float, "%g", 2) 57 | declare_printf_tt(double, double, "%g", 2) 58 | 59 | 60 | /// send an array to the stderr stream on the GPU using CUDA 61 | template 62 | __global__ 63 | void print(const T *vals, size_t n_elem) 64 | { 65 | unsigned long i = hamr::thread_id_to_array_index(); 66 | 67 | if (i >= n_elem) 68 | return; 69 | 70 | int cl = printf_tt::get_code_len(); 71 | char fmt[] = "vals[%lu] = XXXXXXXXX"; // <-- 20 72 | printf_tt::copy_code(fmt + 12); 73 | fmt[12 + cl] = '\n'; 74 | fmt[13 + cl] = '\0'; 75 | 76 | printf(fmt, i, printf_tt::get_value(vals[i])); 77 | } 78 | 79 | /// copy an array on 
the GPU using CUDA 80 | template 81 | __global__ 82 | void copy(T *dest, const U *src, size_t n_elem) 83 | { 84 | unsigned long i = hamr::thread_id_to_array_index(); 85 | 86 | if (i >= n_elem) 87 | return; 88 | 89 | dest[i] = static_cast(src[i]); 90 | } 91 | 92 | /// default construct on the GPU 93 | template 94 | __global__ 95 | void construct(T *dest, size_t n_elem) 96 | { 97 | unsigned long i = hamr::thread_id_to_array_index(); 98 | 99 | if (i >= n_elem) 100 | return; 101 | 102 | new (&dest[i]) T(); 103 | } 104 | 105 | /// copy construct on the GPU 106 | template 107 | __global__ 108 | void construct(T *dest, size_t n_elem, U val) 109 | { 110 | unsigned long i = hamr::thread_id_to_array_index(); 111 | 112 | if (i >= n_elem) 113 | return; 114 | 115 | new (&dest[i]) T(val); 116 | } 117 | 118 | /// copy construct on the GPU 119 | template 120 | __global__ 121 | void construct(T *dest, size_t n_elem, const U *vals) 122 | { 123 | unsigned long i = hamr::thread_id_to_array_index(); 124 | 125 | if (i >= n_elem) 126 | return; 127 | 128 | new (&dest[i]) T(vals[i]); 129 | } 130 | 131 | /// destruct on the GPU 132 | template 133 | __global__ 134 | void destruct(T *dest, size_t n_elem) 135 | { 136 | unsigned long i = hamr::thread_id_to_array_index(); 137 | 138 | if (i >= n_elem) 139 | return; 140 | 141 | dest[i].~T(); 142 | } 143 | 144 | /// initialize an array on the GPU 145 | template 146 | __global__ 147 | void fill(T *dest, size_t n_elem, U val) 148 | { 149 | unsigned long i = hamr::thread_id_to_array_index(); 150 | 151 | if (i >= n_elem) 152 | return; 153 | 154 | dest[i] = val; 155 | } 156 | 157 | /// initialize an array on the GPU 158 | template 159 | __global__ 160 | void fill(T *dest, size_t n_elem, const U *vals) 161 | { 162 | unsigned long i = hamr::thread_id_to_array_index(); 163 | 164 | if (i >= n_elem) 165 | return; 166 | 167 | dest[i] = vals[i]; 168 | } 169 | 170 | } 171 | 172 | } 173 | 174 | #endif 175 | 
-------------------------------------------------------------------------------- /test/test_hamr_pipeline_openmp.cpp: -------------------------------------------------------------------------------- 1 | #include "hamr_config.h" 2 | #include "hamr_buffer.h" 3 | #include "hamr_buffer_util.h" 4 | #include "hamr_buffer_allocator.h" 5 | 6 | #include 7 | 8 | using allocator = hamr::buffer_allocator; 9 | 10 | // ************************************************************************** 11 | template 12 | hamr::buffer initialize(size_t n_vals, const T &val) 13 | { 14 | // allocate the memory 15 | hamr::buffer ao(allocator::openmp, n_vals); 16 | T *pao = ao.data(); 17 | 18 | // initialize using openmp 19 | 20 | #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao) map(to: val) 21 | for (size_t i = 0; i < n_vals; ++i) 22 | { 23 | pao[i] = val; 24 | } 25 | 26 | // print the results 27 | std::cerr << "initialized to an array of " << n_vals << " to " << val << std::endl; 28 | 29 | if (n_vals < 33) 30 | { 31 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 32 | ao.print(); 33 | std::cerr << std::endl; 34 | } 35 | 36 | return ao; 37 | } 38 | 39 | 40 | // ************************************************************************** 41 | template 42 | hamr::buffer add(const hamr::buffer &a1, const hamr::buffer &a2) 43 | { 44 | size_t n_vals = a1.size(); 45 | 46 | // get the inputs 47 | auto spa1 = a1.get_openmp_accessible(); 48 | auto pa1 = spa1.get(); 49 | 50 | auto spa2 = a2.get_openmp_accessible(); 51 | auto pa2 = spa2.get(); 52 | 53 | // allocate the memory 54 | hamr::buffer ao(allocator::openmp, n_vals, T(0)); 55 | T *pao = ao.data(); 56 | 57 | // do the calculation 58 | #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pa1, pa2) 59 | for (size_t i = 0; i < n_vals; ++i) 60 | { 61 | pao[i] = pa1[i] + pa2[i]; 62 | } 63 | 64 | // print the results 65 | std::cerr << "added " << n_vals << " array " << typeid(T).name() << sizeof(T) 66 | << " to array " << 
typeid(U).name() << sizeof(U) << std::endl; 67 | 68 | if (n_vals < 33) 69 | { 70 | std::cerr << "a1 = "; a1.print(); std::cerr << std::endl; 71 | std::cerr << "a2 = "; a2.print(); std::cerr << std::endl; 72 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 73 | } 74 | 75 | return ao; 76 | } 77 | 78 | 79 | // ************************************************************************** 80 | template 81 | hamr::buffer multiply_scalar(const hamr::buffer &ai, const U &val) 82 | { 83 | size_t n_vals = ai.size(); 84 | 85 | // get the inputs 86 | auto spai = ai.get_openmp_accessible(); 87 | auto pai = spai.get(); 88 | 89 | // allocate the memory 90 | hamr::buffer ao(allocator::openmp, n_vals, T(0)); 91 | T *pao = ao.data(); 92 | 93 | // do the calculation 94 | #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pai) map(to: val) 95 | for (size_t i = 0; i < n_vals; ++i) 96 | { 97 | pao[i] = val * pai[i]; 98 | } 99 | 100 | // print the results 101 | std::cerr << "multiply_scalar " << val << " " << typeid(U).name() << sizeof(U) 102 | << " by " << n_vals << " array " << typeid(T).name() << sizeof(T) << std::endl; 103 | 104 | if (n_vals < 33) 105 | { 106 | std::cerr << "ai = "; ai.print(); std::cerr << std::endl; 107 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 108 | } 109 | 110 | return ao; 111 | } 112 | 113 | 114 | // ************************************************************************** 115 | template 116 | int compare_int(const hamr::buffer &ain, int val) 117 | { 118 | size_t n_vals = ain.size(); 119 | std::cerr << "comparing array with " << n_vals << " elements to " << val << std::endl; 120 | 121 | hamr::buffer ai(ain.get_allocator(), n_vals); 122 | ain.get(ai); 123 | 124 | if (n_vals < 33) 125 | { 126 | ai.print(); 127 | } 128 | 129 | auto [spai, pai] = hamr::get_host_accessible(ai); 130 | 131 | for (size_t i = 0; i < n_vals; ++i) 132 | { 133 | if (pai[i] != val) 134 | { 135 | std::cerr << "ERROR: pai[" << i << "] = " 136 | << pai[i] << 
" != " << val << std::endl; 137 | return -1; 138 | } 139 | } 140 | 141 | std::cerr << "all elements are equal to " << val << std::endl; 142 | 143 | return 0; 144 | } 145 | 146 | 147 | 148 | int main(int, char **) 149 | { 150 | size_t n_vals = 10000; 151 | 152 | hamr::buffer ao0(allocator::openmp, n_vals, 1.0f); // = 1 (device) 153 | hamr::buffer ao1 = multiply_scalar(ao0, 2.0f); // = 2 (device) 154 | ao0.free(); 155 | 156 | hamr::buffer ao2 = initialize(n_vals, 2.0); // = 2 (device) 157 | hamr::buffer ao3 = add(ao2, ao1); // = 4 (device) 158 | ao1.free(); 159 | ao2.free(); 160 | 161 | hamr::buffer ao4 = multiply_scalar(ao3, 1000.0); // = 4000 (device) 162 | ao3.free(); 163 | 164 | hamr::buffer ao5(allocator::malloc, n_vals, 3.0f); // = 1 (host) 165 | hamr::buffer ao6 = multiply_scalar(ao5, 100.0f); // = 300 (device) 166 | ao5.free(); 167 | 168 | hamr::buffer ao7(allocator::malloc, n_vals); // = uninit (host) 169 | ao7.set(ao6); // = 300 (host) 170 | ao6.free(); 171 | 172 | hamr::buffer ao8 = add(ao4, ao7); // = 4300 (device) 173 | ao4.free(); 174 | ao7.free(); 175 | 176 | return compare_int(ao8, 4300); 177 | } 178 | -------------------------------------------------------------------------------- /hamr_cuda_malloc_uva_allocator.h: -------------------------------------------------------------------------------- 1 | #ifndef hamr_cuda_malloc_uva_allocator_h 2 | #define hamr_cuda_malloc_uva_allocator_h 3 | 4 | #include "hamr_config.h" 5 | #include 6 | #include 7 | 8 | namespace hamr 9 | { 10 | 11 | /// a deleter for arrays allocated with cuda_malloc_uva 12 | template 13 | class cuda_malloc_uva_deleter {}; 14 | 15 | /// a deleter for arrays allocated with cuda_malloc_uva, specialized for objects 16 | template 17 | class cuda_malloc_uva_deleter::value>::type> 18 | { 19 | public: 20 | /** constructs the deleter 21 | * @param[in] ptr the pointer to the array to delete 22 | * @param[in] n the number of elements in the array 23 | */ 24 | cuda_malloc_uva_deleter(cudaStream_t 
str, T *ptr, size_t n); 25 | 26 | /** deletes the array 27 | * @param[in] ptr the pointer to the array to delete. must be the same as 28 | * that passed during construction. 29 | */ 30 | void operator()(T *ptr); 31 | 32 | private: 33 | T *m_ptr; 34 | size_t m_elem; 35 | cudaStream_t m_str; 36 | }; 37 | 38 | 39 | 40 | 41 | 42 | 43 | /// a deleter for arrays allocated with cuda_malloc_uva, specialized for numbers 44 | template 45 | class cuda_malloc_uva_deleter::value>::type> 46 | { 47 | public: 48 | /** constructs the deleter 49 | * @param[in] ptr the pointer to the array to delete 50 | * @param[in] n the number of elements in the array 51 | */ 52 | cuda_malloc_uva_deleter(cudaStream_t str, T *ptr, size_t n); 53 | 54 | /** deletes the array 55 | * @param[in] ptr the pointer to the array to delete. must be the same as 56 | * that passed during construction. 57 | */ 58 | void operator()(T *ptr); 59 | 60 | private: 61 | T *m_ptr; 62 | size_t m_elem; 63 | }; 64 | 65 | 66 | 67 | 68 | 69 | 70 | /// a class for allocating arrays with cuda_malloc_uva 71 | template 72 | struct cuda_malloc_uva_allocator {}; 73 | 74 | /// a class for allocating arrays with cuda_malloc_uva, specialized for objects 75 | template 76 | struct HAMR_EXPORT cuda_malloc_uva_allocator::value>::type> 77 | { 78 | /** allocate an array of n elements. 79 | * @param[in] str a stream used to order operations, or null for the 80 | * default stream 81 | * @param[in] n the number of elements to allocate 82 | * @returns a shared point to the array that holds a deleter for the memory 83 | */ 84 | static std::shared_ptr allocate(cudaStream_t str, size_t n); 85 | 86 | /** allocate an array of n elements. 
87 | * @param[in] str a stream used to order operations, or null for the 88 | * default stream 89 | * @param[in] n the number of elements to allocate 90 | * @param[in] val a value to initialize the elements to 91 | * @returns a shared point to the array that holds a deleter for the memory 92 | */ 93 | static std::shared_ptr allocate(cudaStream_t str, size_t n, const T &val); 94 | 95 | /** allocate an array of n elements. 96 | * @param[in] str a stream used to order operations, or null for the 97 | * default stream 98 | * @param[in] n the number of elements to allocate 99 | * @param[in] vals an array of values to initialize the elements with 100 | * @param[in] cudaVals a flag that is set to true if vals is accessible from codes running in CUDA 101 | * @returns a shared point to the array that holds a deleter for the memory 102 | */ 103 | template 104 | static std::shared_ptr allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false); 105 | }; 106 | 107 | 108 | 109 | 110 | 111 | /// a class for allocating arrays with cuda_malloc_uva, specialized for numbers 112 | template 113 | struct HAMR_EXPORT cuda_malloc_uva_allocator::value>::type> 114 | { 115 | /** allocate an array of n elements. 116 | * @param[in] str a stream used to order operations, or null for the 117 | * default stream 118 | * @param[in] n the number of elements to allocate 119 | * @returns a shared point to the array that holds a deleter for the memory 120 | */ 121 | static std::shared_ptr allocate(cudaStream_t str, size_t n); 122 | 123 | /** allocate an array of n elements. 
124 | * @param[in] str a stream used to order operations, or null for the 125 | * default stream 126 | * @param[in] n the number of elements to allocate 127 | * @param[in] val a value to initialize the elements to 128 | * @returns a shared point to the array that holds a deleter for the memory 129 | */ 130 | static std::shared_ptr allocate(cudaStream_t str, size_t n, const T &val); 131 | 132 | /** allocate an array of n elements. 133 | * @param[in] str a stream used to order operations, or null for the 134 | * default stream 135 | * @param[in] n the number of elements to allocate 136 | * @param[in] vals an array of values to initialize the elements with 137 | * @param[in] cudaVals a flag set to true if vals is accessible from codes running in CUDA 138 | * @returns a shared point to the array that holds a deleter for the memory 139 | */ 140 | template 141 | static std::shared_ptr allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false); 142 | }; 143 | 144 | } 145 | 146 | #if !defined(HAMR_SEPARATE_IMPL) 147 | #include "hamr_cuda_malloc_uva_allocator_impl.h" 148 | #endif 149 | 150 | #endif 151 | -------------------------------------------------------------------------------- /test/test_hamr_pipeline_host.cpp: -------------------------------------------------------------------------------- 1 | #include "hamr_buffer.h" 2 | #include "hamr_buffer_util.h" 3 | 4 | #include 5 | 6 | using hamr::buffer; 7 | using allocator = hamr::buffer_allocator; 8 | 9 | // ************************************************************************** 10 | template 11 | void initialize_host(T *data, double val, size_t n_vals) 12 | { 13 | for (size_t i = 0; i < n_vals; ++i) 14 | { 15 | data[i] = val; 16 | } 17 | } 18 | 19 | // ************************************************************************** 20 | template 21 | buffer initialize_host(size_t n_vals, const T &val) 22 | { 23 | // allocate the memory 24 | buffer ao(allocator::malloc, n_vals); 25 | T *pao = ao.data(); 26 | 
27 | // initialize the data 28 | initialize_host(pao, val, n_vals); 29 | 30 | std::cerr << "initialized to an array of " << n_vals << " to " << val << std::endl; 31 | if (n_vals < 33) 32 | { 33 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 34 | ao.print(); 35 | std::cerr << std::endl; 36 | } 37 | 38 | return ao; 39 | } 40 | 41 | 42 | 43 | 44 | 45 | 46 | // ************************************************************************** 47 | template 48 | void add_host(T *result, const T *array_1, const U *array_2, size_t n_vals) 49 | { 50 | for (size_t i = 0; i < n_vals; ++i) 51 | { 52 | result[i] = array_1[i] + array_2[i]; 53 | } 54 | } 55 | 56 | // ************************************************************************** 57 | template 58 | buffer add_host(const buffer &a1, const buffer &a2) 59 | { 60 | // get the inputs 61 | auto [spa1, pa1] = hamr::get_host_accessible(a1); 62 | auto [spa2, pa2] = hamr::get_host_accessible(a2); 63 | 64 | // allocate the memory 65 | size_t n_vals = a1.size(); 66 | buffer ao(allocator::malloc, n_vals, T(0)); 67 | T *pao = ao.data(); 68 | 69 | // initialize the data 70 | add_host(pao, pa1, pa2, n_vals); 71 | 72 | std::cerr << "added " << n_vals << " array " << typeid(T).name() 73 | << sizeof(T) << " to array " << typeid(U).name() << sizeof(U) << std::endl; 74 | if (n_vals < 33) 75 | { 76 | std::cerr << "a1 = "; a1.print(); std::cerr << std::endl; 77 | std::cerr << "a2 = "; a2.print(); std::cerr << std::endl; 78 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 79 | } 80 | 81 | return ao; 82 | } 83 | 84 | 85 | 86 | 87 | 88 | // ************************************************************************** 89 | template 90 | void multiply_scalar_host(T *result, const T *array_in, U scalar, size_t n_vals) 91 | { 92 | for (size_t i = 0; i < n_vals; ++i) 93 | { 94 | result[i] = array_in[i] * scalar; 95 | } 96 | } 97 | 98 | // ************************************************************************** 99 | template 100 | 
buffer multiply_scalar_host(const buffer &ai, const U &val) 101 | { 102 | // get the inputs 103 | auto [spai, pai] = hamr::get_host_accessible(ai); 104 | 105 | // allocate the memory 106 | size_t n_vals = ai.size(); 107 | buffer ao(allocator::malloc, n_vals, T(0)); 108 | T *pao = ao.data(); 109 | 110 | // initialize the data 111 | multiply_scalar_host(pao, pai, val, n_vals); 112 | 113 | std::cerr << "multiply_scalar " << val << " " << typeid(U).name() << sizeof(U) 114 | << " by " << n_vals << " array " << typeid(T).name() << sizeof(T) << std::endl; 115 | 116 | if (n_vals < 33) 117 | { 118 | std::cerr << "ain = "; ai.print(); std::cerr << std::endl; 119 | std::cerr << "ao = "; ao.print(); std::cerr << std::endl; 120 | } 121 | 122 | return ao; 123 | } 124 | 125 | // ************************************************************************** 126 | template 127 | int compare_int(const buffer &ain, int val) 128 | { 129 | size_t n_vals = ain.size(); 130 | std::cerr << "comparing array with " << n_vals 131 | << " elements to " << val << std::endl; 132 | 133 | buffer ai(ain.get_allocator(), n_vals); 134 | ain.get(ai); 135 | 136 | if (n_vals < 33) 137 | { 138 | ai.print(); 139 | } 140 | 141 | auto [spai, pai] = hamr::get_host_accessible(ai); 142 | 143 | for (size_t i = 0; i < n_vals; ++i) 144 | { 145 | if (pai[i] != val) 146 | { 147 | std::cerr << "ERROR: pai[" << i << "] = " << pai[i] 148 | << " != " << val << std::endl; 149 | return -1; 150 | } 151 | } 152 | 153 | std::cerr << "all elements are equal to " << val << std::endl; 154 | 155 | return 0; 156 | } 157 | 158 | 159 | 160 | int main(int, char **) 161 | { 162 | size_t n_vals = 100000; 163 | 164 | buffer ao0(allocator::malloc, n_vals, 1.0f); // = 1 (host) 165 | buffer ao1 = multiply_scalar_host(ao0, 2.0f); // = 2 (host) 166 | ao0.free(); 167 | 168 | buffer ao2 = initialize_host(n_vals, 2.0); // = 2 (host) 169 | buffer ao3 = add_host(ao2, ao1); // = 4 (host) 170 | ao1.free(); 171 | ao2.free(); 172 | 173 | buffer ao4 = 
multiply_scalar_host(ao3, 1000.0); // = 4000 (host) 174 | ao3.free(); 175 | 176 | buffer ao5(allocator::malloc, n_vals, 3.0f); // = 1 (host) 177 | buffer ao6 = multiply_scalar_host(ao5, 100.0f); // = 300 (host) 178 | ao5.free(); 179 | 180 | buffer ao7(allocator::malloc, n_vals); // = uninit (host) 181 | ao7.set(ao6); // = 300 (host) 182 | 183 | buffer ao8 = add_host(ao4, ao7); // = 4300 (host) 184 | ao4.free(); 185 | ao7.free(); 186 | 187 | int res = compare_int(ao8, 4300); 188 | ao8.free(); 189 | 190 | return res; 191 | } 192 | -------------------------------------------------------------------------------- /hamr_hip_launch.cxx: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include "hamr_hip_launch.h" 4 | 5 | #include "hamr_env.h" 6 | 7 | #include 8 | 9 | namespace hamr 10 | { 11 | // ************************************************************************** 12 | int synchronize() 13 | { 14 | hipError_t ierr = hipSuccess; 15 | if ((ierr = hipDeviceSynchronize()) != hipSuccess) 16 | { 17 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 18 | " Failed to synchronize HIP execution. " 19 | << hipGetErrorString(ierr) << std::endl; 20 | return -1; 21 | } 22 | return 0; 23 | } 24 | 25 | // ************************************************************************** 26 | int get_launch_props(int device_id, 27 | int *block_grid_max, int &warp_size, 28 | int &warps_per_block_max) 29 | { 30 | hipError_t ierr = hipSuccess; 31 | 32 | if (device_id < 0) 33 | { 34 | if ((ierr = hipGetDevice(&device_id)) != hipSuccess) 35 | { 36 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 37 | " Failed to get the active device id. 
" 38 | << hipGetErrorString(ierr) << std::endl; 39 | return -1; 40 | } 41 | } 42 | 43 | if (((ierr = hipDeviceGetAttribute(&block_grid_max[0], hipDeviceAttributeMaxGridDimX, device_id)) != hipSuccess) 44 | || ((ierr = hipDeviceGetAttribute(&block_grid_max[1], hipDeviceAttributeMaxGridDimY, device_id)) != hipSuccess) 45 | || ((ierr = hipDeviceGetAttribute(&block_grid_max[2], hipDeviceAttributeMaxGridDimZ, device_id)) != hipSuccess)) 46 | { 47 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 48 | " Failed to get HIP max grid dim. " << hipGetErrorString(ierr) << std::endl; 49 | return -1; 50 | } 51 | 52 | if ((ierr = hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, device_id)) != hipSuccess) 53 | { 54 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 55 | " Failed to get HIP warp size. " << hipGetErrorString(ierr) << std::endl; 56 | return -1; 57 | } 58 | 59 | int threads_per_block_max = 0; 60 | 61 | if ((ierr = hipDeviceGetAttribute(&threads_per_block_max, 62 | hipDeviceAttributeMaxThreadsPerBlock, device_id)) != hipSuccess) 63 | { 64 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 65 | " Failed to get HIP max threads per block. 
" << hipGetErrorString(ierr) << std::endl; 66 | return -1; 67 | } 68 | 69 | warps_per_block_max = threads_per_block_max / warp_size; 70 | 71 | return 0; 72 | } 73 | 74 | // ************************************************************************** 75 | int partition_thread_blocks(size_t array_size, 76 | int warps_per_block, int warp_size, int *block_grid_max, 77 | dim3 &block_grid, int &n_blocks, dim3 &thread_grid) 78 | { 79 | unsigned long threads_per_block = warps_per_block * warp_size; 80 | 81 | thread_grid.x = threads_per_block; 82 | thread_grid.y = 1; 83 | thread_grid.z = 1; 84 | 85 | unsigned long block_size = threads_per_block; 86 | n_blocks = array_size / block_size; 87 | 88 | if (array_size % block_size) 89 | ++n_blocks; 90 | 91 | if (n_blocks > block_grid_max[0]) 92 | { 93 | // multi-d decomp required 94 | block_grid.x = block_grid_max[0]; 95 | block_grid.y = n_blocks / block_grid_max[0]; 96 | if (n_blocks % block_grid_max[0]) 97 | { 98 | ++block_grid.y; 99 | } 100 | 101 | if (block_grid.y > ((unsigned int)block_grid_max[1])) 102 | { 103 | // 3d decomp 104 | unsigned long block_grid_max01 = block_grid_max[0] * block_grid_max[1]; 105 | block_grid.y = block_grid_max[1]; 106 | block_grid.z = n_blocks / block_grid_max01; 107 | 108 | if (n_blocks % block_grid_max01) 109 | ++block_grid.z; 110 | 111 | if (block_grid.z > ((unsigned int)block_grid_max[2])) 112 | { 113 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 114 | " Too many blocks " << n_blocks << " of size " << block_size 115 | << " are required for a grid of (" << block_grid_max[0] << ", " 116 | << block_grid_max[1] << ", " << block_grid_max[2] 117 | << ") blocks. Hint: increase the number of warps per block." 
<< std::endl; 118 | return -1; 119 | } 120 | } 121 | else 122 | { 123 | // 2d decomp 124 | block_grid.z = 1; 125 | } 126 | } 127 | else 128 | { 129 | // 1d decomp 130 | block_grid.x = n_blocks; 131 | block_grid.y = 1; 132 | block_grid.z = 1; 133 | } 134 | 135 | #if defined(HAMR_VERBOSE) 136 | if (hamr::get_verbose()) 137 | { 138 | std::cerr << "partition_thread_blocks arrays_size = " << array_size 139 | << " warps_per_block = " << warps_per_block << " warp_size = " << warp_size 140 | << " block_grid_max = (" << block_grid_max[0] << ", " << block_grid_max[1] 141 | << ", " << block_grid_max[2] << ") block_grid = (" << block_grid.x << ", " 142 | << block_grid.y << ", " << block_grid.z << ") n_blocks = " << n_blocks 143 | << " thread_grid = (" << thread_grid.x << ", " << thread_grid.y << ", " 144 | << thread_grid.z << ")" << std::endl; 145 | } 146 | #endif 147 | 148 | return 0; 149 | } 150 | 151 | // ************************************************************************** 152 | int partition_thread_blocks(int device_id, size_t array_size, 153 | int warps_per_block, dim3 &block_grid, int &n_blocks, 154 | dim3 &thread_grid) 155 | { 156 | int block_grid_max[3] = {0}; 157 | int warp_size = 0; 158 | int warps_per_block_max = 0; 159 | if (get_launch_props(device_id, block_grid_max, 160 | warp_size, warps_per_block_max)) 161 | { 162 | std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:" 163 | " Failed to get launch properties" << std::endl; 164 | return -1; 165 | } 166 | 167 | return partition_thread_blocks(array_size, warps_per_block, 168 | warp_size, block_grid_max, block_grid, n_blocks, 169 | thread_grid); 170 | } 171 | 172 | } 173 | --------------------------------------------------------------------------------