├── python
    ├── hamr.py
    ├── hamr_py.i
    └── CMakeLists.txt
├── doc
    └── rtd
    │   ├── requirements.txt
    │   ├── source
    │       ├── hello_cuda
    │       │   ├── Makefile
    │       │   ├── add.cuh
    │       │   ├── write.h
    │       │   ├── hello_cuda.cu
    │       │   └── add.h
    │       ├── hello_hip
    │       │   ├── Makefile
    │       │   ├── add_kernel.h
    │       │   ├── write.h
    │       │   ├── hello_hip.cpp
    │       │   └── add.h
    │       ├── hello_openmp
    │       │   ├── write.h
    │       │   ├── Makefile
    │       │   ├── hello_openmp.cpp
    │       │   └── add.h
    │       ├── zero_copy_cupy
    │       │   ├── python_to_cpp.py
    │       │   └── cpp_to_python.py
    │       └── hello_cupy
    │       │   └── hello_cupy.py
    │   ├── _static
    │       └── theme_overrides.css
    │   ├── Makefile
    │   ├── make.bat
    │   └── conf.py
├── .gitignore
├── hamr_buffer_allocator.i
├── hamr_buffer_transfer.i
├── CITATION.cff
├── .readthedocs.yaml
├── test
    ├── test_hamr_openmp_allocator.cpp
    ├── test_hamr_buffer_cupy_host.py
    ├── test_hamr_buffer_numpy_cuda.py
    ├── test_hamr_multi_gpu_cuda.cpp
    ├── test_hamr_multi_gpu_hip.cpp
    ├── test_hamr_buffer_cupy_cuda.py
    ├── test_hamr_buffer_numpy_host.py
    ├── test_hamr_pipeline_cuda_openmp.cpp
    ├── test_hamr_pipeline_cuda_openmp_mp.cpp
    ├── test_hamr_pipeline_openmp.cpp
    └── test_hamr_pipeline_host.cpp
├── hamr_env.h
├── hamr_buffer_transfer.h
├── hamr_hip_print.h
├── hamr_stream.i
├── hamr_cuda_print.h
├── hamr_env.cxx
├── hamr_gil_state.h
├── hamr_openmp_print.h
├── hamr_config.cmake.in
├── hamr_config.h.in
├── hamr_hip_print.cxx
├── hamr_openmp_print.cxx
├── hamr_buffer_pointer.h
├── hamr_host_copy.h
├── hamr_python_deleter_impl.h
├── hamr_buffer_allocator.cxx
├── hamr_openmp_device.h
├── hamr_cuda_print.cxx
├── hamr_hip_device.h
├── hamr_openmp_print_impl.h
├── hamr_host_copy.cxx
├── hamr_python_deleter.h
├── .github
    └── workflows
    │   ├── build_and_test_cuda.yml
    │   ├── build_and_test_hip.yml
    │   ├── build_and_test_amd_openmp.yml
    │   └── build_and_test_host.yml
├── hamr_openmp_device.cxx
├── hamr_host_copy_impl.h
├── hamr_python_deleter.cxx
├── hamr_buffer_handle.i
├── hamr_hip_print_impl.h
├── hamr_copier_traits.h
├── hamr_cuda_print_impl.h
├── hamr_stream.cxx
├── hamr_cuda_device.h
├── hamr_new_allocator.h
├── hamr_hip_device.cxx
├── hamr_new_allocator.cxx
├── README.md
├── hamr_openmp_allocator.cxx
├── hamr_malloc_allocator.cxx
├── hamr_hip_malloc_allocator.cxx
├── hamr_new_allocator_impl.h
├── hamr_cuda_malloc_host_allocator.cxx
├── hamr_device.h
├── cmake
    └── hamr_omp_offload.cmake
├── hamr_stream.h
├── hamr_stream_impl.h
├── hamr_cuda_malloc_uva_allocator.cxx
├── hamr_cuda_malloc_async_allocator.cxx
├── LICENSE
├── hamr_hip_launch.h
├── hamr_cuda_launch.h
├── hamr_buffer_allocator.h
├── hamr_malloc_allocator.h
├── hamr_openmp_allocator.h
├── hamr_hip_malloc_allocator.h
├── hamr_cuda_malloc_allocator.cxx
├── hamr_cuda_malloc_host_allocator.h
├── hamr_hip_kernels.h
├── hamr_cuda_kernels.h
├── hamr_cuda_malloc_uva_allocator.h
└── hamr_hip_launch.cxx


/python/hamr.py:
--------------------------------------------------------------------------------
1 | from hamr.hamr_py import *
2 | 


--------------------------------------------------------------------------------
/doc/rtd/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx<7
2 | sphinxcontrib-bibtex
3 | breathe
4 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.sw[a-z]
2 | *.patch
3 | _build
4 | *.pt
5 | *.vscode*
6 | .DS_Store
7 | generated_rtd*
8 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_cuda/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | HAMR_SOURCE=../../../../
3 | HAMR_BUILD=../../../../build_cuda
4 | 
5 | all:
6 | 	nvcc hello_cuda.cu -I${HAMR_SOURCE} -I${HAMR_BUILD} -std=c++17 -L${HAMR_BUILD}/lib/ -lhamr
7 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_hip/Makefile:
--------------------------------------------------------------------------------
1 | 
2 | HAMR_SOURCE=../../../../
3 | HAMR_BUILD=../../../../build_hip
4 | 
5 | all:
6 | 	hipcc hello_hip.cpp -I${HAMR_SOURCE} -I${HAMR_BUILD} -std=c++17 -L${HAMR_BUILD}/lib/ -lhamr
7 | 


--------------------------------------------------------------------------------
/doc/rtd/_static/theme_overrides.css:
--------------------------------------------------------------------------------
 1 | /* override table width restrictions */
 2 | .wy-table-responsive table td, .wy-table-responsive table th {
 3 |     white-space: normal;
 4 | }
 5 | 
 6 | .wy-table-responsive {
 7 |     margin-bottom: 24px;
 8 |     max-width: 100%;
 9 |     overflow: visible;
10 | }
11 | 


--------------------------------------------------------------------------------
/hamr_buffer_allocator.i:
--------------------------------------------------------------------------------
1 | %{
2 | #include "hamr_config.h"
3 | #include "hamr_buffer_allocator.h"
4 | %}
5 | /***************************************************************************
6 |  * buffer allocator
7 |  **************************************************************************/
8 | %include "hamr_buffer_allocator.h"
9 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_cuda/add.cuh:
--------------------------------------------------------------------------------
 1 | template<typename T, typename U>
 2 | __global__
 3 | void add(T *result, const T *array_1, const U *array_2, size_t n_vals)
 4 | {
 5 |     unsigned long i = blockIdx.x*blockDim.x + threadIdx.x;
 6 | 
 7 |     if (i >= n_vals)
 8 |         return;
 9 | 
10 |     result[i] = array_1[i] + array_2[i];
11 | }
12 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_hip/add_kernel.h:
--------------------------------------------------------------------------------
 1 | template<typename T, typename U>
 2 | __global__
 3 | void add(T *result, const T *array_1, const U *array_2, size_t n_vals)
 4 | {
 5 |     unsigned long i = blockIdx.x*blockDim.x + threadIdx.x;
 6 | 
 7 |     if (i >= n_vals)
 8 |         return;
 9 | 
10 |     result[i] = array_1[i] + array_2[i];
11 | }
12 | 


--------------------------------------------------------------------------------
/hamr_buffer_transfer.i:
--------------------------------------------------------------------------------
 1 | %{
 2 | #include "hamr_config.h"
 3 | #include "hamr_buffer_transfer.h"
 4 | %}
 5 | /***************************************************************************
 6 |  * buffer transfer
 7 |  **************************************************************************/
 8 | %namewarn("") "async";
 9 | %include "hamr_buffer_transfer.h"
10 | 


--------------------------------------------------------------------------------
/CITATION.cff:
--------------------------------------------------------------------------------
 1 | cff-version: 1.2.0
 2 | message: "If you use this software, please cite it as below."
 3 | authors:
 4 |   - family-names: Loring
 5 |     given-names: Burlen
 6 |     orcid: https://orcid.org/0000-0002-4678-8142
 7 | title: "HAMR the Heterogeneous Accelerator Memory Resource"
 8 | version: 1.0.0
 9 | doi: https://zenodo.org/record/6471012 
10 | date-released: 2022-04-19
11 | 


--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
 1 | # .readthedocs.yml
 2 | # Read the Docs configuration file
 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
 4 | 
 5 | version: 2
 6 | 
 7 | build:
 8 |   os: "ubuntu-20.04"
 9 |   tools:
10 |     python: "3"
11 | 
12 | sphinx:
13 |   configuration: doc/rtd/conf.py
14 | 
15 | python:
16 |   install:
17 |     - requirements: doc/rtd/requirements.txt
18 | 


--------------------------------------------------------------------------------
/test/test_hamr_openmp_allocator.cpp:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | #include "hamr_openmp_allocator.h"
 3 | #include "hamr_openmp_print.h"
 4 | 
 5 | int main(int argc, char **argv)
 6 | {
 7 |     (void) argc;
 8 |     (void) argv;
 9 | 
10 |     {
11 |     auto data = hamr::openmp_allocator<double>::allocate(400, 3.1415);
12 | 
13 |     hamr::openmp_print(data.get(), 400);
14 |     }
15 | 
16 |     return 0;
17 | }
18 | 


--------------------------------------------------------------------------------
/hamr_env.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_env_h
 2 | #define hamr_env_h
 3 | 
 4 | #include "hamr_config.h"
 5 | 
 6 | /// heterogeneous accelerator memory resource
 7 | namespace hamr
 8 | {
 9 | 
10 | /// returns the value of the HAMR_VERBOSE environment variable
11 | #if defined(HAMR_VERBOSE)
12 | HAMR_EXPORT int get_verbose();
13 | #else
14 | constexpr HAMR_EXPORT int get_verbose() { return 0; }
15 | #endif
16 | 
17 | }
18 | 
19 | #endif
20 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_hip/write.h:
--------------------------------------------------------------------------------
 1 | template <typename T>
 2 | void write(std::ostream &os, const hamr::buffer<T> &ai)
 3 | {
 4 |     // get pointer to the input array that is safe to use on the host
 5 |     auto [spai, pai] = hamr::get_host_accessible(ai);
 6 | 
 7 |     // write the elements of the array to the stream
 8 |     for (int i = 0; i < ai.size(); ++i)
 9 |     {
10 |         os << pai[i] << " ";
11 |     }
12 | 
13 |     os << std::endl;
14 | }
15 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_openmp/write.h:
--------------------------------------------------------------------------------
 1 | template <typename T>
 2 | void write(std::ostream &os, const hamr::buffer<T> &ai)
 3 | {
 4 |     // get pointer to the input array that is safe to use on the host
 5 |     auto [spai, pai] = hamr::get_host_accessible(ai);
 6 | 
 7 |     // write the elements of the array to the stream
 8 |     for (size_t i = 0; i < ai.size(); ++i)
 9 |     {
10 |         os << pai[i] << " ";
11 |     }
12 | 
13 |     os << std::endl;
14 | }
15 | 


--------------------------------------------------------------------------------
/doc/rtd/source/zero_copy_cupy/python_to_cpp.py:
--------------------------------------------------------------------------------
 1 | from hamr import *
 2 | import cupy as cp
 3 | 
 4 | # create a cupy array on the GPU
 5 | n_elem = 16
 6 | arr = cp.full((n_elem), 3.1415, dtype='float32')
 7 | 
 8 | # zero-copy share the data with C++
 9 | buf = buffer(arr)
10 | 
11 | # modify the cupy array
12 | arr *= 10000
13 | 
14 | # print the buffer, which should reflect the modification because of the
15 | # zero-copy data sharing
16 | print('buf = %s\n'%(str(buf)))
17 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_cuda/write.h:
--------------------------------------------------------------------------------
 1 | template <typename T>
 2 | void write(std::ostream &os, const hamr::buffer<T> &ai)
 3 | {
 4 |     // get pointer to the input array that is safe to use on the host
 5 |     auto spai = ai.get_host_accessible();
 6 |     const T *pai = spai.get();
 7 | 
 8 |     // write the elements of the array to the stream
 9 |     for (int i = 0; i < ai.size(); ++i)
10 |     {
11 |         os << pai[i] << " ";
12 |     }
13 | 
14 |     os << std::endl;
15 | }
16 | 


--------------------------------------------------------------------------------
/doc/rtd/source/zero_copy_cupy/cpp_to_python.py:
--------------------------------------------------------------------------------
 1 | from hamr import *
 2 | import cupy as cp
 3 | 
 4 | # allocate some memory on the GPU
 5 | n_elem = 16
 6 | buf = buffer_float(buffer_allocator_cuda, n_elem, 3.1415)
 7 | 
 8 | # convert to a cupy array
 9 | arr = cp.array(buf.get_cuda_accessible(), copy=False)
10 | 
11 | # modify the cupy array
12 | arr *= 10000
13 | 
14 | # print the buffer, which should reflect the modification because of the
15 | # zero-copy data sharing
16 | print('buf = %s\n'%(str(buf)))
17 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_openmp/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | HAMR_SOURCE=../../../../
 3 | HAMR_BUILD=../../../../build_omp
 4 | 
 5 | # NVIDIA HPC Compiler
 6 | #CXX=`which nvc++`
 7 | #CXX_FLAGS=-mp=gpu -Minfo
 8 | 
 9 | # AMD ROCm compiler
10 | CXX=/opt/rocm/llvm/bin/amdclang++
11 | CXX_FLAGS=-target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx1030
12 | 
13 | all:
14 | 	${CXX} ${CXX_FLAGS} hello_openmp.cpp -I${HAMR_SOURCE} -I${HAMR_BUILD} -std=c++17 -L${HAMR_BUILD}/lib/ -lhamr
15 | 


--------------------------------------------------------------------------------
/hamr_buffer_transfer.h:
--------------------------------------------------------------------------------
 1 | #ifndef buffer_transfer_h
 2 | #define buffer_transfer_h
 3 | 
 4 | ///@file
 5 | 
 6 | /// heterogeneous accelerator memory resource
 7 | namespace hamr
 8 | {
 9 | 
10 | /** flag used to indicate whether or not a transfer operation should be
11 |  * synchronous or not.
12 |  */
13 | enum class buffer_transfer
14 | {
15 |     async = 0,   ///< all operations are asynchronous
16 |     sync_host = 1,///< operations moving data from GPU to host memory are synchronous
17 |     sync = 2     ///< all operations are synchronous
18 | };
19 | 
20 | }
21 | 
22 | #endif
23 | 


--------------------------------------------------------------------------------
/hamr_hip_print.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_hip_print_h
 2 | #define hamr_hip_print_h
 3 | 
 4 | #include "hamr_config.h"
 5 | 
 6 | /// heterogeneous accelerator memory resource
 7 | namespace hamr
 8 | {
 9 | 
10 | /** prints an array on the GPU
11 |  * @param[in] vals an array of n elements accessible in HIP
12 |  * @param[in] n_elem the length of the array
13 |  * @returns 0 if there were no errors
14 |  */
15 | template <typename T>
16 | int hip_print(T *vals, size_t n_elem);
17 | 
18 | }
19 | 
20 | #if !defined(HAMR_SEPARATE_IMPL)
21 | #include "hamr_hip_print_impl.h"
22 | #endif
23 | 
24 | #endif
25 | 


--------------------------------------------------------------------------------
/hamr_stream.i:
--------------------------------------------------------------------------------
 1 | %{
 2 | #include "hamr_config.h"
 3 | #include "hamr_stream.h"
 4 | %}
 5 | 
 6 | /***************************************************************************
 7 |  * stream
 8 |  **************************************************************************/
 9 | %namewarn("") "print";
10 | %ignore hamr::stream::operator=;
11 | #if defined(HAMR_ENABLE_CUDA)
12 | %ignore hamr::stream::operator cudaStream_t;
13 | #endif
14 | #if defined(HAMR_ENABLE_HIP)
15 | %ignore hamr::stream::operator hipStream_t;
16 | #endif
17 | %ignore hamr::stream::stream(stream &&);
18 | %include "hamr_stream.h"
19 | 


--------------------------------------------------------------------------------
/hamr_cuda_print.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_cuda_print_h
 2 | #define hamr_cuda_print_h
 3 | 
 4 | #include "hamr_config.h"
 5 | 
 6 | /// heterogeneous accelerator memory resource
 7 | namespace hamr
 8 | {
 9 | class stream;
10 | 
11 | /** prints an array on the GPU
12 |  * @param[in] vals an array of n elements accessible in CUDA
13 |  * @param[in] n_elem the length of the array
14 |  * @returns 0 if there were no errors
15 |  */
16 | template <typename T>
17 | int cuda_print(const hamr::stream &strm, T *vals, size_t n_elem);
18 | 
19 | }
20 | 
21 | #if !defined(HAMR_SEPARATE_IMPL)
22 | #include "hamr_cuda_print_impl.h"
23 | #endif
24 | 
25 | #endif
26 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_openmp/hello_openmp.cpp:
--------------------------------------------------------------------------------
 1 | #include <hamr_buffer.h>
 2 | 
 3 | #include <iostream>
 4 | #include <memory>
 5 | 
 6 | #include "add.h"
 7 | #include "write.h"
 8 | 
 9 | int main(int, char **)
10 | {
11 |     size_t n_vals = 400;
12 | 
13 |     // allocate and initialize to 1 on the GPU
14 |     hamr::buffer<float> a0(hamr::buffer_allocator::openmp, n_vals, 1.0f);
15 | 
16 |     // allocate and initialize to 1 on the host
17 |     hamr::buffer<float> a1(hamr::buffer_allocator::malloc, n_vals, 1.0f);
18 | 
19 |     // add the two arrays
20 |     hamr::buffer<float> a2 = add(a0, a1);
21 | 
22 |     // write the result
23 |     write(std::cerr, a2);
24 | 
25 |     return 0;
26 | }
27 | 


--------------------------------------------------------------------------------
/hamr_env.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_env.h"
 2 | 
 3 | #if defined(HAMR_VERBOSE)
 4 | 
 5 | #include <cstdlib>
 6 | #include <iostream>
 7 | 
 8 | namespace hamr
 9 | {
10 | 
11 | // **************************************************************************
12 | int get_verbose()
13 | {
14 |     static int ival = -1;
15 | 
16 |     if (ival < 0)
17 |     {
18 |         char *cval = getenv("HAMR_VERBOSE");
19 |         if (cval)
20 |         {
21 |             ival = atoi(cval);
22 |             std::cerr << "HAMR_VERBOSE=" << ival << std::endl;
23 |         }
24 |         else
25 |         {
26 |             ival = 0;
27 |         }
28 |     }
29 | 
30 |     return ival;
31 | }
32 | 
33 | }
34 | 
35 | #endif
36 | 


--------------------------------------------------------------------------------
/hamr_gil_state.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_gil_state_h
 2 | #define hamr_gil_state_h
 3 | 
 4 | #include <Python.h>
 5 | 
 6 | namespace hamr
 7 | {
 8 | 
 9 | /// A RAII helper for managing the Python GIL.
10 | /** The GIL is aquired and held while the object exists. The GIL must be held
11 |  * by C++ code invoking any Python C-API calls.
12 |  */
13 | class HAMR_EXPORT gil_state
14 | {
15 | public:
16 |     gil_state()
17 |     { m_state = PyGILState_Ensure(); }
18 | 
19 |     ~gil_state()
20 |     { PyGILState_Release(m_state); }
21 | 
22 |     gil_state(const gil_state&) = delete;
23 |     void operator=(const gil_state&) = delete;
24 | 
25 | private:
26 |     PyGILState_STATE m_state;
27 | };
28 | 
29 | }
30 | 
31 | #endif
32 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_hip/hello_hip.cpp:
--------------------------------------------------------------------------------
 1 | #include <hamr_buffer.h>
 2 | #include <hip/hip_runtime.h>
 3 | 
 4 | #include <iostream>
 5 | #include <memory>
 6 | 
 7 | #include "add.h"
 8 | #include "write.h"
 9 | 
10 | int main(int, char **)
11 | {
12 |     size_t n_vals = 400;
13 | 
14 |     // allocate and initialize to 1 on the GPU
15 |     hamr::buffer<float> a0(hamr::buffer_allocator::hip, n_vals, 1.0f);
16 | 
17 |     // allocate and initialize to 1 on the host
18 |     hamr::buffer<float> a1(hamr::buffer_allocator::malloc, n_vals, 1.0f);
19 | 
20 |     // add the two arrays
21 |     hamr::buffer<float> a2 = add(a0, a1);
22 | 
23 |     // write the result
24 |     write(std::cerr, a2);
25 | 
26 |     return 0;
27 | }
28 | 


--------------------------------------------------------------------------------
/hamr_openmp_print.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_openmp_print_impl_h
 2 | #define hamr_openmp_print_impl_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include <stddef.h>
 6 | 
 7 | /// heterogeneous accelerator memory resource
 8 | namespace hamr
 9 | {
10 | 
11 | /** prints an array on the host (note: OpenMP provides no way to print directly
12 |  * from the device)
13 |  * @param[in] vals an array of n elements accessible in OpenMP
14 |  * @param[in] n_elem the length of the array
15 |  * @returns 0 if there were no errors
16 |  */
17 | template <typename T>
18 | HAMR_EXPORT
19 | int openmp_print(T *vals, size_t n_elem);
20 | 
21 | }
22 | 
23 | #if !defined(HAMR_SEPARATE_IMPL)
24 | #include "hamr_openmp_print_impl.h"
25 | #endif
26 | #endif
27 | 


--------------------------------------------------------------------------------
/doc/rtd/Makefile:
--------------------------------------------------------------------------------
 1 | # Minimal makefile for Sphinx documentation
 2 | #
 3 | 
 4 | # You can set these variables from the command line, and also
 5 | # from the environment for the first two.
 6 | SPHINXOPTS    ?=
 7 | SPHINXBUILD   ?= sphinx-build
 8 | SOURCEDIR     = .
 9 | BUILDDIR      = _build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_cuda/hello_cuda.cu:
--------------------------------------------------------------------------------
 1 | #include <hamr_buffer.h>
 2 | #include <hamr_buffer_util.h>
 3 | #include <cuda.h>
 4 | #include <cuda_runtime.h>
 5 | #include <iostream>
 6 | #include <memory>
 7 | 
 8 | #include "add.h"
 9 | #include "write.h"
10 | 
11 | int main(int, char **)
12 | {
13 |     size_t n_vals = 400;
14 | 
15 |     // allocate and initialize to 1 on the GPU
16 |     hamr::buffer<float> a0(hamr::buffer_allocator::cuda, n_vals, 1.0f);
17 | 
18 |     // allocate and initialize to 1 on the host
19 |     hamr::buffer<float> a1(hamr::buffer_allocator::malloc, n_vals, 1.0f);
20 | 
21 |     // add the two arrays
22 |     hamr::buffer<float> a2 = add(a0, a1);
23 | 
24 |     // write the result
25 |     write(std::cerr, a2);
26 | 
27 |     return 0;
28 | }
29 | 


--------------------------------------------------------------------------------
/hamr_config.cmake.in:
--------------------------------------------------------------------------------
 1 | include(CMakeFindDependencyMacro)
 2 | 
 3 | list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}")
 4 | 
 5 | set(HAMR_LIB_TYPE STATIC)
 6 | if (@BUILD_SHARED_LIBS@)
 7 |   set(HAMR_LIB_TYPE SHARED)
 8 | endif()
 9 | 
10 | set(HAMR_SEPARATE_IMPL @HAMR_SEPARATE_IMPL@)
11 | set(HAMR_ENABLE_CUDA @HAMR_ENABLE_CUDA@)
12 | set(HAMR_NVHPC_CUDA @HAMR_NVHPC_CUDA@)
13 | set(HAMR_NVCC_CUDA @HAMR_NVCC_CUDA@)
14 | set(HAMR_CLANG_CUDA @HAMR_CLANG_CUDA@)
15 | set(HAMR_NVHPC_CUDA @HAMR_NVHPC_CUDA@)
16 | set(HAMR_ENABLE_HIP @HAMR_ENABLE_HIP@)
17 | set(HAMR_ENABLE_OPENMP @HAMR_ENABLE_HIP@)
18 | set(HAMR_ENABLE_OBJECTS @HAMR_ENABLE_OBJECTS@)
19 | set(HAMR_ENABLE_PAGE_LOCKED_MEMORY @HAMR_ENABLE_PAGE_LOCKED_MEMORY@)
20 | set(HAMR_ENABLE_PYTHON @HAMR_ENABLE_PYTHON@)
21 | set(HAMR_VERBOSE @HAMR_VERBOSE@)
22 | 
23 | include(hamr)
24 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_cuda/add.h:
--------------------------------------------------------------------------------
 1 | #include "add.cuh"
 2 | 
 3 | template <typename T, typename U>
 4 | hamr::buffer<T> add(const hamr::buffer<T> &a1, const hamr::buffer<U> &a2)
 5 | {
 6 |     size_t n_vals = a1.size();
 7 | 
 8 |     // get pointers to the input arrays that are safe to use on the GPU
 9 |     auto [spa1, pa1] = hamr::get_cuda_accessible(a1);
10 |     auto [spa2, pa2] = hamr::get_cuda_accessible(a2);
11 | 
12 |     // allocate the memory for the result on the GPU, and get a pointer to it
13 |     hamr::buffer<T> ao(hamr::buffer_allocator::cuda, n_vals, T(0));
14 |     T *pao = ao.data();
15 | 
16 |     // launch the kernel to add the arrays
17 |     dim3 thread_grid(128);
18 |     dim3 block_grid(n_vals/128 + (n_vals % 128 ? 1 : 0));
19 |     add<<<block_grid, thread_grid>>>(pao, pa1, pa2, n_vals);
20 | 
21 |     return ao;
22 | }
23 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_hip/add.h:
--------------------------------------------------------------------------------
 1 | #include "add_kernel.h"
 2 | 
 3 | template <typename T, typename U>
 4 | hamr::buffer<T> add(const hamr::buffer<T> &a1, const hamr::buffer<U> &a2)
 5 | {
 6 |     size_t n_vals = a1.size();
 7 | 
 8 |     // get pointers to the input arrays that are safe to use on the GPU
 9 |     auto [spa1, pa1] = hamr::get_hip_accessible(a1);
10 |     auto [spa2, pa2] = hamr::get_hip_accessible(a2);
11 | 
12 |     // allocate the memory for the result on the GPU, and get a pointer to it
13 |     hamr::buffer<T> ao(hamr::buffer_allocator::hip, n_vals, T(0));
14 |     T *pao = ao.data();
15 | 
16 |     // launch the kernel to add the arrays
17 |     dim3 thread_grid(128);
18 |     dim3 block_grid(n_vals/128 + (n_vals % 128 ? 1 : 0));
19 |     add<<<block_grid, thread_grid>>>(pao, pa1, pa2, n_vals);
20 | 
21 |     return ao;
22 | }
23 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_openmp/add.h:
--------------------------------------------------------------------------------
 1 | template <typename T, typename U>
 2 | hamr::buffer<T> add(const hamr::buffer<T> &a1, const hamr::buffer<U> &a2)
 3 | {
 4 |     size_t n_vals = a1.size();
 5 | 
 6 |     // get pointers to the input arrays that are safe to use on the GPU
 7 |     auto [spa1, pa1] = hamr::get_openmp_accessible(a1);
 8 |     auto [spa2, pa2] = hamr::get_openmp_accessible(a2);
 9 | 
10 |     // allocate the memory for the result on the GPU, and get a pointer to it
11 |     hamr::buffer<T> ao(hamr::buffer_allocator::openmp, n_vals, T(0));
12 |     T *pao = ao.data();
13 | 
14 |     // launch the kernel to add the arrays
15 |     #pragma omp target teams distribute parallel for is_device_ptr(pao, pa1, pa2)
16 |     for (size_t i = 0; i < n_vals; ++i)
17 |     {
18 |         pao[i] = pa1[i] + pa2[i];
19 |     }
20 | 
21 |     return ao;
22 | }
23 | 


--------------------------------------------------------------------------------
/doc/rtd/make.bat:
--------------------------------------------------------------------------------
 1 | @ECHO OFF
 2 | 
 3 | pushd %~dp0
 4 | 
 5 | REM Command file for Sphinx documentation
 6 | 
 7 | if "%SPHINXBUILD%" == "" (
 8 | 	set SPHINXBUILD=sphinx-build
 9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 | 
13 | if "%1" == "" goto help
14 | 
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | 	echo.
18 | 	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | 	echo.installed, then set the SPHINXBUILD environment variable to point
20 | 	echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | 	echo.may add the Sphinx directory to PATH.
22 | 	echo.
23 | 	echo.If you don't have Sphinx installed, grab it from
24 | 	echo.http://sphinx-doc.org/
25 | 	exit /b 1
26 | )
27 | 
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 | 
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 | 
34 | :end
35 | popd
36 | 


--------------------------------------------------------------------------------
/hamr_config.h.in:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_config_h
 2 | #define hamr_config_h
 3 | 
 4 | #define HAMR_EXPORT __attribute__ ((visibility ("default")))
 5 | #define HAMR_PRIVATE __attribute__ ((visibility ("hidden")))
 6 | 
 7 | #cmakedefine HAMR_SEPARATE_IMPL
 8 | #cmakedefine HAMR_ENABLE_CUDA
 9 | #cmakedefine HAMR_NVHPC_CUDA
10 | #cmakedefine HAMR_NVCC_CUDA
11 | #cmakedefine HAMR_CLANG_CUDA
12 | #cmakedefine HAMR_ENABLE_HIP
13 | #cmakedefine HAMR_ENABLE_OPENMP
14 | #define HAMR_OPENMP_LOOP @HAMR_OPENMP_LOOP@
15 | #cmakedefine HAMR_ENABLE_OBJECTS
16 | #cmakedefine HAMR_ENABLE_PAGE_LOCKED_MEMORY
17 | #cmakedefine HAMR_ENABLE_PYTHON
18 | #cmakedefine HAMR_VERBOSE
19 | 
20 | // work around an issue with clang compiling CUDA (clang 17/CUDA 12 May 2023).
21 | // problematic includes can go here, leaving the individual source files
22 | // unmodified.
23 | #if defined(HAMR_CLANG_CUDA)
24 | #undef __noinline__
25 | #include <memory>
26 | #include <type_traits>
27 | #define __noinline__ __attribute__((noinline))
28 | #endif
29 | 
30 | #endif
31 | 


--------------------------------------------------------------------------------
/hamr_hip_print.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_hip_print.h"
 4 | #include "hamr_hip_print_impl.h"
 5 | 
 6 | template int hamr::hip_print<float>(float *vals, size_t n_elem);
 7 | template int hamr::hip_print<double>(double *vals, size_t n_elem);
 8 | template int hamr::hip_print<char>(char *vals, size_t n_elem);
 9 | template int hamr::hip_print<signed char>(signed char *vals, size_t n_elem);
10 | template int hamr::hip_print<short>(short *vals, size_t n_elem);
11 | template int hamr::hip_print<int>(int *vals, size_t n_elem);
12 | template int hamr::hip_print<long>(long *vals, size_t n_elem);
13 | template int hamr::hip_print<long long>(long long *vals, size_t n_elem);
14 | template int hamr::hip_print<unsigned char>(unsigned char *vals, size_t n_elem);
15 | template int hamr::hip_print<unsigned short>(unsigned short *vals, size_t n_elem);
16 | template int hamr::hip_print<unsigned int>(unsigned int *vals, size_t n_elem);
17 | template int hamr::hip_print<unsigned long>(unsigned long *vals, size_t n_elem);
18 | template int hamr::hip_print<unsigned long long>(unsigned long long *vals, size_t n_elem);
19 | 


--------------------------------------------------------------------------------
/doc/rtd/source/hello_cupy/hello_cupy.py:
--------------------------------------------------------------------------------
 1 | from hamr import *
 2 | import cupy as cp
 3 | import numpy as np
 4 | import sys
 5 | 
 6 | 
 7 | def add(buf_0, buf_1):
 8 |     """ add 2 arrays on the GPU """
 9 |     arr_0 = cp.array(buf_0.get_cuda_accessible()) # share data w/ cupy on GPU
10 |     arr_1 = cp.array(buf_1.get_cuda_accessible()) # share data w/ cupy on GPU
11 |     arr_2 = arr_0 + arr_1                         # add on the GPU
12 |     buf_2 = buffer(arr_2)                         # zero-copy from cupy on GPU
13 |     return buf_2
14 | 
15 | def write(fh, buf):
16 |     """ print the array on the host """
17 |     arr = np.array(buf.get_host_accessible())      # share data w/ numpy on host
18 |     fh.write('%s\n'%(str(arr)))                   # write to the file on host
19 | 
20 | 
21 | n_vals = 400
22 | buf_0 = buffer_float(buffer_allocator_cuda, n_vals, 1.0)   # allocate on the host
23 | buf_1 = buffer_float(buffer_allocator_malloc, n_vals, 1.0) # allocate on the GPU
24 | 
25 | buf_2 = add(buf_0, buf_1)                                  # add the arrays
26 | 
27 | write(sys.stdout, buf_2)                                   # write the arrays
28 | 


--------------------------------------------------------------------------------
/hamr_openmp_print.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_openmp_print.h"
 4 | #include "hamr_openmp_print_impl.h"
 5 | 
 6 | template int hamr::openmp_print<float>(float *vals, size_t n_elem);
 7 | template int hamr::openmp_print<double>(double *vals, size_t n_elem);
 8 | template int hamr::openmp_print<char>(char *vals, size_t n_elem);
 9 | template int hamr::openmp_print<signed char>(signed char *vals, size_t n_elem);
10 | template int hamr::openmp_print<short>(short *vals, size_t n_elem);
11 | template int hamr::openmp_print<int>(int *vals, size_t n_elem);
12 | template int hamr::openmp_print<long>(long *vals, size_t n_elem);
13 | template int hamr::openmp_print<long long>(long long *vals, size_t n_elem);
14 | template int hamr::openmp_print<unsigned char>(unsigned char *vals, size_t n_elem);
15 | template int hamr::openmp_print<unsigned short>(unsigned short *vals, size_t n_elem);
16 | template int hamr::openmp_print<unsigned int>(unsigned int *vals, size_t n_elem);
17 | template int hamr::openmp_print<unsigned long>(unsigned long *vals, size_t n_elem);
18 | template int hamr::openmp_print<unsigned long long>(unsigned long long *vals, size_t n_elem);
19 | 


--------------------------------------------------------------------------------
/hamr_buffer_pointer.h:
--------------------------------------------------------------------------------
 1 | #ifndef buffer_pointer_h
 2 | #define buffer_pointer_h
 3 | 
 4 | #include <memory>
 5 | 
 6 | /// heterogeneous accelerator memory resource
 7 | namespace hamr
 8 | {
 9 | 
10 | template <typename T> class buffer;
11 | 
12 | ///  a shared pointer to an instance of a buffer<T>
13 | template <typename T>
14 | using p_buffer = std::shared_ptr<buffer<T>>;
15 | 
16 | ///  a shared pointer to an instance of a const buffer<T>
17 | template <typename T>
18 | using const_p_buffer = std::shared_ptr<const buffer<T>>;
19 | 
20 | /// a helper for explicitly casting to a const buffer pointer.
21 | template <typename T>
22 | hamr::const_p_buffer<T> const_ptr(const hamr::p_buffer<T> &v)
23 | {
24 |     return hamr::const_p_buffer<T>(v);
25 | }
26 | 
27 | /// a helper for getting a reference to pointed to hamr::buffer
28 | template <typename T>
29 | const hamr::buffer<T> &ref_to(const hamr::const_p_buffer<T> &ptr)
30 | {
31 |     return *(ptr.get());
32 | }
33 | 
34 | /// a helper for getting a reference to pointed to hamr::buffer
35 | template <typename T>
36 | hamr::buffer<T> &ref_to(const hamr::p_buffer<T> &ptr)
37 | {
38 |     return *(ptr.get());
39 | }
40 | 
41 | }
42 | 
43 | #endif
44 | 


--------------------------------------------------------------------------------
/hamr_host_copy.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_host_copy_h
 2 | #define hamr_host_copy_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include <memory>
 6 | #include <type_traits>
 7 | 
 8 | /// heterogeneous accelerator memory resource
 9 | namespace hamr
10 | {
11 | 
12 | /** Copies an array on the host.
13 |  *
14 |  * @param[in] dest an array of n elements accessible on the host
15 |  * @param[in] src an array of n elements accessible on the host
16 |  * @param[in] n_elem the number of elements in the array
17 |  * @returns 0 if there were no errors
18 |  */
19 | template <typename T, typename U>
20 | int copy_to_host_from_host(T *dest, const U *src, size_t n_elem);
21 | 
22 | /** Copies an array on the host (fast path for arrays of arithmetic types of the
23 |  * same type).
24 |  *
25 |  * @param[in] dest an array of n elements accessible in CUDA
26 |  * @param[in] src an array of n elements accessible on the host
27 |  * @param[in] n_elem the number of elements in the array
28 |  * @returns 0 if there were no errors
29 |  */
30 | template <typename T>
31 | int copy_to_host_from_host(T *dest, const T *src, size_t n_elem,
32 |    typename std::enable_if<std::is_arithmetic<T>::value>::type * = nullptr);
33 | 
34 | }
35 | 
36 | #if !defined(HAMR_SEPARATE_IMPL)
37 | #include "hamr_host_copy_impl.h"
38 | #endif
39 | 
40 | #endif
41 | 


--------------------------------------------------------------------------------
/hamr_python_deleter_impl.h:
--------------------------------------------------------------------------------
 1 | #include "hamr_gil_state.h"
 2 | #include <Python.h>
 3 | #include <iostream>
 4 | 
 5 | namespace hamr
 6 | {
 7 | 
 8 | // --------------------------------------------------------------------------
 9 | template <typename T>
10 | python_deleter<T>::python_deleter(T *ptr, size_t n, PyObject *obj)
11 |     : m_ptr(ptr), m_elem(n), m_object(obj)
12 | {
13 | #if defined(HAMR_VERBOSE)
14 |     if (hamr::get_verbose())
15 |     {
16 |         std::cerr << "created python_deleter for array of " << n
17 |             << " objects of type " << typeid(T).name() << sizeof(T)
18 |             << " holding a reference to " << m_object << std::endl;
19 |     }
20 | #endif
21 |     hamr::gil_state gil;
22 |     Py_INCREF(obj);
23 | }
24 | 
25 | // --------------------------------------------------------------------------
26 | template <typename T>
27 | void python_deleter<T>::operator()(T *ptr)
28 | {
29 |     (void)ptr;
30 |     assert(ptr == m_ptr);
31 | #if defined(HAMR_VERBOSE)
32 |     if (hamr::get_verbose())
33 |     {
34 |         std::cerr << "python_deleter deleting array of " << m_elem
35 |             << " objects of type " << typeid(T).name() << sizeof(T)
36 |             << " release reference to " << m_object << std::endl;
37 |     }
38 | #endif
39 |     hamr::gil_state gil;
40 |     Py_DECREF(m_object);
41 | }
42 | 
43 | }
44 | 


--------------------------------------------------------------------------------
/hamr_buffer_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_buffer_allocator.h"
 2 | 
 3 | namespace hamr
 4 | {
 5 | 
 6 | // **************************************************************************
 7 | const char *get_allocator_name(buffer_allocator alloc)
 8 | {
 9 |     if (alloc == buffer_allocator::cpp)
10 |     {
11 |         return "cpp";
12 |     }
13 |     else if (alloc == buffer_allocator::malloc)
14 |     {
15 |         return "malloc";
16 |     }
17 |     else if (alloc == buffer_allocator::cuda)
18 |     {
19 |         return "cuda_malloc_allocator";
20 |     }
21 |     else if (alloc == buffer_allocator::cuda_host)
22 |     {
23 |         return "cuda_malloc_host_allocator";
24 |     }
25 |     else if (alloc == buffer_allocator::cuda_async)
26 |     {
27 |         return "cuda_malloc_async_allocator";
28 |     }
29 |     else if (alloc == buffer_allocator::cuda_uva)
30 |     {
31 |         return "cuda_malloc_uva_allocator";
32 |     }
33 |     else if (alloc == buffer_allocator::hip)
34 |     {
35 |         return "hip_malloc_allocator";
36 |     }
37 |     else if (alloc == buffer_allocator::hip_uva)
38 |     {
39 |         return "hip_malloc_uva_allocator";
40 |     }
41 |     else if (alloc == buffer_allocator::openmp)
42 |     {
43 |         return "openmp_allocator";
44 |     }
45 | 
46 |     return "the allocator name is not known";
47 | }
48 | 
49 | }
50 | 


--------------------------------------------------------------------------------
/test/test_hamr_buffer_cupy_host.py:
--------------------------------------------------------------------------------
 1 | from hamr import *
 2 | import cupy as cp
 3 | import sys
 4 | 
 5 | stderr = sys.__stderr__
 6 | 
 7 | stderr.write('TEST: creating a hamr::buffer host ... \n')
 8 | buf = buffer_float(buffer_allocator_malloc, 16, 3.1415)
 9 | stderr.write('buf = %s\n'%(str(buf)))
10 | stderr.write('TEST: creating a hamr::buffer host ... OK!\n\n')
11 | 
12 | stderr.write('TEST: get a handle to the data ... \n')
13 | h = buf.get_cuda_accessible()
14 | stderr.write('TEST: get a handle to the data ... OK!\n\n')
15 | 
16 | stderr.write('TEST: share the data with Cupy ... \n')
17 | arr = cp.array(h, copy=False)
18 | stderr.write('arr.__cuda_array_interface__ = %s\n'%(arr.__cuda_array_interface__))
19 | stderr.write('TEST: share the data with Cupy ... OK!\n\n')
20 | 
21 | stderr.write('TEST: deleting the hamr::buffer ... \n')
22 | buf = None
23 | stderr.write('TEST: deleting the hamr::buffer ... OK!\n\n')
24 | 
25 | stderr.write('TEST: Cupy reads the data ... \n')
26 | stderr.write('arr = %s\n'%(str(arr)))
27 | stderr.write('TEST: Cupy reads the data ... OK!\n\n')
28 | 
29 | stderr.write('TEST: Cupy modifies the data ... \n')
30 | arr *= 10000
31 | stderr.write('arr = %s\n'%(str(arr)))
32 | stderr.write('TEST: Cupy modifies the data ... OK!\n\n')
33 | 
34 | stderr.write('TEST: deleting the Cupy array ... \n')
35 | arr = None
36 | stderr.write('TEST: deleting the Cupy array ... OK!\n\n')
37 | 
38 | sys.exit(0)
39 | 


--------------------------------------------------------------------------------
/hamr_openmp_device.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_openmp_device_h
 2 | #define hamr_openmp_device_h
 3 | 
 4 | #include "hamr_config.h"
 5 | 
 6 | ///@file
 7 | 
 8 | namespace hamr
 9 | {
10 | /// gets the device identifier for the first GPU. @returns zero if successful.
11 | inline int HAMR_EXPORT get_openmp_device_identifier(int &dev_id) { dev_id = 0; return 0; }
12 | 
13 | /// gets the device identifier for the host. @returns zero if successful.
14 | int HAMR_EXPORT get_openmp_host_identifier(int &dev_id);
15 | 
16 | /// gets the currently atcive HIP device. returns zero if successful.
17 | int HAMR_EXPORT get_active_openmp_device(int &dev_id);
18 | 
19 | /// sets the active HIP device. returns zero if successful.
20 | int HAMR_EXPORT set_active_openmp_device(int dev_id);
21 | 
22 | /// gets the device that owns the given pointer. @returns zero if successful.
23 | int HAMR_EXPORT get_openmp_device(const void *ptr, int &device_id);
24 | 
25 | /** Activate the specified HIP device, and restore the previously active
26 |  * device when the object is destroyed.
27 |  */
28 | class HAMR_EXPORT activate_openmp_device
29 | {
30 | public:
31 |     activate_openmp_device() = delete;
32 |     activate_openmp_device(const activate_openmp_device &) = delete;
33 |     void operator=(const activate_openmp_device &) = delete;
34 | 
35 |     activate_openmp_device(int id);
36 |     ~activate_openmp_device();
37 | 
38 | private:
39 |     int m_device;
40 | };
41 | 
42 | }
43 | #endif
44 | 


--------------------------------------------------------------------------------
/hamr_cuda_print.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_cuda_print.h"
 4 | #include "hamr_cuda_print_impl.h"
 5 | 
 6 | template int hamr::cuda_print<float>(const hamr::stream &strm, float *vals, size_t n_elem);
 7 | template int hamr::cuda_print<double>(const hamr::stream &strm, double *vals, size_t n_elem);
 8 | template int hamr::cuda_print<char>(const hamr::stream &strm, char *vals, size_t n_elem);
 9 | template int hamr::cuda_print<signed char>(const hamr::stream &strm, signed char *vals, size_t n_elem);
10 | template int hamr::cuda_print<short>(const hamr::stream &strm, short *vals, size_t n_elem);
11 | template int hamr::cuda_print<int>(const hamr::stream &strm, int *vals, size_t n_elem);
12 | template int hamr::cuda_print<long>(const hamr::stream &strm, long *vals, size_t n_elem);
13 | template int hamr::cuda_print<long long>(const hamr::stream &strm, long long *vals, size_t n_elem);
14 | template int hamr::cuda_print<unsigned char>(const hamr::stream &strm, unsigned char *vals, size_t n_elem);
15 | template int hamr::cuda_print<unsigned short>(const hamr::stream &strm, unsigned short *vals, size_t n_elem);
16 | template int hamr::cuda_print<unsigned int>(const hamr::stream &strm, unsigned int *vals, size_t n_elem);
17 | template int hamr::cuda_print<unsigned long>(const hamr::stream &strm, unsigned long *vals, size_t n_elem);
18 | template int hamr::cuda_print<unsigned long long>(const hamr::stream &strm, unsigned long long *vals, size_t n_elem);
19 | 


--------------------------------------------------------------------------------
/hamr_hip_device.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_hip_device_h
 2 | #define hamr_hip_device_h
 3 | 
 4 | #include "hamr_config.h"
 5 | 
 6 | ///@file
 7 | 
 8 | namespace hamr
 9 | {
10 | /// gets the device identifier for the first GPU. @returns zero if successful.
11 | inline int HAMR_EXPORT get_hip_device_identifier(int &dev_id) { dev_id = 0; return 0; }
12 | 
13 | /// gets the device identifier for the host. @returns zero if successful.
14 | inline int HAMR_EXPORT get_hip_host_identifier(int &dev_id) { dev_id = -1; return 0; }
15 | 
16 | /// gets the currently atcive HIP device. returns zero if successful.
17 | int HAMR_EXPORT get_active_hip_device(int &dev_id);
18 | 
19 | /// sets the active HIP device. returns zero if successful.
20 | int HAMR_EXPORT set_active_hip_device(int dev_id);
21 | 
22 | /// gets the device that owns the given pointer. @returns zero if successful.
23 | int HAMR_EXPORT get_hip_device(const void *ptr, int &device_id);
24 | 
25 | 
26 | /** Activate the specified HIP device, and restore the previously active
27 |  * device when the object is destroyed.
28 |  */
29 | class HAMR_EXPORT activate_hip_device
30 | {
31 | public:
32 |     activate_hip_device() = delete;
33 |     activate_hip_device(const activate_hip_device &) = delete;
34 |     void operator=(const activate_hip_device &) = delete;
35 | 
36 |     activate_hip_device(int id);
37 |     ~activate_hip_device();
38 | 
39 | private:
40 |     int m_device;
41 | };
42 | 
43 | }
44 | 
45 | 
46 | 
47 | 
48 | #endif
49 | 


--------------------------------------------------------------------------------
/hamr_openmp_print_impl.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_openmp_print_h
 2 | #define hamr_openmp_print_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include "hamr_env.h"
 6 | #if defined(HAMR_ENABLE_OPENMP)
 7 | #include "hamr_openmp_copy.h"
 8 | #include "hamr_malloc_allocator.h"
 9 | #endif
10 | 
11 | #include <iostream>
12 | 
13 | /// heterogeneous accelerator memory resource
14 | namespace hamr
15 | {
16 | // ---------------------------------------------------------------------------
17 | template <typename T>
18 | int openmp_print(T *vals, size_t n_elem)
19 | {
20 | #if !defined(HAMR_ENABLE_OPENMP)
21 |     (void) vals;
22 |     (void) n_elem;
23 |     std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
24 |         " print_openmp failed because OpenMP is not enabled." << std::endl;
25 |     return -1;
26 | #else
27 | 
28 |     // allocate a temporary on the host
29 |     auto sptmp = hamr::malloc_allocator<T>::allocate(n_elem);
30 |     T *ptmp = sptmp.get();
31 | 
32 |     // move to the host
33 |     if (hamr::copy_to_host_from_openmp(ptmp, vals, n_elem))
34 |     {
35 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
36 |             " failed to move data to the host" << std::endl;
37 |         return -1;
38 |     }
39 | 
40 |     // print
41 |     if (n_elem)
42 |     {
43 |         std::cerr << ptmp[0];
44 |         for (size_t i = 1; i < n_elem; ++i)
45 |         {
46 |             std::cerr << ", " << ptmp[i];
47 |         }
48 |     }
49 |     std::cerr << std::endl;
50 | 
51 |     return 0;
52 | #endif
53 | }
54 | 
55 | }
56 | #endif
57 | 


--------------------------------------------------------------------------------
/hamr_host_copy.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_host_copy.h"
 4 | #include "hamr_host_copy_impl.h"
 5 | 
 6 | #define hamr_host_copy_instantiate_(T, U) \
 7 | template int hamr::copy_to_host_from_host<T,U>(T *dest, const U *src, size_t n_elem);
 8 | 
 9 | #define hamr_host_copy_instantiate(T) \
10 | template int hamr::copy_to_host_from_host<T>(T *dest, const T *src, size_t n_elem, void *); \
11 | hamr_host_copy_instantiate_(T, float) \
12 | hamr_host_copy_instantiate_(T, double) \
13 | hamr_host_copy_instantiate_(T, char) \
14 | hamr_host_copy_instantiate_(T, signed char) \
15 | hamr_host_copy_instantiate_(T, short) \
16 | hamr_host_copy_instantiate_(T, int) \
17 | hamr_host_copy_instantiate_(T, long) \
18 | hamr_host_copy_instantiate_(T, long long) \
19 | hamr_host_copy_instantiate_(T, unsigned char) \
20 | hamr_host_copy_instantiate_(T, unsigned short) \
21 | hamr_host_copy_instantiate_(T, unsigned int) \
22 | hamr_host_copy_instantiate_(T, unsigned long) \
23 | hamr_host_copy_instantiate_(T, unsigned long long)
24 | 
25 | hamr_host_copy_instantiate(float)
26 | hamr_host_copy_instantiate(double)
27 | hamr_host_copy_instantiate(char)
28 | hamr_host_copy_instantiate(signed char)
29 | hamr_host_copy_instantiate(short)
30 | hamr_host_copy_instantiate(int)
31 | hamr_host_copy_instantiate(long)
32 | hamr_host_copy_instantiate(long long)
33 | hamr_host_copy_instantiate(unsigned char)
34 | hamr_host_copy_instantiate(unsigned short)
35 | hamr_host_copy_instantiate(unsigned int)
36 | hamr_host_copy_instantiate(unsigned long)
37 | hamr_host_copy_instantiate(unsigned long long)
38 | 


--------------------------------------------------------------------------------
/hamr_python_deleter.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_python_deleter_h
 2 | #define hamr_python_deleter_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include <Python.h>
 6 | 
 7 | namespace hamr
 8 | {
 9 | 
10 | /// a deleter for memory managed from within Python
11 | /** This class manages an array allocated by a Python code. In the functor's
12 |  * constructor a refrence to a user provdied Python object is stolen. When the
13 |  * functor is invoked, a reference to this Python object is released. It is up
14 |  * to the Python object to free the memory. One may use a PyCapsule to
15 |  * implement custom delete methods if they are needed.
16 |  */
17 | template <typename T>
18 | class HAMR_EXPORT python_deleter
19 | {
20 | public:
21 |     /** constructs the deleter. A reference to obj is stolen by this constructor.
22 |      * @param[in] ptr a pointer to shared data
23 |      * @param[in] n_elem the number of elements of type T shared
24 |      * @param[in] obj a PyObject who's reference count will be decremented when
25 |      *                the data shared from Python is no longer needed.
26 |      */
27 |     python_deleter(T *ptr, size_t n_elem, PyObject *obj);
28 | 
29 |     /** deletes the array
30 |      * @param[in] ptr the pointer to the array to delete. must be the same as
31 |      *                that passed during construction.
32 |      */
33 |     void operator()(T *ptr);
34 | 
35 | private:
36 |     T *m_ptr;
37 |     size_t m_elem;
38 |     PyObject *m_object;
39 | };
40 | 
41 | }
42 | 
43 | #if !defined(HAMR_SEPARATE_IMPL)
44 | #include "hamr_python_deleter_impl.h"
45 | #endif
46 | 
47 | #endif
48 | 


--------------------------------------------------------------------------------
/.github/workflows/build_and_test_cuda.yml:
--------------------------------------------------------------------------------
 1 | name: CUDA-HAMR
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 | 
 9 |   workflow_dispatch:
10 | 
11 | jobs:
12 |   build_and_test:
13 |     runs-on: ubuntu-22.04
14 | 
15 |     steps:
16 |       - uses: actions/checkout@v2
17 | 
18 |       - name: install_deps
19 |         run: |
20 |           export DEBIAN_FRONTEND="noninteractive"
21 |           export TZ="America/Los_Angeles"
22 |           sudo apt-get update -qq
23 |           sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python3 nvidia-cuda-toolkit
24 |           python3 -mvenv py3k_testing
25 |           source py3k_testing/bin/activate
26 |           python3 -mpip install numpy
27 | 
28 |       # build for CUDA
29 |       - name: build_cuda
30 |         run: |
31 |           source py3k_testing/bin/activate
32 |           mkdir build_cuda
33 |           cd build_cuda
34 |           cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DHAMR_ENABLE_CUDA=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-cuda ..
35 |           make -j2 install
36 |           cd ..
37 | 
38 |       # build for CUDA
39 |       - name: build_cuda_separate_impl
40 |         run: |
41 |           source py3k_testing/bin/activate
42 |           mkdir build_cuda_sep
43 |           cd build_cuda_sep
44 |           cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DHAMR_ENABLE_CUDA=ON -DHAMR_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-cuda-sep ..
45 |           make -j2 install
46 |           cd ..
47 | 


--------------------------------------------------------------------------------
/hamr_openmp_device.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_openmp_device.h"
 2 | 
 3 | #include <iostream>
 4 | #include <omp.h>
 5 | 
 6 | namespace hamr
 7 | {
 8 | // **************************************************************************
 9 | int get_openmp_host_identifier(int &dev_id)
10 | {
11 |     dev_id = omp_get_initial_device();
12 |     return 0;
13 | }
14 | 
15 | // **************************************************************************
16 | int get_active_openmp_device(int &dev_id)
17 | {
18 |     dev_id = omp_get_default_device();
19 |     return 0;
20 | }
21 | 
22 | // **************************************************************************
23 | int set_active_openmp_device(int dev_id)
24 | {
25 |     omp_set_default_device(dev_id);
26 |     return 0;
27 | }
28 | 
29 | // **************************************************************************
30 | int HAMR_EXPORT get_openmp_device(const void *ptr, int &device_id)
31 | {
32 |     (void)ptr;
33 |     device_id = 0;
34 |     return -1;
35 | }
36 | 
37 | // --------------------------------------------------------------------------
38 | activate_openmp_device::activate_openmp_device(int new_dev) : m_device(-1)
39 | {
40 |     int cur_dev = -1;
41 |     if (!get_active_openmp_device(cur_dev) && (cur_dev != new_dev) &&
42 |         !set_active_openmp_device(new_dev))
43 |     {
44 |         m_device = cur_dev;
45 |     }
46 | }
47 | 
48 | // --------------------------------------------------------------------------
49 | activate_openmp_device::~activate_openmp_device()
50 | {
51 |     if (m_device >= 0)
52 |     {
53 |         set_active_openmp_device(m_device);
54 |     }
55 | }
56 | 
57 | }
58 | 


--------------------------------------------------------------------------------
/hamr_host_copy_impl.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_host_copy_impl_h
 2 | #define hamr_host_copy_impl_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include "hamr_env.h"
 6 | 
 7 | #include <cstring>
 8 | #include <cstdlib>
 9 | #include <iostream>
10 | #include <memory>
11 | 
12 | /// heterogeneous accelerator memory resource
13 | namespace hamr
14 | {
15 | 
16 | // --------------------------------------------------------------------------
17 | template <typename T, typename U>
18 | int copy_to_host_from_host(T *dest, const U *src, size_t n_elem)
19 | {
20 |     for (size_t i = 0; i < n_elem; ++i)
21 |     {
22 |         dest[i] = static_cast<T>(src[i]);
23 |     }
24 | 
25 | #if defined(HAMR_VERBOSE)
26 |     if (hamr::get_verbose())
27 |     {
28 |         std::cerr << "hamr::copy_to_host_from_host " << n_elem
29 |             << " from " << typeid(U).name() << sizeof(U) << " to "
30 |             << typeid(T).name() << sizeof(T) << std::endl;
31 |     }
32 | #endif
33 | 
34 |     return 0;
35 | }
36 | 
37 | // --------------------------------------------------------------------------
38 | template <typename T>
39 | int copy_to_host_from_host(T *dest, const T *src, size_t n_elem,
40 |    typename std::enable_if<std::is_arithmetic<T>::value>::type *)
41 | {
42 |     // copy src to gpu
43 |     size_t n_bytes = n_elem*sizeof(T);
44 |     memcpy(dest, src, n_bytes);
45 | 
46 | #if defined(HAMR_VERBOSE)
47 |     if (hamr::get_verbose())
48 |     {
49 |         std::cerr << "hamr::copy_to_host_from_host same " << n_elem
50 |             << " " << typeid(T).name() << sizeof(T) << std::endl;
51 |     }
52 | #endif
53 | 
54 |     return 0;
55 | }
56 | 
57 | }
58 | 
59 | #endif
60 | 


--------------------------------------------------------------------------------
/hamr_python_deleter.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_python_deleter.h"
 4 | #include "hamr_python_deleter_impl.h"
 5 | 
 6 | template class hamr::python_deleter<float>;
 7 | template class hamr::python_deleter<double>;
 8 | template class hamr::python_deleter<char>;
 9 | template class hamr::python_deleter<signed char>;
10 | template class hamr::python_deleter<short>;
11 | template class hamr::python_deleter<int>;
12 | template class hamr::python_deleter<long>;
13 | template class hamr::python_deleter<long long>;
14 | template class hamr::python_deleter<unsigned char>;
15 | template class hamr::python_deleter<unsigned short>;
16 | template class hamr::python_deleter<unsigned int>;
17 | template class hamr::python_deleter<unsigned long>;
18 | template class hamr::python_deleter<unsigned long long>;
19 | 
20 | #include "hamr_buffer.h"
21 | #include "hamr_buffer_impl.h"
22 | 
23 | #define hamr_buffer_instantiate_python(T) \
24 | template hamr::buffer<T>::buffer(allocator alloc, const hamr::stream &strm, transfer sync, size_t size, int owner, T *ptr, hamr::python_deleter<T> df);
25 | 
26 | hamr_buffer_instantiate_python(float)
27 | hamr_buffer_instantiate_python(double)
28 | hamr_buffer_instantiate_python(char)
29 | hamr_buffer_instantiate_python(signed char)
30 | hamr_buffer_instantiate_python(short)
31 | hamr_buffer_instantiate_python(int)
32 | hamr_buffer_instantiate_python(long)
33 | hamr_buffer_instantiate_python(long long)
34 | hamr_buffer_instantiate_python(unsigned char)
35 | hamr_buffer_instantiate_python(unsigned short)
36 | hamr_buffer_instantiate_python(unsigned int)
37 | hamr_buffer_instantiate_python(unsigned long)
38 | hamr_buffer_instantiate_python(unsigned long long)
39 | 


--------------------------------------------------------------------------------
/test/test_hamr_buffer_numpy_cuda.py:
--------------------------------------------------------------------------------
 1 | from hamr import *
 2 | import numpy as np
 3 | import sys
 4 | 
 5 | stderr = sys.__stderr__
 6 | 
 7 | n_elem = 256
 8 | init_val = 3.1415
 9 | mod_val = 10000
10 | res_val = init_val*mod_val
11 | 
12 | # send data from C++ to Python
13 | stderr.write('TEST: C++ --> Python\n' \
14 |              '=======================\n')
15 | 
16 | stderr.write('TEST: creating a hamr::buffer w. CUDA ... \n')
17 | buf = buffer_float(buffer_allocator_cuda, 16, 3.1415)
18 | stderr.write('buf = %s\n'%(str(buf)))
19 | stderr.write('TEST: creating a hamr::buffer w. CUDA ... OK!\n\n')
20 | 
21 | stderr.write('TEST: get a handle to the data ... \n')
22 | h = buf.get_host_accessible()
23 | stderr.write('TEST: get a handle to the data ... OK!\n\n')
24 | 
25 | stderr.write('TEST: share the data with Numpy ... \n')
26 | arr = np.array(h, copy=False)
27 | stderr.write('arr.__array_interface__ = %s\n'%(arr.__array_interface__))
28 | stderr.write('TEST: share the data with Numpy ... OK!\n\n')
29 | 
30 | stderr.write('TEST: deleting the hamr::buffer ... \n')
31 | buf = None
32 | stderr.write('TEST: deleting the hamr::buffer ... OK!\n\n')
33 | 
34 | stderr.write('TEST: Numpy reads the data ... \n')
35 | stderr.write('arr = %s\n'%(str(arr)))
36 | stderr.write('TEST: Numpy reads the data ... OK!\n\n')
37 | 
38 | stderr.write('TEST: Numpy modifies the data ... \n')
39 | arr *= 10000
40 | stderr.write('arr = %s\n'%(str(arr)))
41 | stderr.write('TEST: Numpy modifies the data ... OK!\n\n')
42 | 
43 | stderr.write('TEST: Verify the result ... \n')
44 | if not np.allclose(arr, res_val):
45 |     stderr.write('ERROR: TEST failed!\n')
46 |     sys.exit(-1)
47 | stderr.write('TEST: Verify the result ... OK\n\n')
48 | 
49 | stderr.write('TEST: deleting the Numpy array ... \n')
50 | arr = None
51 | stderr.write('TEST: deleting the Numpy array ... OK!\n\n')
52 | 
53 | sys.exit(0)
54 | 


--------------------------------------------------------------------------------
/hamr_buffer_handle.i:
--------------------------------------------------------------------------------
 1 | %{
 2 | #include "hamr_config.h"
 3 | #include "hamr_buffer_handle.h"
 4 | #include "hamr_gil_state.h"
 5 | %}
 6 | 
 7 | /***************************************************************************
 8 |  * buffer handle
 9 |  **************************************************************************/
10 | %ignore hamr::buffer_handle::buffer_handle(buffer_handle &&);
11 | %ignore hamr::buffer_handle::operator=;
12 | 
13 | %include "hamr_buffer_handle.h"
14 | 
15 | %extend hamr::buffer_handle
16 | {
17 |     PyObject *__str__()
18 |     {
19 |         hamr::gil_state gil;
20 |         std::ostringstream oss;
21 |         self->to_stream(oss);
22 |         return PyUnicode_FromString(oss.str().c_str());
23 |     }
24 | 
25 | %pythoncode
26 | {
27 |     @property
28 |     def __array_interface__(self):
29 |         return self.get_numpy_array_interface()
30 | 
31 |     @property
32 |     def __cuda_array_interface__(self):
33 |         return self.get_cuda_array_interface()
34 | }
35 | }
36 | 
37 | /* named buffer_handles */
38 | %template(buffer_handle_float) hamr::buffer_handle<float>;
39 | %template(buffer_handle_double) hamr::buffer_handle<double>;
40 | %template(buffer_handle_char) hamr::buffer_handle<char>;
41 | %template(buffer_handle_short) hamr::buffer_handle<short>;
42 | %template(buffer_handle_int) hamr::buffer_handle<int>;
43 | %template(buffer_handle_long) hamr::buffer_handle<long>;
44 | %template(buffer_handle_long_long) hamr::buffer_handle<long long>;
45 | %template(buffer_handle_unsigned_char) hamr::buffer_handle<unsigned char>;
46 | %template(buffer_handle_unsigned_short) hamr::buffer_handle<unsigned short>;
47 | %template(buffer_handle_unsigned_int) hamr::buffer_handle<unsigned int>;
48 | %template(buffer_handle_unsigned_long) hamr::buffer_handle<unsigned long>;
49 | %template(buffer_handle_unsigned_long_long) hamr::buffer_handle<unsigned long long>;
50 | 


--------------------------------------------------------------------------------
/hamr_hip_print_impl.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_hip_print_impl_h
 2 | #define hamr_hip_print_impl_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include "hamr_env.h"
 6 | 
 7 | #if defined(HAMR_ENABLE_HIP)
 8 | #include "hamr_hip_kernels.h"
 9 | #include "hamr_hip_launch.h"
10 | #include <hip/hip_runtime.h>
11 | #endif
12 | 
13 | #include <iostream>
14 | 
15 | /// heterogeneous accelerator memory resource
16 | namespace hamr
17 | {
18 | 
19 | /** prints an array on the GPU
20 |  * @param[in] vals an array of n elements accessible in HIP
21 |  * @param[in] n_elem the length of the array
22 |  * @returns 0 if there were no errors
23 |  */
24 | template <typename T>
25 | int hip_print(T *vals, size_t n_elem)
26 | {
27 | #if !defined(HAMR_ENABLE_HIP)
28 |     (void) vals;
29 |     (void) n_elem;
30 |     std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
31 |         " print_hip failed because HIP is not enabled." << std::endl;
32 |     return -1;
33 | #else
34 | 
35 |     // get launch parameters
36 |     int device_id = -1;
37 |     dim3 block_grid;
38 |     int n_blocks = 0;
39 |     dim3 thread_grid = 0;
40 |     if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
41 |         n_blocks, thread_grid))
42 |     {
43 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
44 |             " Failed to determine launch properties." << std::endl;
45 |         return -1;
46 |     }
47 | 
48 |     // invoke the print kernel
49 |     hipError_t ierr = hipSuccess;
50 |     hamr::hip_kernels::print<<<block_grid, thread_grid>>>(vals, n_elem);
51 |     if ((ierr = hipGetLastError()) != hipSuccess)
52 |     {
53 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
54 |             " Failed to launch the print kernel. "
55 |             << hipGetErrorString(ierr) << std::endl;
56 |         return -1;
57 |     }
58 | 
59 |     return 0;
60 | #endif
61 | }
62 | 
63 | }
64 | #endif
65 | 


--------------------------------------------------------------------------------
/hamr_copier_traits.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_copier_traits_h
 2 | #define hamr_copier_traits_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include <type_traits>
 6 | 
 7 | namespace hamr
 8 | {
 9 | /// @name type trait that enables object copy
10 | ///@{
11 | template <typename T, typename U, bool val = (!std::is_arithmetic<T>::value || !std::is_arithmetic<U>::value)> struct use_object_copier : std::false_type {};
12 | template <typename T, typename U> struct use_object_copier<T, U, true> : std::true_type {};
13 | template <typename T, typename U> using use_object_copier_t = typename std::enable_if<use_object_copier<T,U>::value>::type;
14 | ///@}
15 | 
16 | 
17 | /// @name type trait that enables POD copy from different types
18 | ///@{
19 | #if defined(HAMR_ENABLE_OBJECTS)
20 | template <typename T, typename U, bool val = (!std::is_same<T,U>::value)> struct use_cons_copier : std::false_type {};
21 | template <typename T, typename U> struct use_cons_copier<T, U, true> : std::true_type {};
22 | template <typename T, typename U> using use_cons_copier_t = typename std::enable_if<use_cons_copier<T,U>::value>::type;
23 | #else
24 | template <typename T, typename U, bool val = (!std::is_same<T,U>::value && std::is_arithmetic<T>::value)> struct use_cons_copier : std::false_type {};
25 | template <typename T, typename U> struct use_cons_copier<T, U, true> : std::true_type {};
26 | template <typename T, typename U> using use_cons_copier_t = typename std::enable_if<use_cons_copier<T,U>::value>::type;
27 | #endif
28 | ///@}
29 | 
30 | /// @name type trait that enables POD copy from the same types
31 | ///@{
32 | template <typename T, typename U, bool obj = (std::is_same<T,U>::value && std::is_arithmetic<T>::value)> struct use_bytes_copier : std::false_type {};
33 | template <typename T, typename U> struct use_bytes_copier<T, U, true> : std::true_type {};
34 | template <typename T, typename U> using use_bytes_copier_t = typename std::enable_if<use_bytes_copier<T,U>::value>::type;
35 | ///@}
36 | 
37 | }
38 | 
39 | #endif
40 | 


--------------------------------------------------------------------------------
/hamr_cuda_print_impl.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_cuda_print_impl_h
 2 | #define hamr_cuda_print_impl_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include "hamr_env.h"
 6 | #include "hamr_stream.h"
 7 | #if defined(HAMR_ENABLE_CUDA)
 8 | #include "hamr_cuda_kernels.h"
 9 | #include "hamr_cuda_launch.h"
10 | #include <cuda.h>
11 | #include <cuda_runtime.h>
12 | #endif
13 | 
14 | #include <iostream>
15 | 
16 | /// heterogeneous accelerator memory resource
17 | namespace hamr
18 | {
19 | 
20 | /** prints an array on the GPU
21 |  * @param[in] vals an array of n elements accessible in CUDA
22 |  * @param[in] n_elem the length of the array
23 |  * @returns 0 if there were no errors
24 |  */
25 | template <typename T>
26 | int cuda_print(const hamr::stream &strm, T *vals, size_t n_elem)
27 | {
28 | #if !defined(HAMR_ENABLE_CUDA)
29 |     (void) vals;
30 |     (void) n_elem;
31 |     std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
32 |         " print_cuda failed because CUDA is not enabled." << std::endl;
33 |     return -1;
34 | #else
35 | 
36 |     // get launch parameters
37 |     int device_id = -1;
38 |     dim3 block_grid;
39 |     int n_blocks = 0;
40 |     dim3 thread_grid = 0;
41 |     if (hamr::partition_thread_blocks(device_id, n_elem, 8, block_grid,
42 |         n_blocks, thread_grid))
43 |     {
44 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
45 |             " Failed to determine launch properties." << std::endl;
46 |         return -1;
47 |     }
48 | 
49 |     // invoke the print kernel
50 |     cudaError_t ierr = cudaSuccess;
51 |     hamr::cuda_kernels::print<<<block_grid, thread_grid, 0, strm>>>(vals, n_elem);
52 |     if ((ierr = cudaGetLastError()) != cudaSuccess)
53 |     {
54 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
55 |             " Failed to launch the print kernel. "
56 |             << cudaGetErrorString(ierr) << std::endl;
57 |         return -1;
58 |     }
59 | 
60 |     return 0;
61 | #endif
62 | }
63 | 
64 | }
65 | #endif
66 | 


--------------------------------------------------------------------------------
/.github/workflows/build_and_test_hip.yml:
--------------------------------------------------------------------------------
 1 | name: HIP-HAMR
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 | 
 9 |   workflow_dispatch:
10 | 
11 | jobs:
12 |   build_and_test:
13 |     runs-on: ubuntu-20.04
14 | 
15 |     steps:
16 |       - uses: actions/checkout@v2
17 | 
18 |       - name: install_deps
19 |         run: |
20 |           export DEBIAN_FRONTEND="noninteractive"
21 |           export TZ="America/Los_Angeles"
22 |           sudo apt-get update -qq
23 |           sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python-dev
24 |           wget https://repo.radeon.com/amdgpu-install/21.50.2/ubuntu/focal/amdgpu-install_21.50.2.50002-1_all.deb
25 |           sudo apt-get install -qq -y ./amdgpu-install_21.50.2.50002-1_all.deb
26 |           sudo amdgpu-install --usecase=rocm,openclsdk,hiplibsdk --no-dkms
27 |           python3 -mvenv py3k_testing
28 |           source py3k_testing/bin/activate
29 |           python3 -mpip install numpy
30 | 
31 |       # build for HIP
32 |       - name: build_hip
33 |         run: |
34 |           source py3k_testing/bin/activate
35 |           mkdir build_hip
36 |           cd build_hip
37 |           cmake -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_HIP=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-hip ..
38 |           make -j2 install
39 |           cd ..
40 | 
41 |       # build for HIP
42 |       - name: build_hip_separate_impl
43 |         run: |
44 |           source py3k_testing/bin/activate
45 |           mkdir build_hip_sep
46 |           cd build_hip_sep
47 |           cmake -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_HIP=ON -DHAMR_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-hip-sep ..
48 |           make -j2 install
49 |           cd ..
50 | 
51 | 


--------------------------------------------------------------------------------
/.github/workflows/build_and_test_amd_openmp.yml:
--------------------------------------------------------------------------------
 1 | name: AMD-OpenMP
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 | 
 9 |   workflow_dispatch:
10 | 
11 | jobs:
12 |   build_and_test:
13 |     runs-on: ubuntu-22.04
14 | 
15 |     steps:
16 |       - uses: actions/checkout@v2
17 | 
18 |       - name: install_deps
19 |         run: |
20 |           export DEBIAN_FRONTEND="noninteractive"
21 |           export TZ="America/Los_Angeles"
22 |           sudo apt-get update -qq
23 |           sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python3
24 |           wget https://repo.radeon.com/amdgpu-install/5.4.3/ubuntu/jammy/amdgpu-install_5.4.50403-1_all.deb
25 |           sudo apt-get install ./amdgpu-install_5.4.50403-1_all.deb
26 |           sudo amdgpu-install --usecase=rocm,openclsdk,hiplibsdk --no-dkms
27 |           python3 -mvenv py3k_testing
28 |           source py3k_testing/bin/activate
29 |           python3 -mpip install numpy
30 | 
31 |       # build for OpenMP
32 |       - name: build_openmp
33 |         run: |
34 |           source py3k_testing/bin/activate
35 |           mkdir build_amd_openmp
36 |           cd build_amd_openmp
37 |           cmake -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DHAMR_OPENMP_ARCH=gfx1030 -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_OPENMP=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-amd-omp ..
38 |           make -j2 install
39 |           cd ..
40 | 
41 |       # build for OpenMP
42 |       - name: build_openmp_separate_impl
43 |         run: |
44 |           source py3k_testing/bin/activate
45 |           mkdir build_amd_openmp_sep
46 |           cd build_amd_openmp_sep
47 |           cmake -DCMAKE_CXX_COMPILER=/opt/rocm/llvm/bin/clang++ -DHAMR_OPENMP_ARCH=gfx1030 -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_OPENMP=ON -DHAMR_EMABLE_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-amd-omp-sep ..
48 |           make -j2 install
49 |           cd ..
50 | 
51 | 


--------------------------------------------------------------------------------
/hamr_stream.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_stream.h"
 2 | 
 3 | #include <iostream>
 4 | 
 5 | namespace hamr
 6 | {
 7 | 
 8 | // --------------------------------------------------------------------------
 9 | int stream::synchronize() const
10 | {
11 | #if defined(HAMR_ENABLE_CUDA)
12 |     if (const cudaStream_t *cs = std::get_if<1>(&m_stream))
13 |     {
14 |         cudaStreamSynchronize(*cs);
15 |     }
16 | #endif
17 | #if defined(HAMR_ENABLE_HIP)
18 |     if (const hipStream_t *hs = std::get_if<2>(&m_stream))
19 |     {
20 |         hipStreamSynchronize(*hs);
21 |     }
22 | #endif
23 |     return 0;
24 | }
25 | 
26 | // --------------------------------------------------------------------------
27 | stream::operator bool() const
28 | {
29 |     if (std::get_if<1>(&m_stream))
30 |     {
31 |         return true;
32 |     }
33 |     else if (std::get_if<2>(&m_stream))
34 |     {
35 |         return true;
36 |     }
37 |     return false;
38 | }
39 | 
40 | // --------------------------------------------------------------------------
41 | size_t stream::get_stream()
42 | {
43 | #if defined(HAMR_ENABLE_CUDA)
44 |     if (const cudaStream_t *cs = std::get_if<1>(&m_stream))
45 |     {
46 |         return (size_t)*cs;
47 |     }
48 | #endif
49 | #if defined(HAMR_ENABLE_HIP)
50 |     if (const hipStream_t *hs = std::get_if<2>(&m_stream))
51 |     {
52 |         return (size_t)*hs;
53 |     }
54 | #endif
55 |     return 2;
56 | }
57 | 
58 | // --------------------------------------------------------------------------
59 | void stream::print() const
60 | {
61 | #if defined(HAMR_ENABLE_CUDA)
62 |     if (const cudaStream_t *cs = std::get_if<1>(&m_stream))
63 |     {
64 |         std::cerr << "cudaStream_t m_stream = " << *cs << std::endl;
65 |         return;
66 |     }
67 | #endif
68 | #if defined(HAMR_ENABLE_HIP)
69 |     if (const hipStream_t *hs = std::get_if<2>(&m_stream))
70 |     {
71 |         std::cerr << "hipStream_t m_stream = " << *hs << std::endl;
72 |         return;
73 |     }
74 | #endif
75 |     std::cerr << "empty" << std::endl;
76 | }
77 | 
78 | }
79 | 


--------------------------------------------------------------------------------
/hamr_cuda_device.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_cuda_device_h
 2 | #define hamr_cuda_device_h
 3 | 
 4 | #include "hamr_config.h"
 5 | 
 6 | ///@file
 7 | 
 8 | namespace hamr
 9 | {
10 | /// gets the device identifier for the first GPU. @returns zero if successful.
11 | inline int HAMR_EXPORT get_cuda_device_identifier(int &dev_id) { dev_id = 0; return 0; }
12 | 
13 | /// gets the device identifier for the host. @returns zero if successful.
14 | inline int HAMR_EXPORT get_cuda_host_identifier(int &dev_id) { dev_id = -1; return 0; }
15 | 
16 | /// gets the currently atcive CUDA device. @returns zero if successful.
17 | int HAMR_EXPORT get_active_cuda_device(int &dev_id);
18 | 
19 | /// sets the active CUDA device. returns zero if successful.
20 | int HAMR_EXPORT set_active_cuda_device(int dev_id);
21 | 
22 | /// gets the device that owns the given pointer. @returns zero if successful.
23 | int HAMR_EXPORT get_cuda_device(const void *ptr, int &device_id);
24 | 
25 | /** Activate the specified CUDA device, and restore the previously active
26 |  * device when the object is destroyed.
27 |  */
28 | class HAMR_EXPORT activate_cuda_device
29 | {
30 | public:
31 |     activate_cuda_device() = delete;
32 |     activate_cuda_device(const activate_cuda_device &) = delete;
33 |     void operator=(const activate_cuda_device &) = delete;
34 | 
35 |     activate_cuda_device(int id);
36 |     ~activate_cuda_device();
37 | 
38 | private:
39 |     int m_device;
40 | };
41 | 
42 | 
43 | /** Activate peer to peer memory access between two devices, and deactivate when
44 |  * the object goes out of scope.
45 |  */
46 | class access_cuda_peer
47 | {
48 | public:
49 |     access_cuda_peer() : m_dest_device(-1), m_src_device(-1), m_symetric(false) {}
50 |     ~access_cuda_peer() { disable(); }
51 | 
52 |     /// enable peer to peer access. the dest device must active.
53 |     int enable(int dest_device, int src_device, bool symetric);
54 | 
55 |     /// disable peer to peer access.
56 |     int disable();
57 | 
58 | private:
59 |     int m_dest_device;
60 |     int m_src_device;
61 |     int m_symetric;
62 | };
63 | 
64 | 
65 | }
66 | #endif
67 | 


--------------------------------------------------------------------------------
/hamr_new_allocator.h:
--------------------------------------------------------------------------------
 1 | #ifndef hamr_new_allocator_h
 2 | #define hamr_new_allocator_h
 3 | 
 4 | #include "hamr_config.h"
 5 | #include <type_traits>
 6 | #include <memory>
 7 | 
 8 | namespace hamr
 9 | {
10 | 
11 | /// a deleter for arrays allocated with new
12 | template <typename T>
13 | class HAMR_EXPORT new_deleter
14 | {
15 | public:
16 |     /** constructs the deleter
17 |      * @param[in] ptr the pointer to the array to delete
18 |      * @param[in] n the number of elements in the array
19 |      */
20 |     new_deleter(T *ptr, size_t n);
21 | 
22 |     /** deletes the array
23 |      * @param[in] ptr the pointer to the array to delete. must be the same as
24 |      *                that passed during construction.
25 |      */
26 |     void operator()(T *ptr);
27 | 
28 | private:
29 |     T *m_ptr;
30 |     size_t m_elem;
31 | };
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 
38 | /// a class for allocating arrays with new
39 | template <typename T>
40 | struct HAMR_EXPORT new_allocator
41 | {
42 |     /** allocate an array of n elements.
43 |      * @param[in] n the number of elements to allocate
44 |      * @returns a shared pointer to the array that holds a deleter for the memory
45 |      */
46 |     static std::shared_ptr<T> allocate(size_t n) HAMR_EXPORT;
47 | 
48 |     /** allocate an array of n elements.
49 |      * @param[in] n the number of elements to allocate
50 |      * @param[in] val a value to initialize the elements to
51 |      * @returns a shared pointer to the array that holds a deleter for the memory
52 |      */
53 |     static std::shared_ptr<T> allocate(size_t n, const T &val) HAMR_EXPORT;
54 | 
55 |     /** allocate an array of n elements.
56 |      * @param[in] n the number of elements to allocate
57 |      * @param[in] vals an array of n values to initialize the elements with
58 |      * @returns a shared pointer to the array that holds a deleter for the memory
59 |      */
60 |     template <typename U>
61 |     static std::shared_ptr<T> allocate(size_t n, const U *vals) HAMR_EXPORT;
62 | };
63 | 
64 | }
65 | 
66 | #if !defined(HAMR_SEPARATE_IMPL)
67 | #include "hamr_new_allocator_impl.h"
68 | #endif
69 | 
70 | #endif
71 | 


--------------------------------------------------------------------------------
/.github/workflows/build_and_test_host.yml:
--------------------------------------------------------------------------------
 1 | name: host-HAMR
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ master ]
 6 |   pull_request:
 7 |     branches: [ master ]
 8 | 
 9 |   workflow_dispatch:
10 | 
11 | jobs:
12 |   build_and_test:
13 |     runs-on: ubuntu-20.04
14 | 
15 |     steps:
16 |       - uses: actions/checkout@v2
17 | 
18 |       - name: install_deps
19 |         run: |
20 |           export DEBIAN_FRONTEND="noninteractive"
21 |           export TZ="America/Los_Angeles"
22 |           sudo apt-get update -qq
23 |           sudo apt-get install -qq -y git-core gcc g++ cmake automake m4 wget swig python-dev
24 |           python3 -mvenv py3k_testing
25 |           source py3k_testing/bin/activate
26 |           python3 -mpip install numpy
27 | 
28 |       # build for host only
29 |       - name: build_host
30 |         run: |
31 |           source py3k_testing/bin/activate
32 |           mkdir build_host
33 |           cd build_host
34 |           cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-host ..
35 |           make -j2 install
36 |           cd ..
37 | 
38 |       # test the host build
39 |       - name: test_host
40 |         run: |
41 |           source py3k_testing/bin/activate
42 |           cd build_host
43 |           tmp_llp=$LD_LIBRARY_PATH
44 |           tmp_pp=$PYTHONPATH
45 |           source ./bin/hamr_python_env.sh
46 |           ctest --output-on-failure
47 |           export LD_LIBRARY_PATH=$tmp_llp PYTHONPATH=$tmp_pp
48 |           cd ..
49 | 
50 |       # build for host only
51 |       - name: build_host_separate_impl
52 |         run: |
53 |           source py3k_testing/bin/activate
54 |           mkdir build_host_sep
55 |           cd build_host_sep
56 |           cmake -DCMAKE_BUILD_TYPE=Debug -DBUILD_TESTING=ON -DHAMR_ENABLE_PYTHON=ON -DHAMR_SEPARATE_IMPL=ON -DCMAKE_INSTALL_PREFIX=`pwd`/../../hamr-install-host-sep ..
57 |           make -j2 install
58 |           cd ..
59 | 
60 |       # test the host build
61 |       - name: test_host_separate_impl
62 |         run: |
63 |           source py3k_testing/bin/activate
64 |           cd build_host_sep
65 |           tmp_llp=$LD_LIBRARY_PATH
66 |           tmp_pp=$PYTHONPATH
67 |           source ./bin/hamr_python_env.sh
68 |           ctest --output-on-failure
69 |           export LD_LIBRARY_PATH=$tmp_llp PYTHONPATH=$tmp_pp
70 |           cd ..
71 | 


--------------------------------------------------------------------------------
/python/hamr_py.i:
--------------------------------------------------------------------------------
 1 | %define DOCSTR
 2 | "HAMR - Heterogeneous Accelerator Memory Resource. A library for autmated
 3 | memory management on systems with heterogeneous accellerators."
 4 | %enddef
 5 | %module(docstring=DOCSTR) hamr
 6 | %feature("autodoc", "3");
 7 | %{
 8 | #define SWIG_FILE_WITH_INIT
 9 | 
10 | #include <Python.h>
11 | 
12 | #include "hamr_config.h"
13 | #include "hamr_buffer.h"
14 | #include "hamr_buffer_allocator.h"
15 | #include "hamr_buffer_handle.h"
16 | #include "hamr_python_deleter.h"
17 | #include "hamr_stream.h"
18 | 
19 | #include <iostream>
20 | #include <sstream>
21 | 
22 | /* disable some warnings that are present in SWIG generated code. */
23 | #if __GNUC__ > 8
24 | #pragma GCC diagnostic ignored "-Wcast-function-type"
25 | #endif
26 | #pragma GCC diagnostic ignored "-Wunused-parameter"
27 | #pragma GCC diagnostic ignored "-Wmissing-field-initializers"
28 | #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
29 | #if defined(__CUDACC__)
30 | #pragma nv_diag_suppress = set_but_not_used
31 | #endif
32 | %}
33 | 
34 | /* SWIG doens't understand compiler attriibbutes */
35 | #define __attribute__(x)
36 | 
37 | /* enable STL classes */
38 | %include "shared_ptr.i"
39 | 
40 | /***************************************************************************
41 |  * expose the build configuration
42 |  **************************************************************************/
43 | %include "hamr_config.h"
44 | 
45 | /***************************************************************************
46 |  * buffer allocator enumerations
47 |  **************************************************************************/
48 | %include "hamr_buffer_allocator.i"
49 | 
50 | /***************************************************************************
51 |  * buffer transfer mode enumerations
52 |  **************************************************************************/
53 | %include "hamr_buffer_transfer.i"
54 | 
55 | /***************************************************************************
56 |  * stream
57 |  **************************************************************************/
58 | %include "hamr_stream.i"
59 | 
60 | /***************************************************************************
61 |  * buffer_handle
62 |  **************************************************************************/
63 | %include "hamr_buffer_handle.i"
64 | 
65 | /***************************************************************************
66 |  * buffer
67 |  **************************************************************************/
68 | %include "hamr_buffer.i"
69 | 


--------------------------------------------------------------------------------
/test/test_hamr_multi_gpu_cuda.cpp:
--------------------------------------------------------------------------------
 1 | #include "hamr_buffer.h"
 2 | #include "hamr_buffer_util.h"
 3 | 
 4 | #include <cuda.h>
 5 | #include <cuda_runtime.h>
 6 | 
 7 | #include <iostream>
 8 | 
 9 | template <typename T>
10 | void print(const hamr::buffer<T> &buf)
11 | {
12 |     auto [spbuf, pbuf] = hamr::get_host_accessible(buf);
13 |     std::cerr << pbuf[0];
14 |     for (size_t i = 1; i < buf.size(); ++i)
15 |         std::cerr << ", "<< pbuf[i];
16 |     std::cerr << std::endl;
17 | }
18 | 
19 | 
20 | int main(int argc, char **argv)
21 | {
22 |     (void) argc;
23 |     (void) argv;
24 | 
25 |     // get the number of GPUs
26 |     int n_dev = 0;
27 |     if (cudaGetDeviceCount(&n_dev) != cudaSuccess)
28 |     {
29 |         std::cerr << "ERROR: failed to get number of devices" << std::endl;
30 |         return -1;
31 |     }
32 | 
33 |     if (n_dev < 2)
34 |     {
35 |         std::cerr << "Can't run the test with " << n_dev << " CUDA devices" << std::endl;
36 |         return 0;
37 |     }
38 | 
39 |     // allocate some data on the host
40 |     size_t n_elem = 1000;
41 | 
42 |     using T = int;
43 |     T val = 31415;
44 | 
45 |     hamr::buffer<T> *src = new hamr::buffer<T>(hamr::buffer_allocator::malloc, n_elem, val);
46 | 
47 |     if (n_elem < 33)
48 |         print(*src);
49 | 
50 |     // move to each GPU
51 |     for (int i = 0; i < n_dev; ++i)
52 |     {
53 |         std::cerr << " ==== move to device " << i << " ==== " << std::endl;
54 | 
55 |         // move to GPU i
56 |         if (cudaSetDevice(i) != cudaSuccess)
57 |         {
58 |             std::cerr << "ERROR: failed to set the active device to " << i << std::endl;
59 |             return -1;
60 |         }
61 | 
62 |         hamr::buffer<T> *dest = new hamr::buffer<T>(hamr::buffer_allocator::cuda, *src);
63 | 
64 |         if (n_elem < 33)
65 |             print(*dest);
66 | 
67 |         // update the source
68 |         delete src;
69 |         src = dest;
70 |     }
71 | 
72 |     // move back to the host
73 |     std::cerr << " ==== move to host ==== " << std::endl;
74 | 
75 |     hamr::buffer<T> end(hamr::buffer_allocator::malloc, *src);
76 | 
77 |     if (n_elem < 33)
78 |        print(end);
79 | 
80 |     // check for 31415
81 |     std::cerr << " ==== validate ==== " << std::endl;
82 | 
83 |     auto [spsrc, psrc] = hamr::get_host_accessible(*src);
84 | 
85 |     for (size_t i = 0; i < n_elem; ++i)
86 |     {
87 |         if (psrc[i] != val)
88 |         {
89 |             std::cerr << "ERROR: psrc[ " << i << "] == " << psrc[i]
90 |                 << " != " << val << std::endl;
91 |             return -1;
92 |         }
93 |     }
94 | 
95 |     std::cerr << "All values verified to be " << val << std::endl;
96 | 
97 |     return 0;
98 | }
99 | 


--------------------------------------------------------------------------------
/test/test_hamr_multi_gpu_hip.cpp:
--------------------------------------------------------------------------------
  1 | #include "hamr_buffer.h"
  2 | #include "hamr_buffer_util.h"
  3 | 
  4 | #include <hip/hip_runtime.h>
  5 | 
  6 | 
  7 | #include <iostream>
  8 | 
  9 | template <typename T>
 10 | void print(const hamr::buffer<T> &buf)
 11 | {
 12 |     auto [spbuf, pbuf] = hamr::get_host_accessible(buf);
 13 | 
 14 |     std::cerr << pbuf[0];
 15 |     for (size_t i = 1; i < buf.size(); ++i)
 16 |         std::cerr << ", "<< pbuf[i];
 17 |     std::cerr << std::endl;
 18 | }
 19 | 
 20 | 
 21 | int main(int argc, char **argv)
 22 | {
 23 |     (void) argc;
 24 |     (void) argv;
 25 | 
 26 |     // get the number of GPUs
 27 |     int n_dev = 0;
 28 |     if (hipGetDeviceCount(&n_dev) != hipSuccess)
 29 |     {
 30 |         std::cerr << "ERROR: failed to get number of devices" << std::endl;
 31 |         return -1;
 32 |     }
 33 | 
 34 |     if (n_dev < 2)
 35 |     {
 36 |         std::cerr << "Can't run the test with " << n_dev << " HIP devices" << std::endl;
 37 |         return 0;
 38 |     }
 39 | 
 40 |     // allocate some data on the host
 41 |     size_t n_elem = 1000;
 42 | 
 43 |     using T = int;
 44 |     T val = 31415;
 45 | 
 46 |     hamr::buffer<T> *src = new hamr::buffer<T>(hamr::buffer_allocator::malloc, n_elem, val);
 47 | 
 48 |     if (n_elem < 33)
 49 |         print(*src);
 50 | 
 51 |     // move to each GPU
 52 |     for (int i = 0; i < n_dev; ++i)
 53 |     {
 54 |         std::cerr << " ==== move to device " << i << " ==== " << std::endl;
 55 | 
 56 |         // move to GPU i
 57 |         if (hipSetDevice(i) != hipSuccess)
 58 |         {
 59 |             std::cerr << "ERROR: failed to set the active device to " << i << std::endl;
 60 |             return -1;
 61 |         }
 62 | 
 63 |         hamr::buffer<T> *dest = new hamr::buffer<T>(hamr::buffer_allocator::hip, *src);
 64 | 
 65 |         if (n_elem < 33)
 66 |             print(*dest);
 67 | 
 68 |         // update the source
 69 |         delete src;
 70 |         src = dest;
 71 |     }
 72 | 
 73 |     // move back to the host
 74 |     std::cerr << " ==== move to host ==== " << std::endl;
 75 | 
 76 |     hamr::buffer<T> end(hamr::buffer_allocator::malloc, *src);
 77 | 
 78 |     if (n_elem < 33)
 79 |        print(end);
 80 | 
 81 |     // check for 31415
 82 |     std::cerr << " ==== validate ==== " << std::endl;
 83 | 
 84 |     auto [spsrc, psrc] = hamr::get_host_accessible(*src);
 85 | 
 86 |     for (size_t i = 0; i < n_elem; ++i)
 87 |     {
 88 |         if (psrc[i] != val)
 89 |         {
 90 |             std::cerr << "ERROR: psrc[ " << i << "] == " << psrc[i]
 91 |                 << " != " << val << std::endl;
 92 |             return -1;
 93 |         }
 94 |     }
 95 | 
 96 |     std::cerr << "All values verified to be " << val << std::endl;
 97 | 
 98 |     return 0;
 99 | }
100 | 


--------------------------------------------------------------------------------
/hamr_hip_device.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_hip_device.h"
 2 | 
 3 | #include <iostream>
 4 | 
 5 | #include <hip/hip_runtime.h>
 6 | 
 7 | 
 8 | namespace hamr
 9 | {
10 | 
11 | // **************************************************************************
12 | int get_hip_device(const void *ptr, int &device_id)
13 | {
14 |     hipError_t ierr = hipSuccess;
15 |     hipPointerAttribute_t ptrAtts;
16 |     ierr = hipPointerGetAttributes(&ptrAtts, ptr);
17 | 
18 |     // TODO -- HIP doesn;t yet have this feature of CUDA
19 |     // these types of pointers are NOT accessible on the GPU
20 |     // hipErrorInValue occurs when the pointer is unknown to HIP, as is
21 |     // the case with pointers allocated by malloc or new.
22 |     /*if ((ierr == hipErrorInvalidValue) ||
23 |         ((ierr == hipSuccess) && ((ptrAtts.type == hipMemoryTypeHost) ||
24 |         (ptrAtts.type == hipMemoryTypeUnregistered))))
25 |     {
26 |         // this is host backed memory not associate with a GPU
27 |         device_id = -1;
28 |     }
29 |     else*/ if (ierr != hipSuccess)
30 |     {
31 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
32 |             " Failed to get pointer attributes for " << ptr << std::endl;
33 |         return -1;
34 |     }
35 |     else
36 |     {
37 |         device_id = ptrAtts.device;
38 |     }
39 | 
40 |     return 0;
41 | }
42 | 
43 | // **************************************************************************
44 | int get_active_hip_device(int &dev_id)
45 | {
46 |     hipError_t ierr = hipSuccess;
47 |     if ((ierr = hipGetDevice(&dev_id)) != hipSuccess)
48 |     {
49 |         std::cerr << "Failed to get the active HIP device. "
50 |             << hipGetErrorString(ierr) << std::endl;
51 |         return -1;
52 |     }
53 | 
54 |     return 0;
55 | }
56 | 
57 | // **************************************************************************
58 | int set_active_hip_device(int dev_id)
59 | {
60 |     hipError_t ierr = hipSuccess;
61 |     if ((ierr = hipSetDevice(dev_id)) != hipSuccess)
62 |     {
63 |         std::cerr << "Failed to set the active HIP device. "
64 |             << hipGetErrorString(ierr) << std::endl;
65 |         return -1;
66 |     }
67 | 
68 |     return 0;
69 | }
70 | 
71 | 
72 | // --------------------------------------------------------------------------
73 | activate_hip_device::activate_hip_device(int new_dev) : m_device(-1)
74 | {
75 |     int cur_dev = -1;
76 |     if (!get_active_hip_device(cur_dev) && (cur_dev != new_dev) &&
77 |         !set_active_hip_device(new_dev))
78 |     {
79 |         m_device = cur_dev;
80 |     }
81 | }
82 | 
83 | // --------------------------------------------------------------------------
84 | activate_hip_device::~activate_hip_device()
85 | {
86 |     if (m_device >= 0)
87 |     {
88 |         set_active_hip_device(m_device);
89 |     }
90 | }
91 | 
92 | }
93 | 


--------------------------------------------------------------------------------
/hamr_new_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_new_allocator.h"
 4 | #include "hamr_new_allocator_impl.h"
 5 | 
 6 | template class hamr::new_deleter<float>;
 7 | template class hamr::new_deleter<double>;
 8 | template class hamr::new_deleter<char>;
 9 | template class hamr::new_deleter<signed char>;
10 | template class hamr::new_deleter<short>;
11 | template class hamr::new_deleter<int>;
12 | template class hamr::new_deleter<long>;
13 | template class hamr::new_deleter<long long>;
14 | template class hamr::new_deleter<unsigned char>;
15 | template class hamr::new_deleter<unsigned short>;
16 | template class hamr::new_deleter<unsigned int>;
17 | template class hamr::new_deleter<unsigned long>;
18 | template class hamr::new_deleter<unsigned long long>;
19 | 
20 | #define hamr_new_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const float *vals); \
22 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const double *vals); \
23 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const char *vals); \
24 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const signed char *vals); \
25 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const short *vals); \
26 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const int *vals); \
27 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const long *vals); \
28 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const long long *vals); \
29 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned char *vals); \
30 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned short *vals); \
31 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned int *vals); \
32 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned long *vals); \
33 | template std::shared_ptr<_T> hamr::new_allocator<_T>::allocate(size_t n, const unsigned long long *vals);
34 | 
35 | 
36 | #define hamr_new_allocator_instantiate(_T) \
37 | template struct hamr::new_allocator<_T>; \
38 | hamr_new_allocator_instantiate_members(_T)
39 | 
40 | hamr_new_allocator_instantiate(float)
41 | hamr_new_allocator_instantiate(double)
42 | hamr_new_allocator_instantiate(char)
43 | hamr_new_allocator_instantiate(signed char)
44 | hamr_new_allocator_instantiate(short)
45 | hamr_new_allocator_instantiate(int)
46 | hamr_new_allocator_instantiate(long)
47 | hamr_new_allocator_instantiate(long long)
48 | hamr_new_allocator_instantiate(unsigned char)
49 | hamr_new_allocator_instantiate(unsigned short)
50 | hamr_new_allocator_instantiate(unsigned int)
51 | hamr_new_allocator_instantiate(unsigned long)
52 | hamr_new_allocator_instantiate(unsigned long long)
53 | 


--------------------------------------------------------------------------------
/doc/rtd/conf.py:
--------------------------------------------------------------------------------
 1 | import subprocess, os
 2 | 
 3 | # Configuration file for the Sphinx documentation builder.
 4 | #
 5 | # This file only contains a selection of the most common options. For a full
 6 | # list see the documentation:
 7 | # http://www.sphinx-doc.org/en/master/config
 8 | 
 9 | # -- Path setup --------------------------------------------------------------
10 | 
11 | # If extensions (or modules to document with autodoc) are in another directory,
12 | # add these directories to sys.path here. If the directory is relative to the
13 | # documentation root, use os.path.abspath to make it absolute, like shown here.
14 | #
15 | # import os
16 | # import sys
17 | # sys.path.insert(0, os.path.abspath('.'))
18 | 
19 | 
20 | # -- Project information -----------------------------------------------------
21 | 
22 | project = 'HAMR'
23 | copyright = "2022, Burlen Loring"
24 | author = "Burlen Loring"
25 | 
26 | # -- General configuration ---------------------------------------------------
27 | 
28 | try:
29 |     odir = os.environ['READTHEDOCS_OUTPUT']
30 | except:
31 |     os.environ['READTHEDOCS_OUTPUT'] = '_build'
32 |     odir = os.environ['READTHEDOCS_OUTPUT']
33 | 
34 | if not os.path.exists(odir + '/html'):
35 |     os.makedirs(odir + '/html')
36 | 
37 | subprocess.call('doxygen --version', shell=True)
38 | subprocess.call('doxygen', shell=True)
39 | 
40 | # Add any Sphinx extension module names here, as strings. They can be
41 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
42 | # ones.
43 | 
44 | # pip install sphinxcontrib-bibtex breathe
45 | extensions = ['sphinxcontrib.bibtex', 'breathe']
46 | 
47 | bibtex_bibfiles = ['bibliography.bib']
48 | 
49 | # Configuring Breathe
50 | breathe_projects = {
51 |     "HAMR": "_build/xml"
52 | }
53 | breathe_default_project = "HAMR"
54 | 
55 | # Add any paths that contain templates here, relative to this directory.
56 | templates_path = ['_templates']
57 | 
58 | # The master toctree document.
59 | master_doc = 'index'
60 | 
61 | # List of patterns, relative to source directory, that match files and
62 | # directories to ignore when looking for source files.
63 | # This pattern also affects html_static_path and html_extra_path.
64 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
65 | 
66 | 
67 | # -- Options for HTML output -------------------------------------------------
68 | 
69 | # The theme to use for HTML and HTML Help pages.  See the documentation for
70 | # a list of builtin themes.
71 | #
72 | html_theme = 'sphinx_rtd_theme'
73 | 
74 | # Add any paths that contain custom static files (such as style sheets) here,
75 | # relative to this directory. They are copied after the builtin static files,
76 | # so a file named "default.css" will overwrite the builtin "default.css".
77 | html_static_path = ['_static']
78 | 
79 | html_css_files = [
80 |     'theme_overrides.css'  # overrides for wide tables in RTD theme
81 |     ]
82 | 
83 | numfig = True
84 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ### HAMR
 2 | HAMR is a library defining an accelerator technology agnostic memory model that
 3 | bridges between accelerator technologies (CUDA, HIP, ROCm, OpenMP, Kokos, etc)
 4 | and traditional CPUs in heterogeneous computing environments.  HAMR is light
 5 | weight and implemented in modern C++. HAMR includes Python integration that
 6 | enables zero-copy data transfer between C++ and Python technogies such as Numba
 7 | and Cupy.
 8 | 
 9 | ### Citing
10 | If you've used HAMR in your application please cite us.
11 | 
12 | [![DOI](https://zenodo.org/badge/429528113.svg)](https://zenodo.org/badge/latestdoi/429528113)
13 | 
14 | ### Source Code
15 | The source code can be obtained at the [HAMR github repository](https://github.com/LBL-EESA/HAMR).
16 | 
17 | ### Documentation
18 | The [HAMR User's Guide](https://hamr.readthedocs.io/en/latest/) documents
19 | compiling and use of HAMR and contains simple examples.
20 | 
21 | The [HAMR Doxygen site](https://hamr.readthedocs.io/en/latest/doxygen/index.html) documents the APIs. Most users will
22 | want to start with the [hamr::buffer](https://hamr.readthedocs.io/en/latest/doxygen/classhamr_1_1buffer.html), a
23 | container that has capabilities similar to std::vector and can provide access
24 | to data in different accelerator execution environments.
25 | 
26 | ### Regression Testing and CI
27 | ![CPU-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_host.yml/badge.svg)
28 | ![CUDA-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_cuda.yml/badge.svg)
29 | ![HIP-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_hip.yml/badge.svg)
30 | ![AMD-OpenMP-HAMR build and test](https://github.com/LBL-EESA/hamr/actions/workflows/build_and_test_amd_openmp.yml/badge.svg)
31 | 
32 | ### License
33 | HAMR's [license](LICENSE) is a BSD license with an ADDED paragraph at the end that makes it easy for us to
34 | accept improvements. See [license](LICENSE) for more information.
35 | 
36 | ## Copyright Notice
37 | HAMR - Heterogeneous Accelerator Memory Resource (HAMR)
38 | Copyright (c) 2022, The Regents of the University of California, through
39 | Lawrence Berkeley National Laboratory (subject to receipt of any
40 | required approvals from the U.S. Dept. of Energy). All rights reserved.
41 | 
42 | If you have questions about your rights to use or distribute this software,
43 | please contact Berkeley Lab's Intellectual Property Office at
44 | IPO@lbl.gov.
45 | 
46 | NOTICE.  This Software was developed under funding from the U.S. Department
47 | of Energy and the U.S. Government consequently retains certain rights.  As
48 | such, the U.S. Government has been granted for itself and others acting on
49 | its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
50 | Software to reproduce, distribute copies to the public, prepare derivative
51 | works, and perform publicly and display publicly, and to permit others to do so.
52 | 


--------------------------------------------------------------------------------
/hamr_openmp_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_openmp_allocator.h"
 4 | #include "hamr_openmp_allocator_impl.h"
 5 | 
 6 | template class hamr::openmp_deleter<float>;
 7 | template class hamr::openmp_deleter<double>;
 8 | template class hamr::openmp_deleter<char>;
 9 | template class hamr::openmp_deleter<signed char>;
10 | template class hamr::openmp_deleter<short>;
11 | template class hamr::openmp_deleter<int>;
12 | template class hamr::openmp_deleter<long>;
13 | template class hamr::openmp_deleter<long long>;
14 | template class hamr::openmp_deleter<unsigned char>;
15 | template class hamr::openmp_deleter<unsigned short>;
16 | template class hamr::openmp_deleter<unsigned int>;
17 | template class hamr::openmp_deleter<unsigned long>;
18 | template class hamr::openmp_deleter<unsigned long long>;
19 | 
20 | #define hamr_openmp_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const float *vals); \
22 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const double *vals); \
23 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const char *vals); \
24 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const signed char *vals); \
25 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const short *vals); \
26 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const int *vals); \
27 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const long *vals); \
28 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const long long *vals); \
29 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned char *vals); \
30 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned short *vals); \
31 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned int *vals); \
32 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned long *vals); \
33 | template std::shared_ptr<_T> hamr::openmp_allocator<_T>::allocate(size_t n, const unsigned long long *vals);
34 | 
35 | #define hamr_openmp_allocator_instantiate(_T) \
36 | template struct hamr::openmp_allocator<_T>; \
37 | hamr_openmp_allocator_instantiate_members(_T)
38 | 
39 | hamr_openmp_allocator_instantiate(float)
40 | hamr_openmp_allocator_instantiate(double)
41 | hamr_openmp_allocator_instantiate(char)
42 | hamr_openmp_allocator_instantiate(signed char)
43 | hamr_openmp_allocator_instantiate(short)
44 | hamr_openmp_allocator_instantiate(int)
45 | hamr_openmp_allocator_instantiate(long)
46 | hamr_openmp_allocator_instantiate(long long)
47 | hamr_openmp_allocator_instantiate(unsigned char)
48 | hamr_openmp_allocator_instantiate(unsigned short)
49 | hamr_openmp_allocator_instantiate(unsigned int)
50 | hamr_openmp_allocator_instantiate(unsigned long)
51 | hamr_openmp_allocator_instantiate(unsigned long long)
52 | 


--------------------------------------------------------------------------------
/hamr_malloc_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_malloc_allocator.h"
 4 | #include "hamr_malloc_allocator_impl.h"
 5 | 
 6 | template class hamr::malloc_deleter<float>;
 7 | template class hamr::malloc_deleter<double>;
 8 | template class hamr::malloc_deleter<char>;
 9 | template class hamr::malloc_deleter<signed char>;
10 | template class hamr::malloc_deleter<short>;
11 | template class hamr::malloc_deleter<int>;
12 | template class hamr::malloc_deleter<long>;
13 | template class hamr::malloc_deleter<long long>;
14 | template class hamr::malloc_deleter<unsigned char>;
15 | template class hamr::malloc_deleter<unsigned short>;
16 | template class hamr::malloc_deleter<unsigned int>;
17 | template class hamr::malloc_deleter<unsigned long>;
18 | template class hamr::malloc_deleter<unsigned long long>;
19 | 
20 | #define hamr_malloc_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const float *vals); \
22 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const double *vals); \
23 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const char *vals); \
24 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const signed char *vals); \
25 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const short *vals); \
26 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const int *vals); \
27 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const long *vals); \
28 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const long long *vals); \
29 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const unsigned char *vals); \
30 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const unsigned short *vals); \
31 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const unsigned int *vals); \
32 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const unsigned long *vals); \
33 | template std::shared_ptr<_T> hamr::malloc_allocator<_T>::allocate(size_t n, const unsigned long long *vals);
34 | 
35 | 
36 | #define hamr_malloc_allocator_instantiate(_T) \
37 | template struct hamr::malloc_allocator<_T>; \
38 | hamr_malloc_allocator_instantiate_members(_T)
39 | 
40 | hamr_malloc_allocator_instantiate(float)
41 | hamr_malloc_allocator_instantiate(double)
42 | hamr_malloc_allocator_instantiate(char)
43 | hamr_malloc_allocator_instantiate(signed char)
44 | hamr_malloc_allocator_instantiate(short)
45 | hamr_malloc_allocator_instantiate(int)
46 | hamr_malloc_allocator_instantiate(long)
47 | hamr_malloc_allocator_instantiate(long long)
48 | hamr_malloc_allocator_instantiate(unsigned char)
49 | hamr_malloc_allocator_instantiate(unsigned short)
50 | hamr_malloc_allocator_instantiate(unsigned int)
51 | hamr_malloc_allocator_instantiate(unsigned long)
52 | hamr_malloc_allocator_instantiate(unsigned long long)
53 | 


--------------------------------------------------------------------------------
/test/test_hamr_buffer_cupy_cuda.py:
--------------------------------------------------------------------------------
 1 | from hamr import *
 2 | import cupy as cp
 3 | import sys
 4 | 
 5 | stderr = sys.__stderr__
 6 | 
 7 | n_elem = 256
 8 | init_val = 3.1415
 9 | mod_val = 10000
10 | res_val = init_val*mod_val
11 | 
12 | # send data from C++ to Python
13 | stderr.write('TEST 1 : C++ --> Python\n' \
14 |              '=======================\n')
15 | 
16 | stderr.write('TEST 1: creating a hamr::buffer w. CUDA ... \n')
17 | buf = buffer_float(buffer_allocator_cuda, n_elem, init_val)
18 | stderr.write('buf = %s\n'%(str(buf)))
19 | stderr.write('TEST 1: creating a hamr::buffer w. CUDA ... OK!\n\n')
20 | 
21 | stderr.write('TEST 1: get a handle to the data ... \n')
22 | h = buf.get_cuda_accessible()
23 | stderr.write('TEST 1: get a handle to the data ... OK!\n\n')
24 | 
25 | stderr.write('TEST 1: share the data with Cupy ... \n')
26 | arr = cp.array(h, copy=False)
27 | stderr.write('arr.__cuda_array_interface__ = %s\n'%(arr.__cuda_array_interface__))
28 | stderr.write('TEST 1: share the data with Cupy ... OK!\n\n')
29 | 
30 | stderr.write('TEST 1: deleting the hamr::buffer ... \n')
31 | buf = None
32 | h = None
33 | stderr.write('TEST 1: deleting the hamr::buffer ... OK!\n\n')
34 | 
35 | stderr.write('TEST 1: Cupy modifies the data ... \n')
36 | arr *= mod_val
37 | stderr.write('arr = %s\n'%(str(arr)))
38 | stderr.write('TEST 1: Cupy modifies the data ... OK!\n\n')
39 | 
40 | stderr.write('TEST 1: Verify the result ... \n')
41 | if not cp.allclose(arr, res_val):
42 |     stderr.write('ERROR: TEST 1 failed!\n')
43 |     sys.exit(-1)
44 | stderr.write('TEST 1: Verify the result ... OK\n\n')
45 | 
46 | stderr.write('TEST 1: deleting the Cupy array ... \n')
47 | arr = None
48 | stderr.write('TEST 1: deleting the Cupy array ... OK!\n\n')
49 | 
50 | 
51 | 
52 | # send data from Python to C++
53 | stderr.write('TEST 2 : Python --> C++\n' \
54 |              '=======================\n')
55 | 
56 | stderr.write('TEST 2: creating a Cupy array ... \n')
57 | arr = cp.full((n_elem), init_val, dtype='float32')
58 | stderr.write('arr.__cuda_array_interface__ = %s\n'%(arr.__cuda_array_interface__))
59 | #stderr.write('arr = %s\n'%(str(arr)))
60 | stderr.write('TEST 2: creating a Cupy array ... OK\n\n')
61 | 
62 | stderr.write('TEST 2: share the data with hamr::buffer ... \n')
63 | buf = buffer(arr)
64 | stderr.write('buf = %s\n'%(str(buf)))
65 | stderr.write('TEST 2: share the data with hamr::buffer ... OK\n\n')
66 | 
67 | stderr.write('TEST 2: Cupy modifies the data ... \n')
68 | arr *= mod_val
69 | #stderr.write('arr = %s\n'%(str(arr)))
70 | stderr.write('TEST 2: Cupy modifies the data ... OK!\n\n')
71 | 
72 | stderr.write('TEST 2: deleting the Cupy array ... \n')
73 | arr = None
74 | stderr.write('TEST 2: deleting the Cupy array ... OK!\n\n')
75 | 
76 | stderr.write('TEST 2: display the modified hamr::buffer ... \n')
77 | stderr.write('buf = %s\n'%(str(buf)))
78 | stderr.write('TEST 2: display the modified hamr::buffer ... OK\n\n')
79 | 
80 | stderr.write('TEST 2: deleting the hamr::buffer ... \n')
81 | buf = None
82 | stderr.write('TEST 2: deleting the hamr::buffer ... OK!\n\n')
83 | 
84 | sys.exit(0)
85 | 


--------------------------------------------------------------------------------
/python/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_policy(SET CMP0078 NEW)
 2 | cmake_policy(SET CMP0086 NEW)
 3 | 
 4 | find_package(Python3 COMPONENTS Interpreter Development REQUIRED)
 5 | message("++ Executable: ${Python3_EXECUTABLE}")
 6 | 
 7 | set(HAMR_PYTHON_SITE
 8 |   "${CMAKE_INSTALL_LIBDIR}/python${Python3_VERSION_MAJOR}.${Python3_VERSION_MINOR}/site-packages/"
 9 |   CACHE STRING "Where Python modules are compiled and installed.")
10 | 
11 | set(HAMR_PYTHON_DIR "${HAMR_PYTHON_SITE}/hamr/"
12 |   CACHE STRING "Where HAMR Python bindings are compiled and installed")
13 | 
14 | message(STATUS "HAMR: Python modules will be installed at \"${HAMR_PYTHON_DIR}\"")
15 | 
16 | find_package(SWIG COMPONENTS python)
17 | include(UseSWIG)
18 | 
19 | set(swig_deps
20 |     ../hamr_buffer_allocator.i
21 |     ../hamr_buffer_handle.i
22 |     ../hamr_buffer.i)
23 | 
24 | set_property(SOURCE hamr_py.i PROPERTY CPLUSPLUS ON)
25 | set_property(SOURCE hamr_py.i PROPERTY DEPENDS ${swig_deps})
26 | set_property(SOURCE hamr_py.i PROPERTY SWIG_MODULE_NAME hamr_py)
27 | 
28 | list(APPEND hamr_py_sources hamr_py.i)
29 | if (HAMR_SEPARATE_IMPL)
30 |     list(APPEND hamr_py_sources
31 |         ../hamr_python_deleter.cxx
32 |         )
33 | endif()
34 | 
35 | swig_add_library(hamr_py
36 |     TYPE MODULE LANGUAGE python
37 |     SOURCES ${hamr_py_sources}
38 |     OUTPUT_DIR "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}"
39 |     OUTFILE_DIR "${CMAKE_CURRENT_BINARY_DIR}")
40 | 
41 | target_link_libraries(hamr_py ${Python3_LIBRARIES} hamr)
42 | 
43 | target_include_directories(hamr_py
44 |     PRIVATE "${Python3_INCLUDE_DIRS}"
45 |     "${CMAKE_CURRENT_SOURCE_DIR}"
46 |     "${CMAKE_CURRENT_SOURCE_DIR}/.."
47 |     "${CMAKE_CURRENBT_BINARY_DIR}"
48 |     "${CMAKE_CURRENBT_BINARY_DIR}/.."
49 |     )
50 | 
51 | set_property(TARGET hamr_py
52 |     PROPERTY SWIG_USE_TARGET_INCLUDE_DIRECTORIES ON)
53 | 
54 | set_target_properties(hamr_py PROPERTIES
55 |   LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}")
56 | 
57 | if (HAMR_ENABLE_CUDA AND NOT HAMR_NVHPC_CUDA)
58 |     set_source_files_properties(
59 |         "${CMAKE_CURRENT_BINARY_DIR}/hamr_pyPYTHON_wrap.cxx"
60 |         PROPERTIES LANGUAGE CUDA)
61 | 
62 |     set_target_properties(hamr_py PROPERTIES
63 |         CUDA_ARCHITECTURES "${HAMR_CUDA_ARCHITECTURES}")
64 | endif()
65 | 
66 | install(TARGETS hamr_py DESTINATION ${HAMR_PYTHON_DIR})
67 | install(FILES ${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}/hamr_py.py
68 |   DESTINATION ${HAMR_PYTHON_DIR})
69 | 
70 | if (APPLE)
71 |   set_target_properties(hamr_py PROPERTIES INSTALL_RPATH "@loader_path/./")
72 | elseif(UNIX)
73 |   set_target_properties(hamr_py PROPERTIES INSTALL_RPATH "\$ORIGIN/")
74 | endif()
75 | 
76 | configure_file(hamr.py "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}/__init__.py"
77 |   COPYONLY)
78 | 
79 | install(FILES  "${CMAKE_BINARY_DIR}/${HAMR_PYTHON_DIR}/__init__.py"
80 |   DESTINATION "${HAMR_PYTHON_DIR}")
81 | 
82 | # capture python path for use in automated CI
83 | file(CONFIGURE OUTPUT "${CMAKE_BINARY_DIR}/${CMAKE_INSTALL_BINDIR}/hamr_python_env.sh"
84 | CONTENT
85 | [=[
86 | #!/bin/bash
87 | export PYTHONPATH=@CMAKE_BINARY_DIR@/@HAMR_PYTHON_SITE@:$PYTHONPATH
88 | ]=]
89 | @ONLY)
90 | 


--------------------------------------------------------------------------------
/test/test_hamr_buffer_numpy_host.py:
--------------------------------------------------------------------------------
 1 | from hamr import *
 2 | import numpy as np
 3 | import sys
 4 | 
 5 | stderr = sys.__stderr__
 6 | 
 7 | n_elem = 256
 8 | init_val = 3.1415
 9 | mod_val = 10000
10 | res_val = init_val*mod_val
11 | 
12 | # send data from C++ to Python
13 | stderr.write('TEST 1 : C++ --> Python\n' \
14 |              '=======================\n')
15 | 
16 | stderr.write('TEST 1: creating a hamr::buffer host ... \n')
17 | buf = buffer_float(buffer_allocator_malloc, n_elem, init_val)
18 | stderr.write('buf = %s\n'%(str(buf)))
19 | stderr.write('TEST 1: creating a hamr::buffer host ... OK!\n\n')
20 | 
21 | stderr.write('TEST 1: get a handle to the data ... \n')
22 | h = buf.get_host_accessible()
23 | stderr.write('TEST 1: get a handle to the data ... OK!\n\n')
24 | 
25 | stderr.write('TEST 1: share the data with Numpy ... \n')
26 | arr = np.array(h, copy=False)
27 | stderr.write('arr.__array_interface__ = %s\n'%(arr.__array_interface__))
28 | stderr.write('TEST 1: share the data with Numpy ... OK!\n\n')
29 | 
30 | stderr.write('TEST 1: deleting the hamr::buffer ... \n')
31 | buf = None
32 | h = None
33 | stderr.write('TEST 1: deleting the hamr::buffer ... OK!\n\n')
34 | 
35 | stderr.write('TEST 1: Numpy reads the data ... \n')
36 | stderr.write('arr = %s\n'%(str(arr)))
37 | stderr.write('TEST 1: Numpy reads the data ... OK!\n\n')
38 | 
39 | stderr.write('TEST 1: Numpy modifies the data ... \n')
40 | arr *= mod_val
41 | stderr.write('arr = %s\n'%(str(arr)))
42 | stderr.write('TEST 1: Numpy modifies the data ... OK!\n\n')
43 | 
44 | stderr.write('TEST 1: Verify the result ... \n')
45 | if not np.allclose(arr, res_val):
46 |     stderr.write('ERROR: TEST 1 failed!\n')
47 |     sys.exit(-1)
48 | stderr.write('TEST 1: Verify the result ... OK\n\n')
49 | 
50 | stderr.write('TEST 1: deleting the Numpy array ... \n')
51 | arr = None
52 | stderr.write('TEST 1: deleting the Numpy array ... OK!\n\n')
53 | 
54 | 
55 | 
56 | # send data from Python to C++
57 | stderr.write('TEST 2 : Python --> C++\n' \
58 |              '=======================\n')
59 | 
60 | stderr.write('TEST 2: creating a Numpy array ... \n')
61 | arr = np.full((n_elem), init_val, dtype='float32')
62 | stderr.write('arr.__array_interface__ = %s\n'%(arr.__array_interface__))
63 | #stderr.write('arr = %s\n'%(str(arr)))
64 | stderr.write('TEST 2: creating a Numpy array ... OK\n\n')
65 | 
66 | stderr.write('TEST 2: share the data with hamr::buffer ... \n')
67 | buf = buffer(arr)
68 | stderr.write('buf = %s\n'%(str(buf)))
69 | stderr.write('TEST 2: share the data with hamr::buffer ... OK\n\n')
70 | 
71 | stderr.write('TEST 2: Numpy modifies the data ... \n')
72 | arr *= mod_val
73 | #stderr.write('arr = %s\n'%(str(arr)))
74 | stderr.write('TEST 2: Numpy modifies the data ... OK!\n\n')
75 | 
76 | stderr.write('TEST 2: deleting the Numpy array ... \n')
77 | arr = None
78 | stderr.write('TEST 2: deleting the Numpy array ... OK!\n\n')
79 | 
80 | stderr.write('TEST 2: display the modified hamr::buffer ... \n')
81 | stderr.write('buf = %s\n'%(str(buf)))
82 | stderr.write('TEST 2: display the modified hamr::buffer ... OK\n\n')
83 | 
84 | stderr.write('TEST 2: deleting the hamr::buffer ... \n')
85 | buf = None
86 | stderr.write('TEST 2: deleting the hamr::buffer ... OK!\n\n')
87 | 
88 | sys.exit(0)
89 | 


--------------------------------------------------------------------------------
/hamr_hip_malloc_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_hip_malloc_allocator.h"
 4 | #include "hamr_hip_malloc_allocator_impl.h"
 5 | 
 6 | template class hamr::hip_malloc_deleter<float>;
 7 | template class hamr::hip_malloc_deleter<double>;
 8 | template class hamr::hip_malloc_deleter<char>;
 9 | template class hamr::hip_malloc_deleter<signed char>;
10 | template class hamr::hip_malloc_deleter<short>;
11 | template class hamr::hip_malloc_deleter<int>;
12 | template class hamr::hip_malloc_deleter<long>;
13 | template class hamr::hip_malloc_deleter<long long>;
14 | template class hamr::hip_malloc_deleter<unsigned char>;
15 | template class hamr::hip_malloc_deleter<unsigned short>;
16 | template class hamr::hip_malloc_deleter<unsigned int>;
17 | template class hamr::hip_malloc_deleter<unsigned long>;
18 | template class hamr::hip_malloc_deleter<unsigned long long>;
19 | 
20 | #define hamr_hip_malloc_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const float *vals, bool hipVals); \
22 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const double *vals, bool hipVals); \
23 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const char *vals, bool hipVals); \
24 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const signed char *vals, bool hipVals); \
25 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const short *vals, bool hipVals); \
26 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const int *vals, bool hipVals); \
27 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const long *vals, bool hipVals); \
28 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const long long *vals, bool hipVals); \
29 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned char *vals, bool hipVals); \
30 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned short *vals, bool hipVals); \
31 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned int *vals, bool hipVals); \
32 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned long *vals, bool hipVals); \
33 | template std::shared_ptr<_T> hamr::hip_malloc_allocator<_T>::allocate(size_t n, const unsigned long long *vals, bool hipVals); \
34 | 
35 | #define hamr_hip_malloc_allocator_instantiate(_T) \
36 | template struct hamr::hip_malloc_allocator<_T>; \
37 | hamr_hip_malloc_allocator_instantiate_members(_T)
38 | 
39 | hamr_hip_malloc_allocator_instantiate(float)
40 | hamr_hip_malloc_allocator_instantiate(double)
41 | hamr_hip_malloc_allocator_instantiate(char)
42 | hamr_hip_malloc_allocator_instantiate(signed char)
43 | hamr_hip_malloc_allocator_instantiate(short)
44 | hamr_hip_malloc_allocator_instantiate(int)
45 | hamr_hip_malloc_allocator_instantiate(long)
46 | hamr_hip_malloc_allocator_instantiate(long long)
47 | hamr_hip_malloc_allocator_instantiate(unsigned char)
48 | hamr_hip_malloc_allocator_instantiate(unsigned short)
49 | hamr_hip_malloc_allocator_instantiate(unsigned int)
50 | hamr_hip_malloc_allocator_instantiate(unsigned long)
51 | hamr_hip_malloc_allocator_instantiate(unsigned long long)
52 | 


--------------------------------------------------------------------------------
/hamr_new_allocator_impl.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_new_allocator_impl_h
  2 | #define hamr_new_allocator_impl_h
  3 | 
  4 | //#include "hamr_new_allocator.h"
  5 | 
  6 | #include <iostream>
  7 | #include <type_traits>
  8 | #include <memory>
  9 | #include <typeinfo>
 10 | #include <cassert>
 11 | #include <cstring>
 12 | 
 13 | namespace hamr
 14 | {
 15 | 
 16 | // --------------------------------------------------------------------------
 17 | template <typename T>
 18 | new_deleter<T>::new_deleter(T *ptr, size_t n) : m_ptr(ptr), m_elem(n)
 19 | {
 20 | #if defined(HAMR_VERBOSE)
 21 |     if (hamr::get_verbose())
 22 |     {
 23 |         std::cerr << "created new_deleter for array of " << n
 24 |             << " objects of type " << typeid(T).name() << std::endl;
 25 |     }
 26 | #endif
 27 | }
 28 | 
 29 | // --------------------------------------------------------------------------
 30 | template <typename T>
 31 | void new_deleter<T>::operator()(T *ptr)
 32 | {
 33 |     assert(ptr == m_ptr);
 34 | 
 35 | #if defined(HAMR_VERBOSE)
 36 |     if (hamr::get_verbose())
 37 |     {
 38 |         std::cerr << "new_deleter deleting array of " << m_elem
 39 |             << " objects of type " << typeid(T).name() << std::endl;
 40 |     }
 41 | #endif
 42 | 
 43 |     delete [] ptr;
 44 | }
 45 | 
 46 | 
 47 | 
 48 | 
 49 | 
 50 | 
 51 | // --------------------------------------------------------------------------
 52 | template <typename T>
 53 | std::shared_ptr<T> new_allocator<T>::allocate(size_t n)
 54 | {
 55 | #if defined(HAMR_VERBOSE)
 56 |     if (hamr::get_verbose())
 57 |     {
 58 |         std::cerr << "new_allocator allocating array of " << n
 59 |             << " objects of type " << typeid(T).name() << std::endl;
 60 |     }
 61 | #endif
 62 | 
 63 |     // allocate
 64 |     T *ptr = new T[n];
 65 | 
 66 |     // package
 67 |     return std::shared_ptr<T>(ptr, new_deleter<T>(ptr, n));
 68 | }
 69 | 
 70 | // --------------------------------------------------------------------------
 71 | template <typename T>
 72 | std::shared_ptr<T> new_allocator<T>::allocate(size_t n, const T &val)
 73 | {
 74 | #if defined(HAMR_VERBOSE)
 75 |     if (hamr::get_verbose())
 76 |     {
 77 |         std::cerr << "new_allocator allocating array of " << n
 78 |             << " objects of type " << typeid(T).name() << " initialized"
 79 |             << std::endl;
 80 |     }
 81 | #endif
 82 | 
 83 |     // allocate
 84 |     T *ptr = (T*)new unsigned char[n*sizeof(T)];
 85 | 
 86 |     // construct
 87 |     for (size_t i = 0; i < n; ++i)
 88 |         new (&ptr[i]) T(val);
 89 | 
 90 |     // package
 91 |     return std::shared_ptr<T>(ptr, new_deleter<T>(ptr, n));
 92 | }
 93 | 
 94 | // --------------------------------------------------------------------------
 95 | template <typename T>
 96 | template <typename U>
 97 | std::shared_ptr<T> new_allocator<T>::allocate(size_t n, const U *vals)
 98 | {
 99 | #if defined(HAMR_VERBOSE)
100 |     if (hamr::get_verbose())
101 |     {
102 |         std::cerr << "new_allocator allocating array of " << n
103 |             << " objects of type " << typeid(T).name() << " initialized"
104 |             << std::endl;
105 |     }
106 | #endif
107 | 
108 |     // allocate
109 |     T *ptr = (T*)new unsigned char[n*sizeof(T)];
110 | 
111 |     // construct
112 |     for (size_t i = 0; i < n; ++i)
113 |         new (&ptr[i]) T(vals[i]);
114 | 
115 |     // package
116 |     return std::shared_ptr<T>(ptr, new_deleter<T>(ptr, n));
117 | }
118 | 
119 | };
120 | 
121 | #endif
122 | 


--------------------------------------------------------------------------------
/hamr_cuda_malloc_host_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_cuda_malloc_host_allocator.h"
 4 | #include "hamr_cuda_malloc_host_allocator_impl.h"
 5 | 
 6 | template class hamr::cuda_malloc_host_deleter<float>;
 7 | template class hamr::cuda_malloc_host_deleter<double>;
 8 | template class hamr::cuda_malloc_host_deleter<char>;
 9 | template class hamr::cuda_malloc_host_deleter<signed char>;
10 | template class hamr::cuda_malloc_host_deleter<short>;
11 | template class hamr::cuda_malloc_host_deleter<int>;
12 | template class hamr::cuda_malloc_host_deleter<long>;
13 | template class hamr::cuda_malloc_host_deleter<long long>;
14 | template class hamr::cuda_malloc_host_deleter<unsigned char>;
15 | template class hamr::cuda_malloc_host_deleter<unsigned short>;
16 | template class hamr::cuda_malloc_host_deleter<unsigned int>;
17 | template class hamr::cuda_malloc_host_deleter<unsigned long>;
18 | template class hamr::cuda_malloc_host_deleter<unsigned long long>;
19 | 
20 | #define hamr_cuda_malloc_host_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const float *vals); \
22 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const double *vals); \
23 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const char *vals); \
24 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const signed char *vals); \
25 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const short *vals); \
26 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const int *vals); \
27 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const long *vals); \
28 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const long long *vals); \
29 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const unsigned char *vals); \
30 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const unsigned short *vals); \
31 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const unsigned int *vals); \
32 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const unsigned long *vals); \
33 | template std::shared_ptr<_T> hamr::cuda_malloc_host_allocator<_T>::allocate(size_t n, const unsigned long long *vals);
34 | 
35 | 
36 | #define hamr_cuda_malloc_host_allocator_instantiate(_T) \
37 | template struct hamr::cuda_malloc_host_allocator<_T>; \
38 | hamr_cuda_malloc_host_allocator_instantiate_members(_T)
39 | 
40 | hamr_cuda_malloc_host_allocator_instantiate(float)
41 | hamr_cuda_malloc_host_allocator_instantiate(double)
42 | hamr_cuda_malloc_host_allocator_instantiate(char)
43 | hamr_cuda_malloc_host_allocator_instantiate(signed char)
44 | hamr_cuda_malloc_host_allocator_instantiate(short)
45 | hamr_cuda_malloc_host_allocator_instantiate(int)
46 | hamr_cuda_malloc_host_allocator_instantiate(long)
47 | hamr_cuda_malloc_host_allocator_instantiate(long long)
48 | hamr_cuda_malloc_host_allocator_instantiate(unsigned char)
49 | hamr_cuda_malloc_host_allocator_instantiate(unsigned short)
50 | hamr_cuda_malloc_host_allocator_instantiate(unsigned int)
51 | hamr_cuda_malloc_host_allocator_instantiate(unsigned long)
52 | hamr_cuda_malloc_host_allocator_instantiate(unsigned long long)
53 | 


--------------------------------------------------------------------------------
/test/test_hamr_pipeline_cuda_openmp.cpp:
--------------------------------------------------------------------------------
  1 | #include "hamr_buffer.h"
  2 | #include "hamr_buffer_util.h"
  3 | 
  4 | #include <iostream>
  5 | 
  6 | using hamr::buffer;
  7 | using allocator = hamr::buffer_allocator;
  8 | 
  9 | // with LLVM Clang CUDA and OpenMP need to be compiled in separate
 10 | // translation units.
 11 | 
 12 | //
 13 | // CUDA kernels
 14 | //
 15 | template <typename T>
 16 | buffer<T> initialize_cuda(size_t n_vals, const T &val) HAMR_EXPORT;
 17 | 
 18 | template <typename T, typename U>
 19 | buffer<T> add_cuda(const buffer<T> &a1, const buffer<U> &a2) HAMR_EXPORT;
 20 | 
 21 | template <typename T, typename U>
 22 | buffer<T> multiply_scalar_cuda(const buffer<T> &ai, const U &val) HAMR_EXPORT;
 23 | 
 24 | //
 25 | // OpenMP kernels
 26 | //
 27 | template <typename T>
 28 | buffer<T> initialize_openmp(size_t n_vals, const T &val) HAMR_EXPORT;
 29 | 
 30 | template <typename T, typename U>
 31 | buffer<T> add_openmp(const buffer<T> &a1, const buffer<U> &a2) HAMR_EXPORT;
 32 | 
 33 | template <typename T, typename U>
 34 | buffer<T> multiply_scalar_openmp(const buffer<T> &ai, const U &val) HAMR_EXPORT;
 35 | 
 36 | 
 37 | 
 38 | // **************************************************************************
 39 | template <typename T>
 40 | int compare_int(const buffer<T> &ain, int val)
 41 | {
 42 |     size_t n_vals = ain.size();
 43 |     std::cerr << "comparing array with " << n_vals << " elements to " << val << std::endl;
 44 | 
 45 |     buffer<int> ai(ain.get_allocator(), n_vals);
 46 |     ain.get(ai);
 47 | 
 48 |     auto [spai, pai] = hamr::get_host_accessible(ai);
 49 | 
 50 |     if (n_vals < 33)
 51 |     {
 52 |         ai.print();
 53 |     }
 54 | 
 55 |     for (size_t i = 0; i < n_vals; ++i)
 56 |     {
 57 |         if (pai[i] != val)
 58 |         {
 59 |             std::cerr << "ERROR: pai[" << i << "] = "
 60 |                 << pai[i] << " != " << val << std::endl;
 61 |             return -1;
 62 |         }
 63 |     }
 64 | 
 65 |     std::cerr << "all elements are equal to " << val << std::endl;
 66 | 
 67 |     return 0;
 68 | }
 69 | 
 70 | 
 71 | 
 72 | int main(int, char **)
 73 | {
 74 |     size_t n_vals = 100000;
 75 | 
 76 |     buffer<float>  ao0(allocator::cuda, n_vals, 1.0f);         // = 1 (CUDA)
 77 |     buffer<float>  ao1 = multiply_scalar_cuda(ao0, 2.0f);      // = 2 (CUDA)
 78 |     ao0.free();
 79 | 
 80 |     buffer<double> ao2 = initialize_openmp(n_vals, 2.0);       // = 2 (OpenMP)
 81 |     buffer<double> ao3 = add_openmp(ao2, ao1);                 // = 4 (OpenMP w/ CUDA data)
 82 |     ao1.free();
 83 |     ao2.free();
 84 | 
 85 |     buffer<double> ao4 = multiply_scalar_cuda(ao3, 1000.0);    // = 4000 (CUDA w/ OpenMP data)
 86 |     ao3.free();
 87 | 
 88 |     buffer<float>  ao5(allocator::malloc, n_vals, 3.0f);       // = 3 (host)
 89 |     buffer<float>  ao6 = multiply_scalar_cuda(ao5, 100.0f);    // = 300 (CUDA)
 90 |     ao5.free();
 91 | 
 92 |     buffer<float> ao7(allocator::openmp, n_vals);              // = uninit (OpenMP)
 93 |     ao7.set(ao6);                                              // = 300 (CUDA to OpenMP)
 94 |     ao6.free();
 95 | 
 96 |     buffer<float> ao8(allocator::cuda, n_vals);                // = uninit (CUDA)
 97 |     ao8.set(ao7);                                              // = 300 (OpenMP to CUDA)
 98 |     ao7.free();
 99 | 
100 |     buffer<double> ao9 = add_cuda(ao4, ao8);                   // = 4300 (CUDA)
101 |     ao4.free();
102 |     ao8.free();
103 | 
104 |     return compare_int(ao8, 4300);
105 | }
106 | 


--------------------------------------------------------------------------------
/hamr_device.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_device_h
  2 | #define hamr_device_h
  3 | 
  4 | #include "hamr_config.h"
  5 | #if defined(HAMR_ENABLE_CUDA)
  6 | #include "hamr_cuda_device.h"
  7 | #elif defined(HAMR_ENABLE_HIP)
  8 | #include "hamr_hip_device.h"
  9 | #elif defined(HAMR_ENABLE_OPENMP)
 10 | #include "hamr_openmp_device.h"
 11 | #endif
 12 | 
 13 | ///@file
 14 | 
 15 | namespace hamr
 16 | {
 17 | /// gets the device identifier for the first GPU. @returns zero if successful.
 18 | inline int HAMR_EXPORT get_device_identifier(int &dev_id)
 19 | {
 20 | #if defined(HAMR_ENABLE_CUDA)
 21 |     return get_cuda_device_identifier(dev_id);
 22 | #elif defined(HAMR_ENABLE_HIP)
 23 |     return get_hip_device_identifier(dev_id);
 24 | #elif defined(HAMR_ENABLE_OPENMP)
 25 |     return get_openmp_device_identifier(dev_id);
 26 | #else
 27 |     dev_id = -1;
 28 |     return 0;
 29 | #endif
 30 | }
 31 | 
 32 | /// gets the device identifier for the host. @returns zero if successful.
 33 | inline int HAMR_EXPORT get_host_identifier(int &dev_id)
 34 | {
 35 | #if defined(HAMR_ENABLE_CUDA)
 36 |     return get_cuda_host_identifier(dev_id);
 37 | #elif defined(HAMR_ENABLE_HIP)
 38 |     return get_hip_host_identifier(dev_id);
 39 | #elif defined(HAMR_ENABLE_OPENMP)
 40 |     return get_openmp_host_identifier(dev_id);
 41 | #else
 42 |     dev_id = -1;
 43 |     return 0;
 44 | #endif
 45 | }
 46 | 
 47 | /// gets the currently atcive device. @returns zero if successful.
 48 | inline int HAMR_EXPORT get_active_device(int &dev_id)
 49 | {
 50 | #if defined(HAMR_ENABLE_CUDA)
 51 |     return get_active_cuda_device(dev_id);
 52 | #elif defined(HAMR_ENABLE_HIP)
 53 |     return get_active_hip_device(dev_id);
 54 | #elif defined(HAMR_ENABLE_OPENMP)
 55 |     return get_active_openmp_device(dev_id);
 56 | #else
 57 |     dev_id = -1;
 58 |     return 0;
 59 | #endif
 60 | }
 61 | 
 62 | /// sets the active  device. returns zero if successful.
 63 | inline int HAMR_EXPORT set_active_device(int dev_id)
 64 | {
 65 | #if defined(HAMR_ENABLE_CUDA)
 66 |     return set_active_cuda_device(dev_id);
 67 | #elif defined(HAMR_ENABLE_HIP)
 68 |     return set_active_hip_device(dev_id);
 69 | #elif defined(HAMR_ENABLE_OPENMP)
 70 |     return set_active_openmp_device(dev_id);
 71 | #else
 72 |     return 0;
 73 | #endif
 74 | }
 75 | 
 76 | /// gets the device that owns the given pointer. @returns zero if successful.
 77 | inline int HAMR_EXPORT get_device(const void *ptr, int &device_id)
 78 | {
 79 | #if defined(HAMR_ENABLE_CUDA)
 80 |     return get_cuda_device(ptr, device_id);
 81 | #elif defined(HAMR_ENABLE_HIP)
 82 |     return get_hip_device(ptr, device_id);
 83 | #elif defined(HAMR_ENABLE_OPENMP)
 84 |     return get_openmp_device(ptr, device_id);
 85 | #else
 86 |     device_id = -1;
 87 |     return 0;
 88 | #endif
 89 | }
 90 | 
 91 | #if defined(HAMR_ENABLE_CUDA)
 92 | using activate_device = activate_cuda_device;
 93 | #elif defined(HAMR_ENABLE_HIP)
 94 | using activate_device = activate_hip_device;
 95 | #elif defined(HAMR_ENABLE_OPENMP)
 96 | using activate_device = activate_openmp_device;
 97 | #else
 98 | /** Activate the specified device, and restore the previously active
 99 |  * device when the object is destroyed.
100 |  */
101 | class HAMR_EXPORT activate_device
102 | {
103 | public:
104 |     activate_device() = delete;
105 |     activate_device(const activate_device &) = delete;
106 |     void operator=(const activate_device &) = delete;
107 |     activate_device(int) {}
108 |     ~activate_device() {}
109 | };
110 | #endif
111 | 
112 | }
113 | #endif
114 | 


--------------------------------------------------------------------------------
/cmake/hamr_omp_offload.cmake:
--------------------------------------------------------------------------------
 1 | # Get the OpenMP device offload flags for the current C++ compiler.
 2 | #
 3 | #   TARGET <offload target>
 4 | #       names the target for offloading (optional).
 5 | #
 6 | #   ARCH <target architecture>
 7 | #       names the architcure to compiler for (optional).
 8 | #
 9 | #   ADD_FLAGS <other flags>
10 | #       additional flags that may be needed (optional).
11 | #
12 | #   RESULT <avr name>
13 | #       the flags are stored in this variable.
14 | #
15 | function(get_offload_compile_flags)
16 |     set(opts "")
17 |     set(nvpo ARCH TARGET ADD_FLAGS RESULT)
18 |     set(mvo)
19 |     cmake_parse_arguments(PARSE_ARGV 0 OMP_DO "${opts}" "${nvpo}" "${mvo}")
20 |     set(tmp)
21 |     if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
22 |         set(tmp "-fopenmp --offload-new-driver")
23 |         if (OMP_DO_TARGET)
24 |             set(tmp "${tmp} -fopenmp-targets=${OMP_DO_TARGET}")
25 |         endif()
26 |         if (OMP_DO_ARCH)
27 |             set(tmp "${tmp} --offload-arch=${OMP_DO_ARCH}")
28 |         endif()
29 |     elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
30 |         set(tmp "-qopenmp")
31 |         if (OMP_DO_TARGET)
32 |             set(tmp "${tmp} -fopenmp-targets=${OMP_DO_TARGET}")
33 |         endif()
34 |     elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
35 |         set(tmp "-fopenmp")
36 |         if (OMP_DO_TARGET)
37 |             set(tmp "${tmp} -foffload=${OMP_DO_TARGET}")
38 |         endif()
39 |         if (OMP_DO_ARCH)
40 |             set(tmp "${tmp} --offload-options=${OMP_DO_TARGET}=-march=${OMP_DO_ARCH}")
41 |         endif()
42 |     elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "NVHPC")
43 |         set(tmp "-mp=gpu")
44 |         if (OMP_DO_ARCH)
45 |             set(tmp "${tmp} -gpu=${OMP_DO_ARCH}")
46 |         endif()
47 |     endif()
48 |     if (OMP_DO_ADD_FLAGS)
49 |         set(tmp "${tmp} ${OMP_DO_ADD_FLAGS}")
50 |     endif()
51 |     if ("${tmp}" STREQUAL "")
52 |         message(WARNING "OpenMP offload compiler flags not known for ${CMAKE_CXX_COMPILER_ID}")
53 |     else()
54 |         message(STATUS "OpenMP offload compiler flags for ${CMAKE_CXX_COMPILER_ID} are ${tmp}")
55 |     endif()
56 |     set(${OMP_DO_RESULT} ${tmp} PARENT_SCOPE)
57 | endfunction()
58 | 
59 | # Get the OpenMP device offload flags for the current C++ compiler.
60 | #
61 | #   TARGET <offload target>
62 | #       names the target for offloading (optional).
63 | #
64 | #   ARCH <target architecture>
65 | #       names the architcure to compiler for (optional).
66 | #
67 | #   ADD_FLAGS <other flags>
68 | #       additional flags that may be needed (optional).
69 | #
70 | #   RESULT <avr name>
71 | #       the flags are stored in this variable.
72 | #
73 | function(get_offload_link_flags)
74 |     set(opts "")
75 |     set(nvpo ARCH TARGET ADD_FLAGS RESULT)
76 |     set(mvo)
77 |     cmake_parse_arguments(PARSE_ARGV 0 OMP_DO "${opts}" "${nvpo}" "${mvo}")
78 |     set(tmp)
79 |     if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang")
80 |         list(APPEND tmp -fopenmp --offload-new-driver)
81 |         if (OMP_DO_TARGET)
82 |             list(APPEND tmp -fopenmp-targets=${OMP_DO_TARGET})
83 |         endif()
84 |         if (OMP_DO_ARCH)
85 |             list(APPEND tmp --offload-arch=${OMP_DO_ARCH})
86 |         endif()
87 |     elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "IntelLLVM")
88 |     elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU")
89 |     elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "NVHPC")
90 |     endif()
91 |     if (OMP_DO_ADD_FLAGS)
92 |         set(tmp "${tmp} ${OMP_DO_ADD_FLAGS}")
93 |     endif()
94 |     message(STATUS "OpenMP offload linker flags for ${CMAKE_CXX_COMPILER_ID} are ${tmp}")
95 |     set(${OMP_DO_RESULT} ${tmp} PARENT_SCOPE)
96 | endfunction()
97 | 


--------------------------------------------------------------------------------
/hamr_stream.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_stream_h
  2 | #define hamr_stream_h
  3 | 
  4 | ///@file
  5 | 
  6 | #include "hamr_config.h"
  7 | 
  8 | #include <cstddef>
  9 | #include <variant>
 10 | 
 11 | #if defined(HAMR_ENABLE_CUDA)
 12 | #include <cuda_runtime.h>
 13 | #else
 14 | using cudaStream_t = void*;
 15 | #endif
 16 | #if defined(HAMR_ENABLE_HIP)
 17 | #include <hip/hip_runtime.h>
 18 | #else
 19 | using hipStream_t = void*;
 20 | #endif
 21 | 
 22 | namespace hamr
 23 | {
 24 | 
 25 | /// A wrapper around technology specific streams.
 26 | /** Streams are used to enable and order concurrent operations on accelerator
 27 |  * devices. The default stream used in hamr is a stream-per-thread where
 28 |  * available.  However, note that libraries built seperately will likely use
 29 |  * the default blocking stream and if so explicit specification of the stream
 30 |  * when calling into those libraries is necessary. Note that hamr passes stream
 31 |  * correctly when interfacing with Python. In most cases the hamr API's
 32 |  * requiring a ::stream can be passed the technology specific stream due to
 33 |  * implicit conversion operators implemented here.
 34 |  */
 35 | class HAMR_EXPORT stream
 36 | {
 37 | public:
 38 |     /// constructs a default stream
 39 |     stream() :
 40 | #if defined(HAMR_ENABLE_CUDA)
 41 |         m_stream(std::in_place_index<1>, cudaStreamPerThread)
 42 | #elif defined(HAMR_ENABLE_HIP)
 43 |         m_stream(std::in_place_index<2>, hipStreamPerThread)
 44 | #else
 45 |         m_stream(std::in_place_index<0>, '\0')
 46 | #endif
 47 |    {}
 48 | 
 49 |     stream(const stream &) = default;
 50 |     stream(stream &&) = default;
 51 | 
 52 |     stream &operator=(const stream &) = default;
 53 |     stream &operator=(stream &&) = default;
 54 | 
 55 | #if defined(HAMR_ENABLE_CUDA)
 56 |     /// convert to a CUDA stream
 57 |     operator cudaStream_t () const { return this->get_cuda_stream(); }
 58 | 
 59 |     /// assign a CUDA stream
 60 |     stream &operator=(cudaStream_t strm)
 61 |     {
 62 |         m_stream = strm;
 63 |         return *this;
 64 |     }
 65 | 
 66 |     /// Constructs or converts from a CUDA stream
 67 |     stream(const cudaStream_t &strm) : m_stream(std::in_place_index<1>, strm) {}
 68 | 
 69 |     /// Accesses the CUDA stream.
 70 |     cudaStream_t get_cuda_stream() const
 71 |     {
 72 |         const cudaStream_t *cs;
 73 |         if ((cs = std::get_if<1>(&m_stream)))
 74 |             return *cs;
 75 |         return 0; // default stream
 76 |     }
 77 | #endif
 78 | #if defined(HAMR_ENABLE_HIP)
 79 |     /// convert to a HIP stream
 80 |     operator hipStream_t () const { return this->get_hip_stream(); }
 81 | 
 82 |     /// assign a HIP stream
 83 |     stream &operator=(hipStream_t strm)
 84 |     {
 85 |         m_stream = strm;
 86 |         return *this;
 87 |     }
 88 | 
 89 |     /// Constructs or converts from a HIP stream
 90 |     stream(hipStream_t &strm) : m_stream(std::in_place_index<2>, strm) {}
 91 | 
 92 |     /// Accesses the HIP stream.
 93 |     hipStream_t get_hip_stream() const
 94 |     {
 95 |         const hipStream_t *hs;
 96 |         if ((hs = std::get_if<2>(&m_stream)))
 97 |             return *hs;
 98 |         return 0; // default stream
 99 |     }
100 | #endif
101 | 
102 |     /// synchronize the stream
103 |     int synchronize() const;
104 | 
105 |     /// evaluates true if a stream has been set
106 |     operator bool() const;
107 | 
108 |     /// sends the value of the stream to std::cerr
109 |     void print() const;
110 | 
111 |     /// convert the technology specific stream to an integer
112 |     size_t get_stream();
113 | 
114 | private:
115 |     std::variant<char, cudaStream_t, hipStream_t> m_stream;
116 | };
117 | }
118 | #endif
119 | 


--------------------------------------------------------------------------------
/hamr_stream_impl.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_stream_h
  2 | #define hamr_stream_h
  3 | 
  4 | ///@file
  5 | 
  6 | #include "hamr_config.h"
  7 | 
  8 | #include <cstddef>
  9 | #include <variant>
 10 | 
 11 | #if defined(HAMR_ENABLE_CUDA)
 12 | #include <cuda_runtime.h>
 13 | #else
 14 | using cudaStream_t = void*;
 15 | #endif
 16 | #if defined(HAMR_ENABLE_HIP)
 17 | #include <hip/hip_runtime.h>
 18 | #else
 19 | using hipStream_t = void*;
 20 | #endif
 21 | 
 22 | namespace hamr
 23 | {
 24 | 
 25 | /// A wrapper around technology specific streams.
 26 | /** Streams are used to enable and order concurrent operations on accelerator
 27 |  * devices. The default stream used in hamr is a stream-per-thread where
 28 |  * available.  However, note that libraries built seperately will likely use
 29 |  * the default blocking stream and if so explicit specification of the stream
 30 |  * when calling into those libraries is necessary. Note that hamr passes stream
 31 |  * correctly when interfacing with Python. In most cases the hamr API's
 32 |  * requiring a ::stream can be passed the technology specific stream due to
 33 |  * implicit conversion operators implemented here.
 34 |  */
 35 | class HAMR_EXPORT stream
 36 | {
 37 | public:
 38 |     /// constructs a default stream
 39 |     stream() :
 40 | #if defined(HAMR_ENABLE_CUDA)
 41 |         m_stream(std::in_place_index<1>, cudaStreamPerThread)
 42 | #elif defined(HAMR_ENABLE_HIP)
 43 |         m_stream(std::in_place_index<2>, hipStreamPerThread)
 44 | #else
 45 |         m_stream(std::in_place_index<0>, '\0')
 46 | #endif
 47 |    {}
 48 | 
 49 |     stream(const stream &) = default;
 50 |     stream(stream &&) = default;
 51 | 
 52 |     stream &operator=(const stream &) = default;
 53 |     stream &operator=(stream &&) = default;
 54 | 
 55 | #if defined(HAMR_ENABLE_CUDA)
 56 |     /// convert to a CUDA stream
 57 |     operator cudaStream_t () const { return this->get_cuda_stream(); }
 58 | 
 59 |     /// assign a CUDA stream
 60 |     stream &operator=(cudaStream_t strm)
 61 |     {
 62 |         m_stream = strm;
 63 |         return *this;
 64 |     }
 65 | 
 66 |     /// Constructs or converts from a CUDA stream
 67 |     stream(const cudaStream_t &strm) : m_stream(std::in_place_index<1>, strm) {}
 68 | 
 69 |     /// Accesses the CUDA stream.
 70 |     cudaStream_t get_cuda_stream() const
 71 |     {
 72 |         const cudaStream_t *cs;
 73 |         if ((cs = std::get_if<1>(&m_stream)))
 74 |             return *cs;
 75 |         return 0; // default stream
 76 |     }
 77 | #endif
 78 | #if defined(HAMR_ENABLE_HIP)
 79 |     /// convert to a HIP stream
 80 |     operator hipStream_t () const { return this->get_hip_stream(); }
 81 | 
 82 |     /// assign a HIP stream
 83 |     stream &operator=(hipStream_t strm)
 84 |     {
 85 |         m_stream = strm;
 86 |         return *this;
 87 |     }
 88 | 
 89 |     /// Constructs or converts from a HIP stream
 90 |     stream(hipStream_t &strm) : m_stream(std::in_place_index<2>, strm) {}
 91 | 
 92 |     /// Accesses the HIP stream.
 93 |     hipStream_t get_hip_stream() const
 94 |     {
 95 |         const hipStream_t *hs;
 96 |         if ((hs = std::get_if<2>(&m_stream)))
 97 |             return *hs;
 98 |         return 0; // default stream
 99 |     }
100 | #endif
101 | 
102 |     /// synchronize the stream
103 |     int synchronize() const;
104 | 
105 |     /// evaluates true if a stream has been set
106 |     operator bool() const;
107 | 
108 |     /// sends the value of the stream to std::cerr
109 |     void print() const;
110 | 
111 |     /// convert the technology specific stream to an integer
112 |     size_t get_stream();
113 | 
114 | private:
115 |     std::variant<char, cudaStream_t, hipStream_t> m_stream;
116 | };
117 | }
118 | #endif
119 | 


--------------------------------------------------------------------------------
/hamr_cuda_malloc_uva_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_cuda_malloc_uva_allocator.h"
 4 | #include "hamr_cuda_malloc_uva_allocator_impl.h"
 5 | 
 6 | template class hamr::cuda_malloc_uva_deleter<float>;
 7 | template class hamr::cuda_malloc_uva_deleter<double>;
 8 | template class hamr::cuda_malloc_uva_deleter<char>;
 9 | template class hamr::cuda_malloc_uva_deleter<signed char>;
10 | template class hamr::cuda_malloc_uva_deleter<short>;
11 | template class hamr::cuda_malloc_uva_deleter<int>;
12 | template class hamr::cuda_malloc_uva_deleter<long>;
13 | template class hamr::cuda_malloc_uva_deleter<long long>;
14 | template class hamr::cuda_malloc_uva_deleter<unsigned char>;
15 | template class hamr::cuda_malloc_uva_deleter<unsigned short>;
16 | template class hamr::cuda_malloc_uva_deleter<unsigned int>;
17 | template class hamr::cuda_malloc_uva_deleter<unsigned long>;
18 | template class hamr::cuda_malloc_uva_deleter<unsigned long long>;
19 | 
20 | #define hamr_cuda_malloc_uva_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const float *vals, bool cudaVals); \
22 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const double *vals, bool cudaVals); \
23 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const char *vals, bool cudaVals); \
24 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const signed char *vals, bool cudaVals); \
25 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const short *vals, bool cudaVals); \
26 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const int *vals, bool cudaVals); \
27 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long *vals, bool cudaVals); \
28 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long long *vals, bool cudaVals); \
29 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned char *vals, bool cudaVals); \
30 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned short *vals, bool cudaVals); \
31 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned int *vals, bool cudaVals); \
32 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long *vals, bool cudaVals); \
33 | template std::shared_ptr<_T> hamr::cuda_malloc_uva_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long long *vals, bool cudaVals);
34 | 
35 | #define hamr_cuda_malloc_uva_allocator_instantiate(_T) \
36 | template struct hamr::cuda_malloc_uva_allocator<_T>; \
37 | hamr_cuda_malloc_uva_allocator_instantiate_members(_T)
38 | 
39 | hamr_cuda_malloc_uva_allocator_instantiate(float)
40 | hamr_cuda_malloc_uva_allocator_instantiate(double)
41 | hamr_cuda_malloc_uva_allocator_instantiate(char)
42 | hamr_cuda_malloc_uva_allocator_instantiate(signed char)
43 | hamr_cuda_malloc_uva_allocator_instantiate(short)
44 | hamr_cuda_malloc_uva_allocator_instantiate(int)
45 | hamr_cuda_malloc_uva_allocator_instantiate(long)
46 | hamr_cuda_malloc_uva_allocator_instantiate(long long)
47 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned char)
48 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned short)
49 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned int)
50 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned long)
51 | hamr_cuda_malloc_uva_allocator_instantiate(unsigned long long)
52 | 


--------------------------------------------------------------------------------
/hamr_cuda_malloc_async_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_cuda_malloc_async_allocator.h"
 4 | #include "hamr_cuda_malloc_async_allocator_impl.h"
 5 | 
 6 | template class hamr::cuda_malloc_async_deleter<float>;
 7 | template class hamr::cuda_malloc_async_deleter<double>;
 8 | template class hamr::cuda_malloc_async_deleter<char>;
 9 | template class hamr::cuda_malloc_async_deleter<signed char>;
10 | template class hamr::cuda_malloc_async_deleter<short>;
11 | template class hamr::cuda_malloc_async_deleter<int>;
12 | template class hamr::cuda_malloc_async_deleter<long>;
13 | template class hamr::cuda_malloc_async_deleter<long long>;
14 | template class hamr::cuda_malloc_async_deleter<unsigned char>;
15 | template class hamr::cuda_malloc_async_deleter<unsigned short>;
16 | template class hamr::cuda_malloc_async_deleter<unsigned int>;
17 | template class hamr::cuda_malloc_async_deleter<unsigned long>;
18 | template class hamr::cuda_malloc_async_deleter<unsigned long long>;
19 | 
20 | #define hamr_cuda_malloc_async_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const float *vals, bool cudaVals); \
22 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const double *vals, bool cudaVals); \
23 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const char *vals, bool cudaVals); \
24 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const signed char *vals, bool cudaVals); \
25 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const short *vals, bool cudaVals); \
26 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const int *vals, bool cudaVals); \
27 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long *vals, bool cudaVals); \
28 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long long *vals, bool cudaVals); \
29 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned char *vals, bool cudaVals); \
30 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned short *vals, bool cudaVals); \
31 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned int *vals, bool cudaVals); \
32 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long *vals, bool cudaVals); \
33 | template std::shared_ptr<_T> hamr::cuda_malloc_async_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long long *vals, bool cudaVals);
34 | 
35 | #define hamr_cuda_malloc_async_allocator_instantiate(_T) \
36 | template struct hamr::cuda_malloc_async_allocator<_T>; \
37 | hamr_cuda_malloc_async_allocator_instantiate_members(_T)
38 | 
39 | hamr_cuda_malloc_async_allocator_instantiate(float)
40 | hamr_cuda_malloc_async_allocator_instantiate(double)
41 | hamr_cuda_malloc_async_allocator_instantiate(char)
42 | hamr_cuda_malloc_async_allocator_instantiate(signed char)
43 | hamr_cuda_malloc_async_allocator_instantiate(short)
44 | hamr_cuda_malloc_async_allocator_instantiate(int)
45 | hamr_cuda_malloc_async_allocator_instantiate(long)
46 | hamr_cuda_malloc_async_allocator_instantiate(long long)
47 | hamr_cuda_malloc_async_allocator_instantiate(unsigned char)
48 | hamr_cuda_malloc_async_allocator_instantiate(unsigned short)
49 | hamr_cuda_malloc_async_allocator_instantiate(unsigned int)
50 | hamr_cuda_malloc_async_allocator_instantiate(unsigned long)
51 | hamr_cuda_malloc_async_allocator_instantiate(unsigned long long)
52 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | ******************************************************************************
 2 | ***                          Copyright Notice                              ***
 3 | ******************************************************************************
 4 | HAMR - Heterogeneous Accelerator Memory Resource (HAMR) 
 5 | Copyright (c) 2022, The Regents of the University of California, through
 6 | Lawrence Berkeley National Laboratory (subject to receipt of any
 7 | required approvals from the U.S. Dept. of Energy). All rights reserved.
 8 | 
 9 | If you have questions about your rights to use or distribute this software,
10 | please contact Berkeley Lab's Intellectual Property Office at
11 | IPO@lbl.gov.
12 | 
13 | NOTICE.  This Software was developed under funding from the U.S. Department
14 | of Energy and the U.S. Government consequently retains certain rights.  As
15 | such, the U.S. Government has been granted for itself and others acting on
16 | its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the
17 | Software to reproduce, distribute copies to the public, prepare derivative 
18 | works, and perform publicly and display publicly, and to permit others to do so.
19 | 
20 | 
21 | ******************************************************************************
22 | ***                          License Agreement                             ***
23 | ******************************************************************************
24 | HAMR - Heterogeneous Accelerator Memory Resource (HAMR) 
25 | Copyright (c) 2022, The Regents of the University of California, through
26 | Lawrence Berkeley National Laboratory (subject to receipt of any
27 | required approvals from the U.S. Dept. of Energy). All rights reserved.
28 | 
29 | Redistribution and use in source and binary forms, with or without
30 | modification, are permitted provided that the following conditions are met:
31 | 
32 | (1) Redistributions of source code must retain the above copyright notice,
33 | this list of conditions and the following disclaimer.
34 | 
35 | (2) Redistributions in binary form must reproduce the above copyright
36 | notice, this list of conditions and the following disclaimer in the
37 | documentation and/or other materials provided with the distribution.
38 | 
39 | (3) Neither the name of the University of California, Lawrence Berkeley
40 | National Laboratory, U.S. Dept. of Energy nor the names of its contributors
41 | may be used to endorse or promote products derived from this software
42 | without specific prior written permission.
43 | 
44 | 
45 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
46 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
47 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
49 | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
50 | CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
51 | SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
52 | INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
53 | CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
54 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
55 | POSSIBILITY OF SUCH DAMAGE.
56 | 
57 | You are under no obligation whatsoever to provide any bug fixes, patches,
58 | or upgrades to the features, functionality or performance of the source
59 | code ("Enhancements") to anyone; however, if you choose to make your
60 | Enhancements available either publicly, or directly to Lawrence Berkeley
61 | National Laboratory, without imposing a separate written license agreement
62 | for such Enhancements, then you hereby grant the following license: a
63 | non-exclusive, royalty-free perpetual license to install, use, modify,
64 | prepare derivative works, incorporate into other computer software,
65 | distribute, and sublicense such enhancements or derivative works thereof,
66 | in binary and source code form.
67 | 


--------------------------------------------------------------------------------
/test/test_hamr_pipeline_cuda_openmp_mp.cpp:
--------------------------------------------------------------------------------
  1 | #include "hamr_config.h"
  2 | #include "hamr_buffer.h"
  3 | #include "hamr_buffer_util.h"
  4 | 
  5 | #include <iostream>
  6 | 
  7 | using hamr::buffer;
  8 | using allocator = hamr::buffer_allocator;
  9 | 
 10 | // **************************************************************************
 11 | template <typename T>
 12 | hamr::buffer<T> initialize_openmp(size_t n_vals, const T &val)
 13 | {
 14 |     // allocate the memory
 15 |     hamr::buffer<T> ao(allocator::openmp, n_vals);
 16 |     T *pao = ao.data();
 17 | 
 18 |     // initialize using openmp
 19 | 
 20 |     #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao) map(to: val)
 21 |     for (size_t i = 0; i < n_vals; ++i)
 22 |     {
 23 |         pao[i] = val;
 24 |     }
 25 | 
 26 |     // print the results
 27 |     std::cerr << "initialized to an array of " << n_vals << " to " << val << std::endl;
 28 | 
 29 |     if (n_vals < 33)
 30 |     {
 31 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
 32 |         ao.print();
 33 |         std::cerr << std::endl;
 34 |     }
 35 | 
 36 |     return ao;
 37 | }
 38 | 
 39 | 
 40 | 
 41 | 
 42 | // **************************************************************************
 43 | template <typename T, typename U>
 44 | hamr::buffer<T> add_openmp(const hamr::buffer<T> &a1, const hamr::buffer<U> &a2)
 45 | {
 46 |     size_t n_vals = a1.size();
 47 | 
 48 |     // get the inputs
 49 |     auto spa1 = a1.get_openmp_accessible();
 50 |     auto pa1 = spa1.get();
 51 | 
 52 |     auto spa2 = a2.get_openmp_accessible();
 53 |     auto pa2 = spa2.get();
 54 | 
 55 |     // allocate the memory
 56 |     hamr::buffer<T> ao(allocator::openmp, n_vals, T(0));
 57 |     T *pao = ao.data();
 58 | 
 59 |     // do the calculation
 60 |     #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pa1, pa2)
 61 |     for (size_t i = 0; i < n_vals; ++i)
 62 |     {
 63 |         pao[i] = pa1[i] + pa2[i];
 64 |     }
 65 | 
 66 |     // print the results
 67 |     std::cerr << "added " << n_vals << " array " << typeid(T).name() << sizeof(T)
 68 |          << " to array  " << typeid(U).name() << sizeof(U) << std::endl;
 69 | 
 70 |     if (n_vals < 33)
 71 |     {
 72 |         std::cerr << "a1 = "; a1.print(); std::cerr << std::endl;
 73 |         std::cerr << "a2 = "; a2.print(); std::cerr << std::endl;
 74 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
 75 |     }
 76 | 
 77 |     return ao;
 78 | }
 79 | 
 80 | 
 81 | // **************************************************************************
 82 | template <typename T, typename U>
 83 | hamr::buffer<T> multiply_scalar_openmp(const hamr::buffer<T> &ai, const U &val)
 84 | {
 85 |     size_t n_vals = ai.size();
 86 | 
 87 |     // get the inputs
 88 |     auto spai = ai.get_openmp_accessible();
 89 |     auto pai = spai.get();
 90 | 
 91 |     // allocate the memory
 92 |     hamr::buffer<T> ao(allocator::openmp, n_vals, T(0));
 93 |     T *pao = ao.data();
 94 | 
 95 |     // do the calculation
 96 |     #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pai) map(to: val)
 97 |     for (size_t i = 0; i < n_vals; ++i)
 98 |     {
 99 |         pao[i] = val * pai[i];
100 |     }
101 | 
102 |     // print the results
103 |     std::cerr << "multiply_scalar " << val << " " << typeid(U).name() << sizeof(U)
104 |        << " by " << n_vals << " array " << typeid(T).name() << sizeof(T) << std::endl;
105 | 
106 |     if (n_vals < 33)
107 |     {
108 |         std::cerr << "ai = "; ai.print(); std::cerr << std::endl;
109 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
110 |     }
111 | 
112 |     return ao;
113 | }
114 | 
115 | #define instantiate_openmp_kernels_(T,U) \
116 | template buffer<T> add_openmp<T,U>(const buffer<T> &a1, const buffer<U> &a2); \
117 | template buffer<T> multiply_scalar_openmp<T,U>(const buffer<T> &ai, const U &val);
118 | 
119 | #define instantiate_openmp_kernels(T) \
120 | template buffer<T> initialize_openmp(size_t n_vals, const T &val); \
121 | instantiate_openmp_kernels_(T, float) \
122 | instantiate_openmp_kernels_(T, double)
123 | 
124 | instantiate_openmp_kernels(double)
125 | instantiate_openmp_kernels(float)
126 | 
127 | 


--------------------------------------------------------------------------------
/hamr_hip_launch.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_hip_launch_h
  2 | #define hamr_hip_launch_h
  3 | 
  4 | /// @file
  5 | 
  6 | #include "hamr_config.h"
  7 | 
  8 | #include <deque>
  9 | 
 10 | #include <hip/hip_runtime.h>
 11 | 
 12 | 
 13 | /// heterogeneous accelerator memory resource
 14 | namespace hamr
 15 | {
 16 | 
 17 | /** A flat array is broken into blocks of number of threads where each adjacent
 18 |  * thread accesses adjacent memory locations. To accomplish this we might need
 19 |  * a large number of blocks. If the number of blocks exceeds the max block
 20 |  * dimension in the first and or second block grid dimension then we need to
 21 |  * use a 2d or 3d block grid.
 22 |  *
 23 |  * ::partition_thread_blocks - decides on a partitioning of the data based on
 24 |  * warps_per_block parameter. The resulting decomposition will be either 1,2,
 25 |  * or 3D as needed to accommodate the number of fixed sized blocks. It can
 26 |  * happen that max grid dimensions are hit, in which case you'll need to
 27 |  * increase the number of warps per block.
 28 |  *
 29 |  * ::thread_id_to_array_index - given a thread and block id gets the
 30 |  * array index to update. _this may be out of bounds so be sure
 31 |  * to validate before using it.
 32 |  *
 33 |  * ::index_is_valid - test an index for validity.
 34 | */
 35 | /// @name CUDA indexing scheme
 36 | ///@{
 37 | 
 38 | /// query properties for the named CUDA device. retruns non-zero on error
 39 | HAMR_EXPORT
 40 | int get_launch_props(int device_id,
 41 |     int *block_grid_max, int &warp_size,
 42 |     int &max_warps_per_block);
 43 | 
 44 | 
 45 | /** convert a CUDA index into a flat array index using the partitioning scheme
 46 |  * defined in partition_thread_blocks
 47 |  */
 48 | inline
 49 | __device__
 50 | unsigned long thread_id_to_array_index()
 51 | {
 52 |     return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x
 53 |         + blockIdx.z * gridDim.x * gridDim.y);
 54 | }
 55 | 
 56 | /// bounds check the flat index
 57 | inline
 58 | __device__
 59 | int index_is_valid(unsigned long index, unsigned long max_index)
 60 | {
 61 |     return index < max_index;
 62 | }
 63 | 
 64 | /** Calculate CUDA launch parameters for an arbitrarily large flat array.
 65 |  *
 66 |  * @param[in]  device_id the CUDA device to use. Default values for
 67 |  *                       warps_per_block and block_grid_max are determined by
 68 |  *                       querying the capabilities of the device. If -1 is
 69 |  *                       passed then the currently active device is used.
 70 |  * @param[in]  array_size the length of the array being processed
 71 |  * @param[in]  warps_per_block number of warps to use per block (your choice).
 72 |  *                             Using a larger number here will result in fewer
 73 |  *                             blocks being processed concurrently.
 74 |  *
 75 |  * @param[out] block_grid block dimension kernel launch control
 76 |  * @param[out] n_blocks number of blocks
 77 |  * @param[out] thread_grid thread dimension kernel launch control
 78 |  *
 79 |  * @returns zero if successful and non-zero if an error occurred
 80 |  */
 81 | HAMR_EXPORT
 82 | int partition_thread_blocks(int device_id, size_t array_size,
 83 |     int warps_per_block, dim3 &block_grid, int &n_blocks,
 84 |     dim3 &thread_grid);
 85 | 
 86 | /** Calculate CUDA launch parameters for an arbitrarily large flat array. See
 87 |  * ::get_launch_props for determining the correct values for warp_size and
 88 |  * block_grid_max.
 89 |  *
 90 |  * @param[in]  array_size      The length of the array being processed
 91 |  * @param[in]  warp_size       The number of threads per warp supported on the device
 92 |  * @param[in]  warps_per_block The number of warps to use per block (your choice)
 93 |  * @param[in]  block_grid_max  The maximum number of blocks, in 3-dimensions,
 94 |  *                             supported by the device
 95 |  * @param[out] block_grid      The block grid dimension kernel launch control parameter
 96 |  * @param[out] n_blocks        The total number of blocks that will be launched
 97 |  * @param[out] thread_grid     The thread grid dimension kernel launch control parameter
 98 |  *
 99 |  * @returns zero if successful and non-zero if an error occurred
100 |  */
101 | HAMR_EXPORT
102 | int partition_thread_blocks(size_t array_size,
103 |     int warps_per_block, int warp_size, int *block_grid_max,
104 |     dim3 &block_grid, int &n_blocks, dim3 &thread_grid);
105 | }
106 | 
107 | ///@}
108 | #endif
109 | 


--------------------------------------------------------------------------------
/hamr_cuda_launch.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_cuda_launch_h
  2 | #define hamr_cuda_launch_h
  3 | 
  4 | /// @file
  5 | 
  6 | #include "hamr_config.h"
  7 | 
  8 | #include <deque>
  9 | 
 10 | #include <cuda.h>
 11 | #include <cuda_runtime.h>
 12 | 
 13 | /// heterogeneous accelerator memory resource
 14 | namespace hamr
 15 | {
 16 | 
 17 | /** A flat array is broken into blocks of number of threads where each adjacent
 18 |  * thread accesses adjacent memory locations. To accomplish this we might need
 19 |  * a large number of blocks. If the number of blocks exceeds the max block
 20 |  * dimension in the first and or second block grid dimension then we need to
 21 |  * use a 2d or 3d block grid.
 22 |  *
 23 |  * ::partition_thread_blocks - decides on a partitioning of the data based on
 24 |  * warps_per_block parameter. The resulting decomposition will be either 1,2,
 25 |  * or 3D as needed to accommodate the number of fixed sized blocks. It can
 26 |  * happen that max grid dimensions are hit, in which case you'll need to
 27 |  * increase the number of warps per block.
 28 |  *
 29 |  * ::thread_id_to_array_index - given a thread and block id gets the
 30 |  * array index to update. _this may be out of bounds so be sure
 31 |  * to validate before using it.
 32 |  *
 33 |  * ::index_is_valid - test an index for validity.
 34 | */
 35 | /// @name CUDA indexing scheme
 36 | ///@{
 37 | 
 38 | /// query properties for the named CUDA device. retruns non-zero on error
 39 | HAMR_EXPORT
 40 | int get_launch_props(int device_id,
 41 |     int *block_grid_max, int &warp_size,
 42 |     int &max_warps_per_block);
 43 | 
 44 | 
 45 | /** convert a CUDA index into a flat array index using the partitioning scheme
 46 |  * defined in partition_thread_blocks
 47 |  */
 48 | inline
 49 | __device__
 50 | unsigned long thread_id_to_array_index()
 51 | {
 52 |     return threadIdx.x + blockDim.x*(blockIdx.x + blockIdx.y * gridDim.x
 53 |         + blockIdx.z * gridDim.x * gridDim.y);
 54 | }
 55 | 
 56 | /// bounds check the flat index
 57 | inline
 58 | __device__
 59 | int index_is_valid(unsigned long index, unsigned long max_index)
 60 | {
 61 |     return index < max_index;
 62 | }
 63 | 
 64 | /** Calculate CUDA launch parameters for an arbitrarily large flat array.
 65 |  *
 66 |  * @param[in]  device_id the CUDA device to use. Default values for
 67 |  *                       warps_per_block and block_grid_max are determined by
 68 |  *                       querying the capabilities of the device. If -1 is
 69 |  *                       passed then the currently active device is used.
 70 |  * @param[in]  array_size the length of the array being processed
 71 |  * @param[in]  warps_per_block number of warps to use per block (your choice).
 72 |  *                             Using a larger number here will result in fewer
 73 |  *                             blocks being processed concurrently.
 74 |  *
 75 |  * @param[out] block_grid block dimension kernel launch control
 76 |  * @param[out] n_blocks number of blocks
 77 |  * @param[out] thread_grid thread dimension kernel launch control
 78 |  *
 79 |  * @returns zero if successful and non-zero if an error occurred
 80 |  */
 81 | HAMR_EXPORT
 82 | int partition_thread_blocks(int device_id, size_t array_size,
 83 |     int warps_per_block, dim3 &block_grid, int &n_blocks,
 84 |     dim3 &thread_grid);
 85 | 
 86 | /** Calculate CUDA launch parameters for an arbitrarily large flat array. See
 87 |  * ::get_launch_props for determining the correct values for warp_size and
 88 |  * block_grid_max.
 89 |  *
 90 |  * @param[in]  array_size      The length of the array being processed
 91 |  * @param[in]  warp_size       The number of threads per warp supported on the device
 92 |  * @param[in]  warps_per_block The number of warps to use per block (your choice)
 93 |  * @param[in]  block_grid_max  The maximum number of blocks, in 3-dimensions,
 94 |  *                             supported by the device
 95 |  * @param[out] block_grid      The block grid dimension kernel launch control parameter
 96 |  * @param[out] n_blocks        The total number of blocks that will be launched
 97 |  * @param[out] thread_grid     The thread grid dimension kernel launch control parameter
 98 |  *
 99 |  * @returns zero if successful and non-zero if an error occurred
100 |  */
101 | HAMR_EXPORT
102 | int partition_thread_blocks(size_t array_size,
103 |     int warps_per_block, int warp_size, int *block_grid_max,
104 |     dim3 &block_grid, int &n_blocks, dim3 &thread_grid);
105 | }
106 | 
107 | ///@}
108 | #endif
109 | 


--------------------------------------------------------------------------------
/hamr_buffer_allocator.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_buffer_allocator_h
  2 | #define hamr_buffer_allocator_h
  3 | 
  4 | ///@file
  5 | 
  6 | #include "hamr_config.h"
  7 | #include <cassert>
  8 | 
  9 | namespace hamr
 10 | {
 11 | 
 12 | /// allocator types that may be used with hamr::buffer
 13 | enum class buffer_allocator
 14 | {
 15 |     same = -2,     ///< propagate the current allocator
 16 |     none = -1,     ///< no allocator specified
 17 |     cpp = 0,       ///< allocates memory with new
 18 |     malloc = 1,    ///< allocates memory with malloc
 19 |     cuda = 2,      ///< allocates memory with cudaMalloc
 20 |     cuda_async = 3,///< allocates memory with cudaMallocAsync
 21 |     cuda_uva = 4,  ///< allocates memory with cudaMallocManaged
 22 |     cuda_host = 5, ///< allocates memory with cudaMallocHost
 23 |     hip = 6,       ///< allocates memory with hipMalloc
 24 |     hip_uva = 7,   ///< allocates memory with hipMallocManaged
 25 |     openmp = 8     ///< allocates memory with OpenMP device offload API
 26 | };
 27 | 
 28 | /// return the human readable name of the allocator
 29 | HAMR_EXPORT
 30 | const char *get_allocator_name(buffer_allocator alloc);
 31 | 
 32 | /// @returns true if the allocator creates host accessible memory
 33 | inline
 34 | HAMR_EXPORT
 35 | int host_accessible(buffer_allocator alloc)
 36 | {
 37 |     return (alloc == buffer_allocator::cpp) ||
 38 |         (alloc == buffer_allocator::malloc) ||
 39 |         (alloc == buffer_allocator::cuda_uva) ||
 40 |         (alloc == buffer_allocator::cuda_host) ||
 41 |         (alloc == buffer_allocator::hip_uva);
 42 | }
 43 | 
 44 | /// @returns true if the allocator creates CUDA accessible memory
 45 | inline
 46 | HAMR_EXPORT
 47 | int cuda_accessible(buffer_allocator alloc)
 48 | {
 49 |     return (alloc == buffer_allocator::cuda) ||
 50 |         (alloc == buffer_allocator::cuda_async) ||
 51 |         (alloc == buffer_allocator::cuda_uva) ||
 52 |         (alloc == buffer_allocator::hip) ||
 53 |         (alloc == buffer_allocator::hip_uva) ||
 54 |         (alloc == buffer_allocator::openmp);
 55 | }
 56 | 
 57 | /// @returns true if the allocator creates HIP accessible memory
 58 | inline
 59 | HAMR_EXPORT
 60 | int hip_accessible(buffer_allocator alloc)
 61 | {
 62 |     return (alloc == buffer_allocator::cuda) ||
 63 |         (alloc == buffer_allocator::cuda_async) ||
 64 |         (alloc == buffer_allocator::cuda_uva) ||
 65 |         (alloc == buffer_allocator::hip) ||
 66 |         (alloc == buffer_allocator::hip_uva);
 67 | }
 68 | 
 69 | /// @returns true if the allocator creates OPENMP accessible memory
 70 | inline
 71 | HAMR_EXPORT
 72 | int openmp_accessible(buffer_allocator alloc)
 73 | {
 74 |     return (alloc == buffer_allocator::cuda) ||
 75 |         (alloc == buffer_allocator::cuda_async) ||
 76 |         (alloc == buffer_allocator::cuda_uva) ||
 77 |         (alloc == buffer_allocator::openmp);
 78 | }
 79 | 
 80 | /// asserts that the passed value is one of the known allocators
 81 | inline
 82 | HAMR_EXPORT
 83 | void assert_valid_allocator(buffer_allocator alloc)
 84 | {
 85 |     (void) alloc;
 86 |     assert((alloc == buffer_allocator::cpp)
 87 |         || (alloc == buffer_allocator::malloc)
 88 | #if defined(HAMR_ENABLE_CUDA)
 89 |         || (alloc == buffer_allocator::cuda)
 90 |         || (alloc == buffer_allocator::cuda_async)
 91 |         || (alloc == buffer_allocator::cuda_uva)
 92 |         || (alloc == buffer_allocator::cuda_host)
 93 | #endif
 94 | #if defined(HAMR_ENABLE_HIP)
 95 |         || (alloc == buffer_allocator::hip)
 96 |         || (alloc == buffer_allocator::hip_uva)
 97 | #endif
 98 | #if defined(HAMR_ENABLE_OPENMP)
 99 |         || (alloc == buffer_allocator::openmp)
100 | #endif
101 |         );
102 | }
103 | 
104 | /// get the allocator type most suitable for the current build configuration.
105 | inline HAMR_EXPORT buffer_allocator get_device_allocator()
106 | {
107 | #if defined(HAMR_ENABLE_CUDA)
108 |     return buffer_allocator::cuda_async;
109 | #elif defined(HAMR_ENABLE_HIP)
110 |     return buffer_allocator::hip;
111 | #elif defined(HAMR_ENABLE_OPENMP)
112 |     return buffer_allocator::openmp;
113 | #else
114 |     return buffer_allocator::malloc;
115 | #endif
116 | }
117 | 
118 | /// get the allocator type most suitable for the current build configuration.
119 | inline HAMR_EXPORT buffer_allocator get_host_allocator()
120 | {
121 | #if defined(HAMR_ENABLE_CUDA)
122 |     return buffer_allocator::cuda_host;
123 | #elif defined(HAMR_ENABLE_HIP)
124 |     return buffer_allocator::malloc;
125 | #elif defined(HAMR_ENABLE_OPENMP)
126 |     return buffer_allocator::malloc;
127 | #else
128 |     return buffer_allocator::malloc;
129 | #endif
130 | }
131 | 
132 | }
133 | 
134 | #endif
135 | 


--------------------------------------------------------------------------------
/hamr_malloc_allocator.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_malloc_allocator_h
  2 | #define hamr_malloc_allocator_h
  3 | 
  4 | #include "hamr_config.h"
  5 | 
  6 | #include <memory>
  7 | #include <type_traits>
  8 | 
  9 | namespace hamr
 10 | {
 11 | /// a deleter for arrays allocated with malloc
 12 | template <typename T, typename E = void>
 13 | class malloc_deleter {};
 14 | 
 15 | /// a deleter for arrays allocated with malloc, specialized for objects
 16 | template <typename T>
 17 | class HAMR_EXPORT malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 18 | {
 19 | public:
 20 |     /** constructs the deleter
 21 |      * @param[in] ptr the pointer to the array to delete
 22 |      * @param[in] n the number of elements in the array
 23 |      */
 24 |     malloc_deleter(T *ptr, size_t n);
 25 | 
 26 |     /** deletes the array
 27 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 28 |      *                that passed during construction.
 29 |      */
 30 |     void operator()(T *ptr);
 31 | 
 32 | private:
 33 |     T *m_ptr;
 34 |     size_t m_elem;
 35 | };
 36 | 
 37 | 
 38 | 
 39 | 
 40 | 
 41 | 
 42 | /// a deleter for arrays allocated with malloc, specialized for numbers
 43 | template <typename T>
 44 | class HAMR_EXPORT malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
 45 | {
 46 | public:
 47 |     /** constructs the deleter
 48 |      * @param[in] ptr the pointer to the array to delete
 49 |      * @param[in] n the number of elements in the array
 50 |      */
 51 |     malloc_deleter(T *ptr, size_t n);
 52 | 
 53 |     /** deletes the array
 54 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 55 |      *                that passed during construction.
 56 |      */
 57 |     void operator()(T *ptr);
 58 | 
 59 | private:
 60 |     T *m_ptr;
 61 |     size_t m_elem;
 62 | };
 63 | 
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 
 69 | /// a class for allocating arrays with malloc
 70 | template <typename T, typename E = void>
 71 | struct malloc_allocator {};
 72 | 
 73 | /// a class for allocating arrays with malloc, specialized for objects
 74 | template <typename T>
 75 | struct HAMR_EXPORT malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 76 | {
 77 |     /** allocate an array of n elements.
 78 |      * @param[in] n the number of elements to allocate
 79 |      * @returns a shared pointer to the array that holds a deleter for the memory
 80 |      */
 81 |     static std::shared_ptr<T> allocate(size_t n) HAMR_EXPORT;
 82 | 
 83 |     /** allocate an array of n elements.
 84 |      * @param[in] n the number of elements to allocate
 85 |      * @param[in] val a value to initialize the elements to
 86 |      * @returns a shared pointer to the array that holds a deleter for the memory
 87 |      */
 88 | 
 89 |     static std::shared_ptr<T> allocate(size_t n, const T &val) HAMR_EXPORT;
 90 | 
 91 |     /** allocate an array of n elements.
 92 |      * @param[in] n the number of elements to allocate
 93 |      * @param[in] vals an array of n elements to initialize the elements with
 94 |      * @returns a shared pointer to the array that holds a deleter for the memory
 95 |      */
 96 |     template <typename U>
 97 |     static std::shared_ptr<T> allocate(size_t n, const U *vals) HAMR_EXPORT;
 98 | };
 99 | 
100 | 
101 | 
102 | 
103 | 
104 | /// a class for allocating arrays with malloc, specialized for numbers
105 | template <typename T>
106 | struct HAMR_EXPORT malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
107 | {
108 |     /** allocate an array of n elements.
109 |      * @param[in] n the number of elements to allocate
110 |      * @returns a shared pointer to the array that holds a deleter for the memory
111 |      */
112 |     static std::shared_ptr<T> allocate(size_t n) HAMR_EXPORT;
113 | 
114 |     /** allocate an array of n elements.
115 |      * @param[in] n the number of elements to allocate
116 |      * @param[in] val a value to initialize the elements to
117 |      * @returns a shared pointer to the array that holds a deleter for the memory
118 |      */
119 |     static std::shared_ptr<T> allocate(size_t n, const T &val) HAMR_EXPORT;
120 | 
121 |     /** allocate an array of n elements.
122 |      * @param[in] n the number of elements to allocate
123 |      * @param[in] vals an array of n elements to initialize the elements with
124 |      * @returns a shared pointer to the array that holds a deleter for the memory
125 |      */
126 |     template <typename U>
127 |     static std::shared_ptr<T> allocate(size_t n, const U *vals) HAMR_EXPORT;
128 | };
129 | 
130 | }
131 | 
132 | #if !defined(HAMR_SEPARATE_IMPL)
133 | #include "hamr_malloc_allocator_impl.h"
134 | #endif
135 | 
136 | #endif
137 | 


--------------------------------------------------------------------------------
/hamr_openmp_allocator.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_openmp_allocator_h
  2 | #define hamr_openmp_allocator_h
  3 | 
  4 | #include "hamr_config.h"
  5 | #include <type_traits>
  6 | #include <memory>
  7 | 
  8 | namespace hamr
  9 | {
 10 | /// a deleter for arrays allocated with OpenMP
 11 | template <typename T, typename E = void>
 12 | class openmp_deleter {};
 13 | 
 14 | /// a deleter for arrays allocated with OpenMP, specialized for objects
 15 | template <typename T>
 16 | class HAMR_EXPORT openmp_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 17 | {
 18 | public:
 19 |     /** constructs the deleter
 20 |      * @param[in] ptr the pointer to the array to delete
 21 |      * @param[in] n the number of elements in the array
 22 |      */
 23 |     openmp_deleter(T *ptr, size_t n, int dev);
 24 | 
 25 |     /** deletes the array
 26 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 27 |      *                that passed during construction.
 28 |      */
 29 |     void operator()(T *ptr);
 30 | 
 31 | private:
 32 |     T *m_ptr;
 33 |     size_t m_elem;
 34 |     int m_dev;
 35 | };
 36 | 
 37 | 
 38 | 
 39 | 
 40 | 
 41 | 
 42 | /// a deleter for arrays allocated with OpenMP, specialized for numbers
 43 | template <typename T>
 44 | class HAMR_EXPORT openmp_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
 45 | {
 46 | public:
 47 |     /** constructs the deleter
 48 |      * @param[in] ptr the pointer to the array to delete
 49 |      * @param[in] n the number of elements in the array
 50 |      */
 51 |     openmp_deleter(T *ptr, size_t n, int dev);
 52 | 
 53 |     /** deletes the array
 54 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 55 |      *                that passed during construction.
 56 |      */
 57 |     void operator()(T *ptr);
 58 | 
 59 | private:
 60 |     T *m_ptr;
 61 |     size_t m_elem;
 62 |     int m_dev;
 63 | };
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 
 69 | 
 70 | 
 71 | /// a class for allocating arrays with OpenMP
 72 | template <typename T, typename E = void>
 73 | struct openmp_allocator {};
 74 | 
 75 | /// a class for allocating arrays with OpenMP, specialized for objects
 76 | template <typename T>
 77 | struct HAMR_EXPORT openmp_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 78 | {
 79 |     /** allocate an array of n elements.
 80 |      * @param[in] n the number of elements to allocate
 81 |      * @returns a shared pointer to the array that holds a deleter for the memory
 82 |      */
 83 |     static std::shared_ptr<T> allocate(size_t n) HAMR_EXPORT;
 84 | 
 85 |     /** allocate an array of n elements.
 86 |      * @param[in] n the number of elements to allocate
 87 |      * @param[in] val a value to initialize the elements to
 88 |      * @returns a shared pointer to the array that holds a deleter for the memory
 89 |      */
 90 | 
 91 |     static std::shared_ptr<T> allocate(size_t n, const T &val) HAMR_EXPORT;
 92 | 
 93 |     /** allocate an array of n elements.
 94 |      * @param[in] n the number of elements to allocate
 95 |      * @param[in] vals an array of n elements to initialize the elements with
 96 |      * @returns a shared pointer to the array that holds a deleter for the memory
 97 |      */
 98 |     template <typename U>
 99 |     static std::shared_ptr<T> allocate(size_t n, const U *vals) HAMR_EXPORT;
100 | };
101 | 
102 | 
103 | 
104 | 
105 | 
106 | 
107 | /// a class for allocating arrays with OpenMP, specialized for numbers
108 | template <typename T>
109 | struct HAMR_EXPORT openmp_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
110 | {
111 |     /** allocate an array of n elements.
112 |      * @param[in] n the number of elements to allocate
113 |      * @returns a shared pointer to the array that holds a deleter for the memory
114 |      */
115 |     static std::shared_ptr<T> allocate(size_t n) HAMR_EXPORT;
116 | 
117 |     /** allocate an array of n elements.
118 |      * @param[in] n the number of elements to allocate
119 |      * @param[in] val a value to initialize the elements to
120 |      * @returns a shared pointer to the array that holds a deleter for the memory
121 |      */
122 |     static std::shared_ptr<T> allocate(size_t n, const T &val) HAMR_EXPORT;
123 | 
124 |     /** allocate an array of n elements.
125 |      * @param[in] n the number of elements to allocate
126 |      * @param[in] vals an array of n elements to initialize the elements with
127 |      * @returns a shared pointer to the array that holds a deleter for the memory
128 |      */
129 |     template <typename U>
130 |     static std::shared_ptr<T> allocate(size_t n, const U *vals) HAMR_EXPORT;
131 | };
132 | 
133 | }
134 | 
135 | #if !defined(HAMR_SEPARATE_IMPL)
136 | #include "hamr_openmp_allocator_impl.h"
137 | #endif
138 | 
139 | #endif
140 | 


--------------------------------------------------------------------------------
/hamr_hip_malloc_allocator.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_hip_malloc_allocator_h
  2 | #define hamr_hip_malloc_allocator_h
  3 | 
  4 | #include "hamr_config.h"
  5 | #include <type_traits>
  6 | #include <memory>
  7 | 
  8 | namespace hamr
  9 | {
 10 | 
 11 | /// a deleter for arrays allocated with hip_malloc
 12 | template <typename T, typename E = void>
 13 | class hip_malloc_deleter {};
 14 | 
 15 | /// a deleter for arrays allocated with hip_malloc, specialized for objects
 16 | template <typename T>
 17 | class HAMR_EXPORT hip_malloc_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 18 | {
 19 | public:
 20 |     /** constructs the deleter
 21 |      * @param[in] ptr the pointer to the array to delete
 22 |      * @param[in] n   the number of elements in the array
 23 |      */
 24 |     hip_malloc_deleter(T *ptr, size_t n);
 25 | 
 26 |     /** deletes the array
 27 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 28 |      *                that passed during construction.
 29 |      */
 30 |     void operator()(T *ptr);
 31 | 
 32 | private:
 33 |     T *m_ptr;
 34 |     size_t m_elem;
 35 | };
 36 | 
 37 | 
 38 | 
 39 | 
 40 | 
 41 | 
 42 | /// a deleter for arrays allocated with hip_malloc, specialized for numbers
 43 | template <typename T>
 44 | class HAMR_EXPORT hip_malloc_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
 45 | {
 46 | public:
 47 |     /** constructs the deleter
 48 |      * @param[in] ptr the pointer to the array to delete
 49 |      * @param[in] n the number of elements in the array
 50 |      */
 51 |     hip_malloc_deleter(T *ptr, size_t n);
 52 | 
 53 |     /** deletes the array
 54 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 55 |      *                that passed during construction.
 56 |      */
 57 |     void operator()(T *ptr);
 58 | 
 59 | private:
 60 |     T *m_ptr;
 61 |     size_t m_elem;
 62 | };
 63 | 
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 
 69 | /// a class for allocating arrays with hip_malloc
 70 | template <typename T, typename E = void>
 71 | struct hip_malloc_allocator {};
 72 | 
 73 | /// a class for allocating arrays with hip_malloc, specialized for objects
 74 | template <typename T>
 75 | struct HAMR_EXPORT hip_malloc_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 76 | {
 77 |     /** allocate an array of n elements.
 78 |      * @param[in] n the number of elements to allocate
 79 |      * @returns a shared pointer to the array that holds a deleter for the
 80 |      * memory
 81 |      */
 82 |     static std::shared_ptr<T> allocate(size_t n);
 83 | 
 84 |     /** allocate an array of n elements.
 85 |      * @param[in] n the number of elements to allocate
 86 |      * @param[in] val a value to initialize the elements to
 87 |      * @returns a shared pointer to the array that holds a deleter for the
 88 |      * memory
 89 |      */
 90 |     static std::shared_ptr<T> allocate(size_t n, const T &val);
 91 | 
 92 |     /** allocate an array of n elements.
 93 |      * @param[in] n the number of elements to allocate
 94 |      * @param[in] vals an array of values to initialize the elements with
 95 |      * @param[in] hipVals a flag set to true if vals are accessible by codes
 96 |      *                     running in HIP
 97 |      * @returns a shared pointer to the array that holds a deleter for the
 98 |      * memory
 99 |      */
100 |     template <typename U>
101 |     static std::shared_ptr<T> allocate(size_t n, const U *vals, bool hipVals = false);
102 | };
103 | 
104 | 
105 | 
106 | 
107 | 
108 | /// a class for allocating arrays with hip_malloc, specialized for numbers
109 | template <typename T>
110 | struct HAMR_EXPORT hip_malloc_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
111 | {
112 |     /** allocate an array of n elements.
113 |      * @param[in] n the number of elements to allocate
114 |      * @returns a shared pointer to the array that holds a deleter for the
115 |      * memory
116 |      */
117 |     static std::shared_ptr<T> allocate(size_t n);
118 | 
119 |     /** allocate an array of n elements.
120 |      * @param[in] n the number of elements to allocate
121 |      * @param[in] val a value to initialize the elements to
122 |      * @returns a shared pointer to the array that holds a deleter for the
123 |      * memory
124 |      */
125 |     static std::shared_ptr<T> allocate(size_t n, const T &val);
126 | 
127 |     /** allocate an array of n elements.
128 |      * @param[in] n the number of elements to allocate
129 |      * @param[in] vals an array of values to initialize the elements with
130 |      * @param[in] hipVals a flag set to true if vals are accessible by codes
131 |      *                     running in HIP
132 |      * @returns a shared pointer to the array that holds a
133 |      * deleter for the memory
134 |      */
135 |     template <typename U>
136 |     static std::shared_ptr<T> allocate(size_t n, const U *vals, bool hipVals = false);
137 | };
138 | 
139 | }
140 | 
141 | #if !defined(HAMR_SEPARATE_IMPL)
142 | #include "hamr_hip_malloc_allocator_impl.h"
143 | #endif
144 | 
145 | #endif
146 | 


--------------------------------------------------------------------------------
/hamr_cuda_malloc_allocator.cxx:
--------------------------------------------------------------------------------
 1 | #include "hamr_config.h"
 2 | 
 3 | #include "hamr_cuda_malloc_allocator.h"
 4 | #include "hamr_cuda_malloc_allocator_impl.h"
 5 | 
 6 | template class hamr::cuda_malloc_deleter<float>;
 7 | template class hamr::cuda_malloc_deleter<double>;
 8 | template class hamr::cuda_malloc_deleter<char>;
 9 | template class hamr::cuda_malloc_deleter<signed char>;
10 | template class hamr::cuda_malloc_deleter<short>;
11 | template class hamr::cuda_malloc_deleter<int>;
12 | template class hamr::cuda_malloc_deleter<long>;
13 | template class hamr::cuda_malloc_deleter<long long>;
14 | template class hamr::cuda_malloc_deleter<unsigned char>;
15 | template class hamr::cuda_malloc_deleter<unsigned short>;
16 | template class hamr::cuda_malloc_deleter<unsigned int>;
17 | template class hamr::cuda_malloc_deleter<unsigned long>;
18 | template class hamr::cuda_malloc_deleter<unsigned long long>;
19 | 
20 | #define hamr_cuda_malloc_allocator_instantiate_members(_T) \
21 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const float *vals, bool cudaVals); \
22 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const double *vals, bool cudaVals); \
23 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const char *vals, bool cudaVals); \
24 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const signed char *vals, bool cudaVals); \
25 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const short *vals, bool cudaVals); \
26 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const int *vals, bool cudaVals); \
27 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const long *vals, bool cudaVals); \
28 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const long long *vals, bool cudaVals); \
29 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned char *vals, bool cudaVals); \
30 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned short *vals, bool cudaVals); \
31 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned int *vals, bool cudaVals); \
32 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned long *vals, bool cudaVals); \
33 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(size_t n, const unsigned long long *vals, bool cudaVals); \
34 | \
35 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const float *vals, bool cudaVals); \
36 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const double *vals, bool cudaVals); \
37 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const char *vals, bool cudaVals); \
38 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const signed char *vals, bool cudaVals); \
39 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const short *vals, bool cudaVals); \
40 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const int *vals, bool cudaVals); \
41 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long *vals, bool cudaVals); \
42 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const long long *vals, bool cudaVals); \
43 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned char *vals, bool cudaVals); \
44 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned short *vals, bool cudaVals); \
45 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned int *vals, bool cudaVals); \
46 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long *vals, bool cudaVals); \
47 | template std::shared_ptr<_T> hamr::cuda_malloc_allocator<_T>::allocate(cudaStream_t strm, size_t n, const unsigned long long *vals, bool cudaVals);
48 | 
49 | #define hamr_cuda_malloc_allocator_instantiate(_T) \
50 | template struct hamr::cuda_malloc_allocator<_T>; \
51 | hamr_cuda_malloc_allocator_instantiate_members(_T)
52 | 
53 | hamr_cuda_malloc_allocator_instantiate(float)
54 | hamr_cuda_malloc_allocator_instantiate(double)
55 | hamr_cuda_malloc_allocator_instantiate(char)
56 | hamr_cuda_malloc_allocator_instantiate(signed char)
57 | hamr_cuda_malloc_allocator_instantiate(short)
58 | hamr_cuda_malloc_allocator_instantiate(int)
59 | hamr_cuda_malloc_allocator_instantiate(long)
60 | hamr_cuda_malloc_allocator_instantiate(long long)
61 | hamr_cuda_malloc_allocator_instantiate(unsigned char)
62 | hamr_cuda_malloc_allocator_instantiate(unsigned short)
63 | hamr_cuda_malloc_allocator_instantiate(unsigned int)
64 | hamr_cuda_malloc_allocator_instantiate(unsigned long)
65 | hamr_cuda_malloc_allocator_instantiate(unsigned long long)
66 | 


--------------------------------------------------------------------------------
/hamr_cuda_malloc_host_allocator.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_cuda_malloc_host_allocator_h
  2 | #define hamr_cuda_malloc_host_allocator_h
  3 | 
  4 | #include "hamr_config.h"
  5 | #include <type_traits>
  6 | #include <memory>
  7 | 
  8 | namespace hamr
  9 | {
 10 | /// a deleter for arrays allocated with cudaMallocHost
 11 | template <typename T, typename E = void>
 12 | class cuda_malloc_host_deleter {};
 13 | 
 14 | /// a deleter for arrays allocated with cudaMallocHost, specialized for objects
 15 | template <typename T>
 16 | class HAMR_EXPORT cuda_malloc_host_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 17 | {
 18 | public:
 19 |     /** constructs the deleter
 20 |      * @param[in] ptr the pointer to the array to delete
 21 |      * @param[in] n the number of elements in the array
 22 |      */
 23 |     cuda_malloc_host_deleter(T *ptr, size_t n);
 24 | 
 25 |     /** deletes the array
 26 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 27 |      *                that passed during construction.
 28 |      */
 29 |     void operator()(T *ptr);
 30 | 
 31 | private:
 32 |     T *m_ptr;
 33 |     size_t m_elem;
 34 | };
 35 | 
 36 | 
 37 | 
 38 | 
 39 | 
 40 | 
 41 | /// a deleter for arrays allocated with cudaMallocHost, specialized for numbers
 42 | template <typename T>
 43 | class HAMR_EXPORT cuda_malloc_host_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
 44 | {
 45 | public:
 46 |     /** constructs the deleter
 47 |      * @param[in] ptr the pointer to the array to delete
 48 |      * @param[in] n the number of elements in the array
 49 |      */
 50 |     cuda_malloc_host_deleter(T *ptr, size_t n);
 51 | 
 52 |     /** deletes the array
 53 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 54 |      *                that passed during construction.
 55 |      */
 56 |     void operator()(T *ptr);
 57 | 
 58 | private:
 59 |     T *m_ptr;
 60 |     size_t m_elem;
 61 | };
 62 | 
 63 | 
 64 | 
 65 | 
 66 | 
 67 | 
 68 | /** A class for allocating arrays with cudaMallocHost.  Use this allocator for
 69 |  * host accessible memory when you want to overlap data movement and computation
 70 |  * with CUDA.
 71 |  */
 72 | template <typename T, typename E = void>
 73 | struct cuda_malloc_host_allocator {};
 74 | 
 75 | /** a class for allocating arrays with cudaMallocHost, specialized for objects
 76 |  * Use this allocator for host accessible memory when you want to overlap data movement and computation
 77 |  * with CUDA
 78 |  */
 79 | template <typename T>
 80 | struct HAMR_EXPORT cuda_malloc_host_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 81 | {
 82 |     /** allocate an array of n elements.
 83 |      * @param[in] n the number of elements to allocate
 84 |      * @returns a shared pointer to the array that holds a deleter for the memory
 85 |      */
 86 |     static std::shared_ptr<T> allocate(size_t n) HAMR_EXPORT;
 87 | 
 88 |     /** allocate an array of n elements.
 89 |      * @param[in] n the number of elements to allocate
 90 |      * @param[in] val a value to initialize the elements to
 91 |      * @returns a shared pointer to the array that holds a deleter for the memory
 92 |      */
 93 |     static std::shared_ptr<T> allocate(size_t n, const T &val) HAMR_EXPORT;
 94 | 
 95 |     /** allocate an array of n elements.
 96 |      * @param[in] n the number of elements to allocate
 97 |      * @param[in] vals an array of n elements to initialize the elements with
 98 |      * @returns a shared pointer to the array that holds a deleter for the memory
 99 |      */
100 |     template <typename U>
101 |     static std::shared_ptr<T> allocate(size_t n, const U *vals) HAMR_EXPORT;
102 | };
103 | 
104 | 
105 | 
106 | 
107 | 
108 | /** a class for allocating arrays with cudaMallocHost, specialized for numbers.
109 |  * Use this allocator for host accessible memory when you want to overlap data
110 |  * movement and computation with CUDA
111 |  */
112 | template <typename T>
113 | struct HAMR_EXPORT cuda_malloc_host_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
114 | {
115 |     /** allocate an array of n elements.
116 |      * @param[in] n the number of elements to allocate
117 |      * @returns a shared pointer to the array that holds a deleter for the memory
118 |      */
119 |     static std::shared_ptr<T> allocate(size_t n) HAMR_EXPORT;
120 | 
121 |     /** allocate an array of n elements.
122 |      * @param[in] n the number of elements to allocate
123 |      * @param[in] val a value to initialize the elements to
124 |      * @returns a shared pointer to the array that holds a deleter for the memory
125 |      */
126 |     static std::shared_ptr<T> allocate(size_t n, const T &val) HAMR_EXPORT;
127 | 
128 |     /** allocate an array of n elements.
129 |      * @param[in] n the number of elements to allocate
130 |      * @param[in] vals an array of n elements to initialize the elements with
131 |      * @returns a shared pointer to the array that holds a deleter for the memory
132 |      */
133 |     template <typename U>
134 |     static std::shared_ptr<T> allocate(size_t n, const U *vals) HAMR_EXPORT;
135 | };
136 | 
137 | }
138 | 
139 | #if !defined(HAMR_SEPARATE_IMPL)
140 | #include "hamr_cuda_malloc_host_allocator_impl.h"
141 | #endif
142 | 
143 | #endif
144 | 


--------------------------------------------------------------------------------
/hamr_hip_kernels.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_hip_kernels_h
  2 | #define hamr_hip_kernels_h
  3 | 
  4 | #include "hamr_config.h"
  5 | #include "hamr_env.h"
  6 | #include "hamr_hip_launch.h"
  7 | 
  8 | namespace hamr
  9 | {
 10 | 
 11 | namespace hip_kernels
 12 | {
 13 | 
 14 | /// helpers to get the printf code given a POD type
 15 | template <typename T> struct printf_tt {};
 16 | 
 17 | #define declare_printf_tt(cpp_t, print_t, code, len)\
 18 | /** printf code wrapper for cpp_t */                \
 19 | template <> struct printf_tt<cpp_t>                 \
 20 | {                                                   \
 21 |     /** cast from cpp_t to print_t */               \
 22 |     __device__                                      \
 23 |     static print_t get_value(cpp_t v)               \
 24 |     { return v; }                                   \
 25 |                                                     \
 26 |     /** returns the printf code for cpp_t */        \
 27 |     __device__                                      \
 28 |     static const char *get_code()                   \
 29 |     { return code; }                                \
 30 |                                                     \
 31 |     /** copies the printf code */                   \
 32 |     __device__                                      \
 33 |     static void copy_code(char *dest)               \
 34 |     {                                               \
 35 |         for (int i = 0; i < len; ++i)               \
 36 |             dest[i] = get_code()[i];                \
 37 |     }                                               \
 38 |                                                     \
 39 |     /** returns the length of the printf code */    \
 40 |     __device__                                      \
 41 |     static int get_code_len()                       \
 42 |     { return len; }                                 \
 43 | };
 44 | 
 45 | declare_printf_tt(char, int, "%d", 2)
 46 | declare_printf_tt(signed char, int, "%d", 2)
 47 | declare_printf_tt(unsigned char, unsigned int, "%u", 2)
 48 | declare_printf_tt(short, short, "%hd", 3)
 49 | declare_printf_tt(unsigned short, unsigned short, "%hu", 3)
 50 | declare_printf_tt(int, int, "%d", 2)
 51 | declare_printf_tt(unsigned int, unsigned int, "%u", 2)
 52 | declare_printf_tt(long, long, "%ld", 3)
 53 | declare_printf_tt(unsigned long, unsigned long, "%lu", 3)
 54 | declare_printf_tt(long long, long long, "%lld", 4)
 55 | declare_printf_tt(unsigned long long, unsigned long long, "%llu", 4)
 56 | declare_printf_tt(float, float, "%g", 2)
 57 | declare_printf_tt(double, double, "%g", 2)
 58 | 
 59 | 
 60 | /// send an array to the stderr stream on the GPU using HIP
 61 | template <typename T>
 62 | __global__
 63 | void print(const T *vals, size_t n_elem)
 64 | {
 65 |     unsigned long i = hamr::thread_id_to_array_index();
 66 | 
 67 |     if (i >= n_elem)
 68 |         return;
 69 | 
 70 |     int cl = printf_tt<T>::get_code_len();
 71 |     char fmt[] = "vals[%lu] = XXXXXXXXX"; // <-- 20
 72 |     printf_tt<T>::copy_code(fmt + 12);
 73 |     fmt[12 + cl] = '\n';
 74 |     fmt[13 + cl] = '\0';
 75 | 
 76 |     printf(fmt, i, printf_tt<T>::get_value(vals[i]));
 77 | }
 78 | 
 79 | /// copy an array on the GPU using HIP
 80 | template <typename T, typename U>
 81 | __global__
 82 | void copy(T *dest, const U *src, size_t n_elem)
 83 | {
 84 |     unsigned long i = hamr::thread_id_to_array_index();
 85 | 
 86 |     if (i >= n_elem)
 87 |         return;
 88 | 
 89 |     dest[i] = static_cast<T>(src[i]);
 90 | }
 91 | 
 92 | /// default construct on the GPU
 93 | template <typename T>
 94 | __global__
 95 | void construct(T *dest, size_t n_elem)
 96 | {
 97 |     unsigned long i = hamr::thread_id_to_array_index();
 98 | 
 99 |     if (i >= n_elem)
100 |         return;
101 | 
102 |     new (&dest[i]) T();
103 | }
104 | 
105 | /// copy construct on the GPU
106 | template <typename T, typename U>
107 | __global__
108 | void construct(T *dest, size_t n_elem, U val)
109 | {
110 |     unsigned long i = hamr::thread_id_to_array_index();
111 | 
112 |     if (i >= n_elem)
113 |         return;
114 | 
115 |     new (&dest[i]) T(val);
116 | }
117 | 
118 | /// copy construct on the GPU
119 | template <typename T, typename U>
120 | __global__
121 | void construct(T *dest, size_t n_elem, const U *vals)
122 | {
123 |     unsigned long i = hamr::thread_id_to_array_index();
124 | 
125 |     if (i >= n_elem)
126 |         return;
127 | 
128 |     new (&dest[i]) T(vals[i]);
129 | }
130 | 
131 | /// destruct on the GPU
132 | template <typename T>
133 | __global__
134 | void destruct(T *dest, size_t n_elem)
135 | {
136 |     unsigned long i = hamr::thread_id_to_array_index();
137 | 
138 |     if (i >= n_elem)
139 |         return;
140 | 
141 |     dest[i].~T();
142 | }
143 | 
144 | /// initialize an array on the GPU
145 | template <typename T, typename U>
146 | __global__
147 | void fill(T *dest, size_t n_elem, U val)
148 | {
149 |     unsigned long i = hamr::thread_id_to_array_index();
150 | 
151 |     if (i >= n_elem)
152 |         return;
153 | 
154 |     dest[i] = val;
155 | }
156 | 
157 | /// initialize an array on the GPU
158 | template <typename T, typename U>
159 | __global__
160 | void fill(T *dest, size_t n_elem, const U *vals)
161 | {
162 |     unsigned long i = hamr::thread_id_to_array_index();
163 | 
164 |     if (i >= n_elem)
165 |         return;
166 | 
167 |     dest[i] = vals[i];
168 | }
169 | 
170 | }
171 | 
172 | }
173 | 
174 | #endif
175 | 


--------------------------------------------------------------------------------
/hamr_cuda_kernels.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_cuda_kernels_h
  2 | #define hamr_cuda_kernels_h
  3 | 
  4 | #include "hamr_config.h"
  5 | #include "hamr_env.h"
  6 | #include "hamr_cuda_launch.h"
  7 | 
  8 | namespace hamr
  9 | {
 10 | 
 11 | namespace cuda_kernels
 12 | {
 13 | 
 14 | /// helpers to get the printf code given a POD type
 15 | template <typename T> struct printf_tt {};
 16 | 
 17 | #define declare_printf_tt(cpp_t, print_t, code, len)\
 18 | /** printf code wrapper for cpp_t */                \
 19 | template <> struct printf_tt<cpp_t>                 \
 20 | {                                                   \
 21 |     /** cast from cpp_t to print_t */               \
 22 |     __device__                                      \
 23 |     static print_t get_value(cpp_t v)               \
 24 |     { return v; }                                   \
 25 |                                                     \
 26 |     /** returns the printf code for cpp_t */        \
 27 |     __device__                                      \
 28 |     static const char *get_code()                   \
 29 |     { return code; }                                \
 30 |                                                     \
 31 |     /** copies the printf code */                   \
 32 |     __device__                                      \
 33 |     static void copy_code(char *dest)               \
 34 |     {                                               \
 35 |         for (int i = 0; i < len; ++i)               \
 36 |             dest[i] = get_code()[i];                \
 37 |     }                                               \
 38 |                                                     \
 39 |     /** returns the length of the printf code */    \
 40 |     __device__                                      \
 41 |     static int get_code_len()                       \
 42 |     { return len; }                                 \
 43 | };
 44 | 
 45 | declare_printf_tt(char, int, "%d", 2)
 46 | declare_printf_tt(signed char, int, "%d", 2)
 47 | declare_printf_tt(unsigned char, unsigned int, "%u", 2)
 48 | declare_printf_tt(short, short, "%hd", 3)
 49 | declare_printf_tt(unsigned short, unsigned short, "%hu", 3)
 50 | declare_printf_tt(int, int, "%d", 2)
 51 | declare_printf_tt(unsigned int, unsigned int, "%u", 2)
 52 | declare_printf_tt(long, long, "%ld", 3)
 53 | declare_printf_tt(unsigned long, unsigned long, "%lu", 3)
 54 | declare_printf_tt(long long, long long, "%lld", 4)
 55 | declare_printf_tt(unsigned long long, unsigned long long, "%llu", 4)
 56 | declare_printf_tt(float, float, "%g", 2)
 57 | declare_printf_tt(double, double, "%g", 2)
 58 | 
 59 | 
 60 | /// send an array to the stderr stream on the GPU using CUDA
 61 | template <typename T>
 62 | __global__
 63 | void print(const T *vals, size_t n_elem)
 64 | {
 65 |     unsigned long i = hamr::thread_id_to_array_index();
 66 | 
 67 |     if (i >= n_elem)
 68 |         return;
 69 | 
 70 |     int cl = printf_tt<T>::get_code_len();
 71 |     char fmt[] = "vals[%lu] = XXXXXXXXX"; // <-- 20
 72 |     printf_tt<T>::copy_code(fmt + 12);
 73 |     fmt[12 + cl] = '\n';
 74 |     fmt[13 + cl] = '\0';
 75 | 
 76 |     printf(fmt, i, printf_tt<T>::get_value(vals[i]));
 77 | }
 78 | 
 79 | /// copy an array on the GPU using CUDA
 80 | template <typename T, typename U>
 81 | __global__
 82 | void copy(T *dest, const U *src, size_t n_elem)
 83 | {
 84 |     unsigned long i = hamr::thread_id_to_array_index();
 85 | 
 86 |     if (i >= n_elem)
 87 |         return;
 88 | 
 89 |     dest[i] = static_cast<T>(src[i]);
 90 | }
 91 | 
 92 | /// default construct on the GPU
 93 | template <typename T>
 94 | __global__
 95 | void construct(T *dest, size_t n_elem)
 96 | {
 97 |     unsigned long i = hamr::thread_id_to_array_index();
 98 | 
 99 |     if (i >= n_elem)
100 |         return;
101 | 
102 |     new (&dest[i]) T();
103 | }
104 | 
105 | /// copy construct on the GPU
106 | template <typename T, typename U>
107 | __global__
108 | void construct(T *dest, size_t n_elem, U val)
109 | {
110 |     unsigned long i = hamr::thread_id_to_array_index();
111 | 
112 |     if (i >= n_elem)
113 |         return;
114 | 
115 |     new (&dest[i]) T(val);
116 | }
117 | 
118 | /// copy construct on the GPU
119 | template <typename T, typename U>
120 | __global__
121 | void construct(T *dest, size_t n_elem, const U *vals)
122 | {
123 |     unsigned long i = hamr::thread_id_to_array_index();
124 | 
125 |     if (i >= n_elem)
126 |         return;
127 | 
128 |     new (&dest[i]) T(vals[i]);
129 | }
130 | 
131 | /// destruct on the GPU
132 | template <typename T>
133 | __global__
134 | void destruct(T *dest, size_t n_elem)
135 | {
136 |     unsigned long i = hamr::thread_id_to_array_index();
137 | 
138 |     if (i >= n_elem)
139 |         return;
140 | 
141 |     dest[i].~T();
142 | }
143 | 
144 | /// initialize an array on the GPU
145 | template <typename T, typename U>
146 | __global__
147 | void fill(T *dest, size_t n_elem, U val)
148 | {
149 |     unsigned long i = hamr::thread_id_to_array_index();
150 | 
151 |     if (i >= n_elem)
152 |         return;
153 | 
154 |     dest[i] = val;
155 | }
156 | 
157 | /// initialize an array on the GPU
158 | template <typename T, typename U>
159 | __global__
160 | void fill(T *dest, size_t n_elem, const U *vals)
161 | {
162 |     unsigned long i = hamr::thread_id_to_array_index();
163 | 
164 |     if (i >= n_elem)
165 |         return;
166 | 
167 |     dest[i] = vals[i];
168 | }
169 | 
170 | }
171 | 
172 | }
173 | 
174 | #endif
175 | 


--------------------------------------------------------------------------------
/test/test_hamr_pipeline_openmp.cpp:
--------------------------------------------------------------------------------
  1 | #include "hamr_config.h"
  2 | #include "hamr_buffer.h"
  3 | #include "hamr_buffer_util.h"
  4 | #include "hamr_buffer_allocator.h"
  5 | 
  6 | #include <iostream>
  7 | 
  8 | using allocator = hamr::buffer_allocator;
  9 | 
 10 | // **************************************************************************
 11 | template <typename T>
 12 | hamr::buffer<T> initialize(size_t n_vals, const T &val)
 13 | {
 14 |     // allocate the memory
 15 |     hamr::buffer<T> ao(allocator::openmp, n_vals);
 16 |     T *pao = ao.data();
 17 | 
 18 |     // initialize using openmp
 19 | 
 20 |     #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao) map(to: val)
 21 |     for (size_t i = 0; i < n_vals; ++i)
 22 |     {
 23 |         pao[i] = val;
 24 |     }
 25 | 
 26 |     // print the results
 27 |     std::cerr << "initialized to an array of " << n_vals << " to " << val << std::endl;
 28 | 
 29 |     if (n_vals < 33)
 30 |     {
 31 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
 32 |         ao.print();
 33 |         std::cerr << std::endl;
 34 |     }
 35 | 
 36 |     return ao;
 37 | }
 38 | 
 39 | 
 40 | // **************************************************************************
 41 | template <typename T, typename U>
 42 | hamr::buffer<T> add(const hamr::buffer<T> &a1, const hamr::buffer<U> &a2)
 43 | {
 44 |     size_t n_vals = a1.size();
 45 | 
 46 |     // get the inputs
 47 |     auto spa1 = a1.get_openmp_accessible();
 48 |     auto pa1 = spa1.get();
 49 | 
 50 |     auto spa2 = a2.get_openmp_accessible();
 51 |     auto pa2 = spa2.get();
 52 | 
 53 |     // allocate the memory
 54 |     hamr::buffer<T> ao(allocator::openmp, n_vals, T(0));
 55 |     T *pao = ao.data();
 56 | 
 57 |     // do the calculation
 58 |     #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pa1, pa2)
 59 |     for (size_t i = 0; i < n_vals; ++i)
 60 |     {
 61 |         pao[i] = pa1[i] + pa2[i];
 62 |     }
 63 | 
 64 |     // print the results
 65 |     std::cerr << "added " << n_vals << " array " << typeid(T).name() << sizeof(T)
 66 |          << " to array  " << typeid(U).name() << sizeof(U) << std::endl;
 67 | 
 68 |     if (n_vals < 33)
 69 |     {
 70 |         std::cerr << "a1 = "; a1.print(); std::cerr << std::endl;
 71 |         std::cerr << "a2 = "; a2.print(); std::cerr << std::endl;
 72 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
 73 |     }
 74 | 
 75 |     return ao;
 76 | }
 77 | 
 78 | 
 79 | // **************************************************************************
 80 | template <typename T, typename U>
 81 | hamr::buffer<T> multiply_scalar(const hamr::buffer<T> &ai, const U &val)
 82 | {
 83 |     size_t n_vals = ai.size();
 84 | 
 85 |     // get the inputs
 86 |     auto spai = ai.get_openmp_accessible();
 87 |     auto pai = spai.get();
 88 | 
 89 |     // allocate the memory
 90 |     hamr::buffer<T> ao(allocator::openmp, n_vals, T(0));
 91 |     T *pao = ao.data();
 92 | 
 93 |     // do the calculation
 94 |     #pragma omp target teams HAMR_OPENMP_LOOP is_device_ptr(pao, pai) map(to: val)
 95 |     for (size_t i = 0; i < n_vals; ++i)
 96 |     {
 97 |         pao[i] = val * pai[i];
 98 |     }
 99 | 
100 |     // print the results
101 |     std::cerr << "multiply_scalar " << val << " " << typeid(U).name() << sizeof(U)
102 |        << " by " << n_vals << " array " << typeid(T).name() << sizeof(T) << std::endl;
103 | 
104 |     if (n_vals < 33)
105 |     {
106 |         std::cerr << "ai = "; ai.print(); std::cerr << std::endl;
107 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
108 |     }
109 | 
110 |     return ao;
111 | }
112 | 
113 | 
114 | // **************************************************************************
115 | template <typename T>
116 | int compare_int(const hamr::buffer<T> &ain, int val)
117 | {
118 |     size_t n_vals = ain.size();
119 |     std::cerr << "comparing array with " << n_vals << " elements to " << val << std::endl;
120 | 
121 |     hamr::buffer<int> ai(ain.get_allocator(), n_vals);
122 |     ain.get(ai);
123 | 
124 |     if (n_vals < 33)
125 |     {
126 |         ai.print();
127 |     }
128 | 
129 |     auto [spai, pai] = hamr::get_host_accessible(ai);
130 | 
131 |     for (size_t i = 0; i < n_vals; ++i)
132 |     {
133 |         if (pai[i] != val)
134 |         {
135 |             std::cerr << "ERROR: pai[" << i << "] = "
136 |                 << pai[i] << " != " << val << std::endl;
137 |             return -1;
138 |         }
139 |     }
140 | 
141 |     std::cerr << "all elements are equal to " << val << std::endl;
142 | 
143 |     return 0;
144 | }
145 | 
146 | 
147 | 
148 | int main(int, char **)
149 | {
150 |     size_t n_vals = 10000;
151 | 
152 |     hamr::buffer<float>  ao0(allocator::openmp, n_vals, 1.0f);   // = 1 (device)
153 |     hamr::buffer<float>  ao1 = multiply_scalar(ao0, 2.0f);       // = 2 (device)
154 |     ao0.free();
155 | 
156 |     hamr::buffer<double> ao2 = initialize(n_vals, 2.0);          // = 2 (device)
157 |     hamr::buffer<double> ao3 = add(ao2, ao1);                    // = 4 (device)
158 |     ao1.free();
159 |     ao2.free();
160 | 
161 |     hamr::buffer<double> ao4 = multiply_scalar(ao3, 1000.0);     // = 4000 (device)
162 |     ao3.free();
163 | 
164 |     hamr::buffer<float>  ao5(allocator::malloc, n_vals, 3.0f);   // = 1 (host)
165 |     hamr::buffer<float>  ao6 = multiply_scalar(ao5, 100.0f);     // = 300 (device)
166 |     ao5.free();
167 | 
168 |     hamr::buffer<float> ao7(allocator::malloc, n_vals);          // = uninit (host)
169 |     ao7.set(ao6);                                                // = 300 (host)
170 |     ao6.free();
171 | 
172 |     hamr::buffer<double> ao8 = add(ao4, ao7);                    // = 4300 (device)
173 |     ao4.free();
174 |     ao7.free();
175 | 
176 |     return compare_int(ao8, 4300);
177 | }
178 | 


--------------------------------------------------------------------------------
/hamr_cuda_malloc_uva_allocator.h:
--------------------------------------------------------------------------------
  1 | #ifndef hamr_cuda_malloc_uva_allocator_h
  2 | #define hamr_cuda_malloc_uva_allocator_h
  3 | 
  4 | #include "hamr_config.h"
  5 | #include <type_traits>
  6 | #include <memory>
  7 | 
  8 | namespace hamr
  9 | {
 10 | 
 11 | /// a deleter for arrays allocated with cuda_malloc_uva
 12 | template <typename T, typename E = void>
 13 | class cuda_malloc_uva_deleter {};
 14 | 
 15 | /// a deleter for arrays allocated with cuda_malloc_uva, specialized for objects
 16 | template <typename T>
 17 | class cuda_malloc_uva_deleter<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 18 | {
 19 | public:
 20 |     /** constructs the deleter
 21 |      * @param[in] ptr the pointer to the array to delete
 22 |      * @param[in] n the number of elements in the array
 23 |      */
 24 |     cuda_malloc_uva_deleter(cudaStream_t str, T *ptr, size_t n);
 25 | 
 26 |     /** deletes the array
 27 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 28 |      *                that passed during construction.
 29 |      */
 30 |     void operator()(T *ptr);
 31 | 
 32 | private:
 33 |     T *m_ptr;
 34 |     size_t m_elem;
 35 |     cudaStream_t m_str;
 36 | };
 37 | 
 38 | 
 39 | 
 40 | 
 41 | 
 42 | 
 43 | /// a deleter for arrays allocated with cuda_malloc_uva, specialized for numbers
 44 | template <typename T>
 45 | class cuda_malloc_uva_deleter<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
 46 | {
 47 | public:
 48 |     /** constructs the deleter
 49 |      * @param[in] ptr the pointer to the array to delete
 50 |      * @param[in] n the number of elements in the array
 51 |      */
 52 |     cuda_malloc_uva_deleter(cudaStream_t str, T *ptr, size_t n);
 53 | 
 54 |     /** deletes the array
 55 |      * @param[in] ptr the pointer to the array to delete. must be the same as
 56 |      *                that passed during construction.
 57 |      */
 58 |     void operator()(T *ptr);
 59 | 
 60 | private:
 61 |     T *m_ptr;
 62 |     size_t m_elem;
 63 | };
 64 | 
 65 | 
 66 | 
 67 | 
 68 | 
 69 | 
 70 | /// a class for allocating arrays with cuda_malloc_uva
 71 | template <typename T, typename E = void>
 72 | struct cuda_malloc_uva_allocator {};
 73 | 
 74 | /// a class for allocating arrays with cuda_malloc_uva, specialized for objects
 75 | template <typename T>
 76 | struct HAMR_EXPORT cuda_malloc_uva_allocator<T, typename std::enable_if<!std::is_arithmetic<T>::value>::type>
 77 | {
 78 |     /** allocate an array of n elements.
 79 |      * @param[in] str a stream used to order operations, or null for the
 80 |      *                default stream
 81 |      * @param[in] n the number of elements to allocate
 82 |      * @returns a shared point to the array that holds a deleter for the memory
 83 |      */
 84 |     static std::shared_ptr<T> allocate(cudaStream_t str, size_t n);
 85 | 
 86 |     /** allocate an array of n elements.
 87 |      * @param[in] str a stream used to order operations, or null for the
 88 |      *                default stream
 89 |      * @param[in] n the number of elements to allocate
 90 |      * @param[in] val a value to initialize the elements to
 91 |      * @returns a shared point to the array that holds a deleter for the memory
 92 |      */
 93 |     static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val);
 94 | 
 95 |     /** allocate an array of n elements.
 96 |      * @param[in] str a stream used to order operations, or null for the
 97 |      *                default stream
 98 |      * @param[in] n the number of elements to allocate
 99 |      * @param[in] vals an array of values to initialize the elements with
100 |      * @param[in] cudaVals a flag that is set to true if vals is accessible from codes running in CUDA
101 |      * @returns a shared point to the array that holds a deleter for the memory
102 |      */
103 |     template <typename U>
104 |     static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false);
105 | };
106 | 
107 | 
108 | 
109 | 
110 | 
111 | /// a class for allocating arrays with cuda_malloc_uva, specialized for numbers
112 | template <typename T>
113 | struct HAMR_EXPORT cuda_malloc_uva_allocator<T, typename std::enable_if<std::is_arithmetic<T>::value>::type>
114 | {
115 |     /** allocate an array of n elements.
116 |      * @param[in] str a stream used to order operations, or null for the
117 |      *                default stream
118 |      * @param[in] n the number of elements to allocate
119 |      * @returns a shared point to the array that holds a deleter for the memory
120 |      */
121 |     static std::shared_ptr<T> allocate(cudaStream_t str, size_t n);
122 | 
123 |     /** allocate an array of n elements.
124 |      * @param[in] str a stream used to order operations, or null for the
125 |      *                default stream
126 |      * @param[in] n the number of elements to allocate
127 |      * @param[in] val a value to initialize the elements to
128 |      * @returns a shared point to the array that holds a deleter for the memory
129 |      */
130 |     static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const T &val);
131 | 
132 |     /** allocate an array of n elements.
133 |      * @param[in] str a stream used to order operations, or null for the
134 |      *                default stream
135 |      * @param[in] n the number of elements to allocate
136 |      * @param[in] vals an array of values to initialize the elements with
137 |      * @param[in] cudaVals a flag set to true if vals is accessible from codes running in CUDA
138 |      * @returns a shared point to the array that holds a deleter for the memory
139 |      */
140 |     template <typename U>
141 |     static std::shared_ptr<T> allocate(cudaStream_t str, size_t n, const U *vals, bool cudaVals = false);
142 | };
143 | 
144 | }
145 | 
146 | #if !defined(HAMR_SEPARATE_IMPL)
147 | #include "hamr_cuda_malloc_uva_allocator_impl.h"
148 | #endif
149 | 
150 | #endif
151 | 


--------------------------------------------------------------------------------
/test/test_hamr_pipeline_host.cpp:
--------------------------------------------------------------------------------
  1 | #include "hamr_buffer.h"
  2 | #include "hamr_buffer_util.h"
  3 | 
  4 | #include <iostream>
  5 | 
  6 | using hamr::buffer;
  7 | using allocator = hamr::buffer_allocator;
  8 | 
  9 | // **************************************************************************
 10 | template<typename T>
 11 | void initialize_host(T *data, double val, size_t n_vals)
 12 | {
 13 |     for (size_t i = 0; i < n_vals; ++i)
 14 |     {
 15 |         data[i] = val;
 16 |     }
 17 | }
 18 | 
 19 | // **************************************************************************
 20 | template <typename T>
 21 | buffer<T> initialize_host(size_t n_vals, const T &val)
 22 | {
 23 |     // allocate the memory
 24 |     buffer<T> ao(allocator::malloc, n_vals);
 25 |     T *pao = ao.data();
 26 | 
 27 |     // initialize the data
 28 |     initialize_host(pao, val, n_vals);
 29 | 
 30 |     std::cerr << "initialized to an array of " << n_vals << " to " << val << std::endl;
 31 |     if (n_vals < 33)
 32 |     {
 33 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
 34 |         ao.print();
 35 |         std::cerr << std::endl;
 36 |     }
 37 | 
 38 |     return ao;
 39 | }
 40 | 
 41 | 
 42 | 
 43 | 
 44 | 
 45 | 
 46 | // **************************************************************************
 47 | template<typename T, typename U>
 48 | void add_host(T *result, const T *array_1, const U *array_2, size_t n_vals)
 49 | {
 50 |     for (size_t i = 0; i < n_vals; ++i)
 51 |     {
 52 |         result[i] = array_1[i] + array_2[i];
 53 |     }
 54 | }
 55 | 
 56 | // **************************************************************************
 57 | template <typename T, typename U>
 58 | buffer<T> add_host(const buffer<T> &a1, const buffer<U> &a2)
 59 | {
 60 |     // get the inputs
 61 |     auto [spa1, pa1] = hamr::get_host_accessible(a1);
 62 |     auto [spa2, pa2] = hamr::get_host_accessible(a2);
 63 | 
 64 |     // allocate the memory
 65 |     size_t n_vals = a1.size();
 66 |     buffer<T> ao(allocator::malloc, n_vals, T(0));
 67 |     T *pao = ao.data();
 68 | 
 69 |     // initialize the data
 70 |     add_host(pao, pa1, pa2, n_vals);
 71 | 
 72 |     std::cerr << "added " << n_vals << " array " << typeid(T).name()
 73 |         << sizeof(T) << " to array  " << typeid(U).name() << sizeof(U) << std::endl;
 74 |     if (n_vals < 33)
 75 |     {
 76 |         std::cerr << "a1 = "; a1.print(); std::cerr << std::endl;
 77 |         std::cerr << "a2 = "; a2.print(); std::cerr << std::endl;
 78 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
 79 |     }
 80 | 
 81 |     return ao;
 82 | }
 83 | 
 84 | 
 85 | 
 86 | 
 87 | 
 88 | // **************************************************************************
 89 | template<typename T, typename U>
 90 | void multiply_scalar_host(T *result, const T *array_in, U scalar, size_t n_vals)
 91 | {
 92 |     for (size_t i = 0; i < n_vals; ++i)
 93 |     {
 94 |         result[i] = array_in[i] * scalar;
 95 |     }
 96 | }
 97 | 
 98 | // **************************************************************************
 99 | template <typename T, typename U>
100 | buffer<T> multiply_scalar_host(const buffer<T> &ai, const U &val)
101 | {
102 |     // get the inputs
103 |     auto [spai, pai] = hamr::get_host_accessible(ai);
104 | 
105 |     // allocate the memory
106 |     size_t n_vals = ai.size();
107 |     buffer<T> ao(allocator::malloc, n_vals, T(0));
108 |     T *pao = ao.data();
109 | 
110 |     // initialize the data
111 |     multiply_scalar_host(pao, pai, val, n_vals);
112 | 
113 |     std::cerr << "multiply_scalar " << val << " " << typeid(U).name() << sizeof(U)
114 |        << " by " << n_vals << " array " << typeid(T).name() << sizeof(T) << std::endl;
115 | 
116 |     if (n_vals < 33)
117 |     {
118 |         std::cerr << "ain = "; ai.print(); std::cerr << std::endl;
119 |         std::cerr << "ao = "; ao.print(); std::cerr << std::endl;
120 |     }
121 | 
122 |     return ao;
123 | }
124 | 
125 | // **************************************************************************
126 | template <typename T>
127 | int compare_int(const buffer<T> &ain, int val)
128 | {
129 |     size_t n_vals = ain.size();
130 |     std::cerr << "comparing array with " << n_vals
131 |         << " elements to " << val << std::endl;
132 | 
133 |     buffer<int> ai(ain.get_allocator(), n_vals);
134 |     ain.get(ai);
135 | 
136 |     if (n_vals < 33)
137 |     {
138 |         ai.print();
139 |     }
140 | 
141 |     auto [spai, pai] = hamr::get_host_accessible(ai);
142 | 
143 |     for (size_t i = 0; i < n_vals; ++i)
144 |     {
145 |         if (pai[i] != val)
146 |         {
147 |             std::cerr << "ERROR: pai[" << i << "] = " << pai[i]
148 |                 << " != " << val << std::endl;
149 |             return -1;
150 |         }
151 |     }
152 | 
153 |     std::cerr << "all elements are equal to " << val << std::endl;
154 | 
155 |     return 0;
156 | }
157 | 
158 | 
159 | 
160 | int main(int, char **)
161 | {
162 |     size_t n_vals = 100000;
163 | 
164 |     buffer<float>  ao0(allocator::malloc, n_vals, 1.0f);    // = 1 (host)
165 |     buffer<float>  ao1 = multiply_scalar_host(ao0, 2.0f);    // = 2 (host)
166 |     ao0.free();
167 | 
168 |     buffer<double> ao2 = initialize_host(n_vals, 2.0);       // = 2 (host)
169 |     buffer<double> ao3 = add_host(ao2, ao1);                 // = 4 (host)
170 |     ao1.free();
171 |     ao2.free();
172 | 
173 |     buffer<double> ao4 = multiply_scalar_host(ao3, 1000.0);  // = 4000 (host)
174 |     ao3.free();
175 | 
176 |     buffer<float>  ao5(allocator::malloc, n_vals, 3.0f);    // = 1 (host)
177 |     buffer<float>  ao6 = multiply_scalar_host(ao5, 100.0f);  // = 300 (host)
178 |     ao5.free();
179 | 
180 |     buffer<float> ao7(allocator::malloc, n_vals);           // = uninit (host)
181 |     ao7.set(ao6);                                           // = 300 (host)
182 | 
183 |     buffer<double> ao8 = add_host(ao4, ao7);                 // = 4300 (host)
184 |     ao4.free();
185 |     ao7.free();
186 | 
187 |     int res = compare_int(ao8, 4300);
188 |     ao8.free();
189 | 
190 |     return res;
191 | }
192 | 


--------------------------------------------------------------------------------
/hamr_hip_launch.cxx:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <hip/hip_runtime.h>
  3 | #include "hamr_hip_launch.h"
  4 | 
  5 | #include "hamr_env.h"
  6 | 
  7 | #include <iostream>
  8 | 
  9 | namespace hamr
 10 | {
 11 | // **************************************************************************
 12 | int synchronize()
 13 | {
 14 |     hipError_t ierr = hipSuccess;
 15 |     if ((ierr = hipDeviceSynchronize()) != hipSuccess)
 16 |     {
 17 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
 18 |             " Failed to synchronize HIP execution. "
 19 |             << hipGetErrorString(ierr) << std::endl;
 20 |         return -1;
 21 |     }
 22 |     return 0;
 23 | }
 24 | 
 25 | // **************************************************************************
 26 | int get_launch_props(int device_id,
 27 |     int *block_grid_max, int &warp_size,
 28 |     int &warps_per_block_max)
 29 | {
 30 |     hipError_t ierr = hipSuccess;
 31 | 
 32 |     if (device_id < 0)
 33 |     {
 34 |         if ((ierr = hipGetDevice(&device_id)) != hipSuccess)
 35 |         {
 36 |             std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
 37 |                 " Failed to get the active device id. "
 38 |                 << hipGetErrorString(ierr) << std::endl;
 39 |             return -1;
 40 |         }
 41 |     }
 42 | 
 43 |     if (((ierr = hipDeviceGetAttribute(&block_grid_max[0], hipDeviceAttributeMaxGridDimX, device_id)) != hipSuccess)
 44 |         || ((ierr = hipDeviceGetAttribute(&block_grid_max[1], hipDeviceAttributeMaxGridDimY, device_id)) != hipSuccess)
 45 |         || ((ierr = hipDeviceGetAttribute(&block_grid_max[2], hipDeviceAttributeMaxGridDimZ, device_id)) != hipSuccess))
 46 |     {
 47 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
 48 |             " Failed to get HIP max grid dim. " << hipGetErrorString(ierr) << std::endl;
 49 |         return -1;
 50 |     }
 51 | 
 52 |     if ((ierr = hipDeviceGetAttribute(&warp_size, hipDeviceAttributeWarpSize, device_id)) != hipSuccess)
 53 |     {
 54 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
 55 |             " Failed to get HIP warp size. " << hipGetErrorString(ierr) << std::endl;
 56 |         return -1;
 57 |     }
 58 | 
 59 |     int threads_per_block_max = 0;
 60 | 
 61 |     if ((ierr = hipDeviceGetAttribute(&threads_per_block_max,
 62 |         hipDeviceAttributeMaxThreadsPerBlock, device_id)) != hipSuccess)
 63 |     {
 64 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
 65 |             " Failed to get HIP max threads per block. " << hipGetErrorString(ierr) << std::endl;
 66 |         return -1;
 67 |     }
 68 | 
 69 |     warps_per_block_max = threads_per_block_max / warp_size;
 70 | 
 71 |     return 0;
 72 | }
 73 | 
 74 | // **************************************************************************
 75 | int partition_thread_blocks(size_t array_size,
 76 |     int warps_per_block, int warp_size, int *block_grid_max,
 77 |     dim3 &block_grid, int &n_blocks, dim3 &thread_grid)
 78 | {
 79 |     unsigned long threads_per_block = warps_per_block * warp_size;
 80 | 
 81 |     thread_grid.x = threads_per_block;
 82 |     thread_grid.y = 1;
 83 |     thread_grid.z = 1;
 84 | 
 85 |     unsigned long block_size = threads_per_block;
 86 |     n_blocks = array_size / block_size;
 87 | 
 88 |     if (array_size % block_size)
 89 |         ++n_blocks;
 90 | 
 91 |     if (n_blocks > block_grid_max[0])
 92 |     {
 93 |         // multi-d decomp required
 94 |         block_grid.x = block_grid_max[0];
 95 |         block_grid.y = n_blocks / block_grid_max[0];
 96 |         if (n_blocks % block_grid_max[0])
 97 |         {
 98 |             ++block_grid.y;
 99 |         }
100 | 
101 |         if (block_grid.y > ((unsigned int)block_grid_max[1]))
102 |         {
103 |             // 3d decomp
104 |             unsigned long block_grid_max01 = block_grid_max[0] * block_grid_max[1];
105 |             block_grid.y = block_grid_max[1];
106 |             block_grid.z = n_blocks / block_grid_max01;
107 | 
108 |             if (n_blocks % block_grid_max01)
109 |                 ++block_grid.z;
110 | 
111 |             if (block_grid.z > ((unsigned int)block_grid_max[2]))
112 |             {
113 |                 std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
114 |                     " Too many blocks " << n_blocks << " of size " << block_size
115 |                     << " are required for a grid of (" << block_grid_max[0] << ", "
116 |                     << block_grid_max[1] << ", " << block_grid_max[2]
117 |                     << ") blocks. Hint: increase the number of warps per block." << std::endl;
118 |                 return -1;
119 |             }
120 |         }
121 |         else
122 |         {
123 |             // 2d decomp
124 |             block_grid.z = 1;
125 |         }
126 |     }
127 |     else
128 |     {
129 |         // 1d decomp
130 |         block_grid.x = n_blocks;
131 |         block_grid.y = 1;
132 |         block_grid.z = 1;
133 |     }
134 | 
135 | #if defined(HAMR_VERBOSE)
136 |     if (hamr::get_verbose())
137 |     {
138 |         std::cerr << "partition_thread_blocks arrays_size = " << array_size
139 |             << " warps_per_block = " << warps_per_block << " warp_size = " << warp_size
140 |             << " block_grid_max = (" << block_grid_max[0] << ", " << block_grid_max[1]
141 |             << ", " << block_grid_max[2] << ") block_grid = (" << block_grid.x << ", "
142 |             << block_grid.y << ", " << block_grid.z << ") n_blocks = " << n_blocks
143 |             << " thread_grid = (" << thread_grid.x << ", " << thread_grid.y << ", "
144 |             << thread_grid.z << ")" << std::endl;
145 |     }
146 | #endif
147 | 
148 |     return 0;
149 | }
150 | 
151 | // **************************************************************************
152 | int partition_thread_blocks(int device_id, size_t array_size,
153 |     int warps_per_block, dim3 &block_grid, int &n_blocks,
154 |     dim3 &thread_grid)
155 | {
156 |     int block_grid_max[3] = {0};
157 |     int warp_size = 0;
158 |     int warps_per_block_max = 0;
159 |     if (get_launch_props(device_id, block_grid_max,
160 |         warp_size, warps_per_block_max))
161 |     {
162 |         std::cerr << "[" << __FILE__ << ":" << __LINE__ << "] ERROR:"
163 |             " Failed to get launch properties" << std::endl;
164 |         return -1;
165 |     }
166 | 
167 |     return partition_thread_blocks(array_size, warps_per_block,
168 |         warp_size, block_grid_max, block_grid, n_blocks,
169 |         thread_grid);
170 | }
171 | 
172 | }
173 | 


--------------------------------------------------------------------------------