├── spack ├── repo.yaml └── packages │ ├── cuda-dolfinx │ └── package.py │ └── py-cuda-dolfinx │ └── package.py ├── python ├── build-requirements.txt ├── cudolfinx │ ├── __init__.py │ ├── mesh.py │ ├── context.py │ ├── wrappers │ │ ├── cudolfinx.cpp │ │ ├── petsc.cpp │ │ └── caster_petsc.h │ ├── la.py │ ├── bcs.py │ ├── jit.py │ └── form.py ├── README.md ├── pyproject.toml ├── CMakeLists.txt ├── test │ ├── test_multigpu_assembly.py │ └── test_cuda_assembly.py └── examples │ ├── poisson_sum_factorization.py │ └── poisson.py ├── cpp ├── cudolfinx │ ├── common │ │ ├── CMakeLists.txt │ │ ├── version.h.in │ │ ├── CUDAStore.h │ │ └── CUDA.h │ ├── mesh │ │ ├── CMakeLists.txt │ │ ├── util.cpp │ │ └── CUDAMesh.h │ ├── cudolfinx.h │ ├── la │ │ ├── CMakeLists.txt │ │ ├── petsc.h │ │ ├── CUDAMatrix.h │ │ ├── CUDASeqMatrix.h │ │ ├── CUDAVector.h │ │ ├── petsc.cpp │ │ └── CUDAMatrix.cpp │ ├── fem │ │ ├── CMakeLists.txt │ │ ├── CUDACoefficient.h │ │ ├── CUDADofMap.h │ │ ├── petsc.h │ │ ├── utils.h │ │ ├── CUDAFormConstants.h │ │ ├── CUDAForm.h │ │ └── CUDADofMap.cpp │ └── CMakeLists.txt ├── cmake │ ├── templates │ │ ├── cudolfinx.conf.in │ │ ├── cudolfinx.pc.in │ │ ├── cmake_uninstall.cmake.in │ │ └── CUDOLFINXConfig.cmake.in │ ├── post-install │ │ └── CMakeLists.txt │ └── modules │ │ └── FindUFCx.cmake └── CMakeLists.txt ├── README.md ├── docker ├── Dockerfile.end-user └── Dockerfile.test-env └── COPYING.LESSER /spack/repo.yaml: -------------------------------------------------------------------------------- 1 | repo: 2 | namespace: 'cudolfinx' 3 | -------------------------------------------------------------------------------- /python/build-requirements.txt: -------------------------------------------------------------------------------- 1 | nanobind>=1.8.0 2 | scikit-build-core[pyproject] 3 | petsc4py 4 | mpi4py 5 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_common 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDA.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAStore.h 4 | PARENT_SCOPE 5 | ) 6 | 7 | target_sources( 8 | cudolfinx 9 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cpp 10 | ) 11 | -------------------------------------------------------------------------------- /cpp/cudolfinx/mesh/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_mesh 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMesh.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMeshEntities.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/util.h 5 | PARENT_SCOPE 6 | ) 7 | 8 | target_sources( 9 | cudolfinx 10 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/util.cpp 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /cpp/cudolfinx/cudolfinx.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | namespace cudolfinx 10 | { 11 | } 12 | 13 | // TODO: actually develop a C++ interface, currently the target is Python 14 | -------------------------------------------------------------------------------- /python/cudolfinx/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | """Main module for CUDOLFINx""" 8 | 9 | from cudolfinx.assemble import CUDAAssembler 10 | from cudolfinx.form import form 11 | from cudolfinx.mesh import ghost_layer_mesh, ghost_layer_meshtags 12 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/version.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define CUDOLFINX_VERSION_RELEASE @CUDOLFINX_VERSION_RELEASE@ 4 | #define CUDOLFINX_VERSION_MAJOR @CUDOLFINX_VERSION_MAJOR@ 5 | #define CUDOLFINX_VERSION_MINOR @CUDOLFINX_VERSION_MINOR@ 6 | #define CUDOLFINX_VERSION_MICRO @CUDOLFINX_VERSION_MICRO_STRIPPED@ 7 | #define CUDOLFINX_VERSION_STRING "@CUDOLFINX_VERSION@" 8 | #define CUDOLFINX_VERSION_GIT "@GIT_COMMIT_HASH@" 9 | #define UFCX_SIGNATURE "@UFCX_SIGNATURE@" 10 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_la 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMatrix.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDASeqMatrix.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAVector.h 5 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.h 6 | PARENT_SCOPE 7 | ) 8 | 9 | target_sources( 10 | cudolfinx 11 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMatrix.cpp 12 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDASeqMatrix.cpp 13 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAVector.cpp 14 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.cpp 15 | ) 16 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace dolfinx::la 16 | { 17 | 18 | namespace petsc 19 | { 20 | 21 | Mat create_cuda_matrix(MPI_Comm comm, const SparsityPattern& sp); 22 | 23 | } // namespace petsc 24 | } // namespace dolfinx::la 25 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cudolfinx.conf.in: -------------------------------------------------------------------------------- 1 | # Helper file for setting non-default CUDOLFINx environment variables 2 | 3 | # Common Unix variables 4 | export @OS_LIBRARY_PATH_NAME@=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@:$@OS_LIBRARY_PATH_NAME@ 5 | export PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@:$PATH 6 | export PKG_CONFIG_PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/pkgconfig:$PKG_CONFIG_PATH 7 | export CMAKE_PREFIX_PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/cmake:$CMAKE_PREFIX_PATH 8 | 9 | # Special macOS variables 10 | export DYLD_FRAMEWORK_PATH=/opt/local/Library/Frameworks:$DYLD_FRAMEWORK_PATH 11 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cudolfinx.pc.in: -------------------------------------------------------------------------------- 1 | # pkg-config configuration for CUDOLFINx 2 | prefix=@CMAKE_INSTALL_PREFIX@ 3 | exec_prefix=@CMAKE_INSTALL_PREFIX@ 4 | libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ 5 | includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ 6 | compiler=@CMAKE_CXX_COMPILER@ 7 | definitions=@PKG_DEFINITIONS@ 8 | extlibs=@CUDOLFINX_EXT_LIBS@ 9 | 10 | Name: CUDOLFINx 11 | Description: CUDA extension for DOLFINX 12 | Version: @CUDOLFINX_VERSION@ 13 | Requires: @PKG_REQUIRES@ 14 | Conflicts: 15 | Libs: @PKG_LINKFLAGS@ -L${libdir} -lcudolfinx 16 | Cflags: @PKG_CXXFLAGS@ -DCUDOLFINX_VERSION=\"@CUDOLFINX_VERSION@\" ${definitions} -I${includedir} @PKG_INCLUDES@ 17 | -------------------------------------------------------------------------------- /cpp/cmake/post-install/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | install( 2 | CODE "MESSAGE( 3 | \"---------------------------------------------------------------------------- 4 | CUDOLFINx has now been installed in 5 | 6 | ${CMAKE_INSTALL_PREFIX} 7 | 8 | 9 | Don't forget to update your environment variables. This can be done 10 | easily using the helper file 'cudolfinx.conf' which sets the appropriate 11 | variables (for users of the Bash shell). 12 | 13 | To update your environment variables, run the following command: 14 | 15 | source ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/cudolfinx/cudolfinx.conf 16 | 17 | ----------------------------------------------------------------------------\")" 18 | ) 19 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # CUDOLFINx Python interface installation 2 | 3 | Below is guidance for building the CUDOLFINx Python interface. 4 | 5 | 1. Build and install the CUDOLFINx C++ library. 6 | 7 | 2. Ensure the Python interface build requirements are installed: 8 | 9 | pip install -r build-requirements.txt 10 | 11 | 3. Build DOLFINx Python interface: 12 | 13 | pip install --check-build-dependencies --no-build-isolation . 
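If the build cannot locate the CUDOLFINx C++ library, make sure its install prefix is on `CMAKE_PREFIX_PATH` before invoking pip. The installed helper file `cudolfinx.conf` sets this and the other relevant environment variables (the path below assumes the default `/usr/local` install prefix and `lib` libdir; adjust for your install location):

    source /usr/local/lib/cudolfinx/cudolfinx.conf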
14 | 15 | To build in debug and editable mode for development: 16 | 17 | pip -v install --check-build-dependencies --config-settings=build-dir="build" --config-settings=cmake.build-type="Debug" --config-settings=install.strip=false --no-build-isolation -e . 18 | -------------------------------------------------------------------------------- /python/cudolfinx/mesh.py: -------------------------------------------------------------------------------- 1 | from cudolfinx import cpp as _cucpp 2 | from dolfinx import mesh 3 | 4 | def ghost_layer_mesh(domain: mesh.Mesh): 5 | """Add a ghost layer of cells to the given mesh 6 | """ 7 | _ghost_mesh = _cucpp.fem.ghost_layer_mesh(domain._cpp_object, domain._geometry._cpp_object.cmap) 8 | return mesh.Mesh( 9 | _ghost_mesh, 10 | domain._ufl_domain) 11 | 12 | def ghost_layer_meshtags(meshtags: mesh.MeshTags, ghosted_mesh: mesh.Mesh): 13 | """Trasnfer meshtags to ghost layer mesh.""" 14 | 15 | _cpp_meshtags = _cucpp.fem.ghost_layer_meshtags(meshtags._cpp_object, ghosted_mesh.topology._cpp_object) 16 | return mesh.MeshTags(_cpp_meshtags) 17 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_fem 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAAssembler.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDACoefficient.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADirichletBC.h 5 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADofMap.h 6 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAForm.h 7 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormConstants.h 8 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormCoefficients.h 9 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormIntegral.h 10 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.h 11 | ${CMAKE_CURRENT_SOURCE_DIR}/utils.h 12 | PARENT_SCOPE 13 | ) 14 | 15 | target_sources( 16 | cudolfinx 17 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDAAssembler.cpp 18 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADofMap.cpp 19 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormIntegral.cpp 20 | ) 21 | -------------------------------------------------------------------------------- /python/cudolfinx/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from petsc4py import PETSc 8 | from cudolfinx import cpp as _cucpp 9 | 10 | _device = None 11 | 12 | def _init_device(): 13 | """Initialize PETSc device 14 | """ 15 | global _device 16 | d = PETSc.Device() 17 | d.create(PETSc.Device.Type.CUDA) 18 | _device = d 19 | 20 | def get_device(): 21 | """Return PETSc device 22 | """ 23 | 24 | global _device 25 | if _device is None: 26 | _init_device() 27 | return _device 28 | 29 | def get_cuda_context(): 30 | """Return the CUDA context, intializing it if needed 31 | """ 32 | global _device 33 | if _device is None: 34 | _init_device() 35 | return _cucpp.fem.CUDAContext() 36 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/cudolfinx.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
 2 | //
 3 | // This file is part of cuDOLFINX
 4 | //
 5 | // SPDX-License-Identifier: LGPL-3.0-or-later
 6 | 
 7 | #include <nanobind/nanobind.h>
 8 | 
 9 | namespace nb = nanobind;
10 | 
11 | namespace cudolfinx_wrappers
12 | {
13 | void fem(nb::module_& m);
14 | void petsc(nb::module_& m_fem);
15 | } // namespace cudolfinx_wrappers
16 | 
17 | NB_MODULE(cpp, m)
18 | {
19 |   // Create module for C++ wrappers
20 |   m.doc() = "DOLFINx CUDA Python interface";
21 |   m.attr("__version__") = CUDOLFINX_VERSION;
22 | 
23 | #ifdef NDEBUG
24 |   nanobind::set_leak_warnings(false);
25 | #endif
26 |   // Create fem submodule [fem]
27 |   nb::module_ fem = m.def_submodule("fem", "FEM module");
28 |   cudolfinx_wrappers::fem(fem);
29 |   cudolfinx_wrappers::petsc(fem);
30 | }
--------------------------------------------------------------------------------
/cpp/cmake/templates/cmake_uninstall.cmake.in:
--------------------------------------------------------------------------------
 1 | if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
 2 |   message(
 3 |     FATAL_ERROR
 4 |     "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\""
 5 |   )
 6 | endif()
 7 | 
 8 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
 9 | string(REGEX REPLACE "\n" ";" files "${files}")
10 | foreach(file ${files})
11 |   message(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"")
12 |   if(EXISTS "$ENV{DESTDIR}${file}")
13 |     exec_program(
14 |       "@CMAKE_COMMAND@" ARGS
15 |       "-E remove \"$ENV{DESTDIR}${file}\""
16 |       OUTPUT_VARIABLE rm_out
17 |       RETURN_VALUE rm_retval
18 |     )
19 |     if(NOT "${rm_retval}" STREQUAL 0)
20 |       message(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"")
21 |     endif()
22 |   else()
23 |     message(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.")
24 |   endif()
25 | endforeach()
--------------------------------------------------------------------------------
/cpp/cudolfinx/common/CUDAStore.h:
--------------------------------------------------------------------------------
 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter
 2 | //
 3 | // This file is part of cuDOLFINX
 4 | //
 5 | // SPDX-License-Identifier: LGPL-3.0-or-later
 6 | 
 7 | #pragma once
 8 | #include <map>
 9 | #include <memory>
10 | 
11 | namespace dolfinx::common
12 | {
13 | /// @brief This class represents an abstract mapping between host-side
14 | /// and device-side objects. Its purpose is to prevent creation of duplicate
15 | /// copies of host-side objects on the device.
16 | 
17 | template <class H, class D>
18 | class CUDAStore
19 | {
20 | public:
21 | 
22 |   /// @brief Empty constructor
23 |   CUDAStore()
24 |   {
25 |   }
26 | 
27 |   /// @brief Return the stored device object, creating and caching it if absent
28 |   /// @param[in] host_object Pointer to the host-side object
29 |   std::shared_ptr<D> get_device_object(const H* host_object) {
30 |     auto it = _map.find(host_object);
31 |     if (it != _map.end()) return it->second;
32 |     auto device_object = std::make_shared<D>(host_object);
33 |     _map[host_object] = device_object;
34 |     return device_object;
35 |   }
36 | 
37 | private:
38 |   std::map<const H*, std::shared_ptr<D>> _map;
39 | };
40 | }
41 | 
42 | 
--------------------------------------------------------------------------------
/spack/packages/cuda-dolfinx/package.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2013-2024 Lawrence Livermore National Security, LLC and other
 2 | # Spack Project Developers. See the top-level COPYRIGHT file for details.
3 | # 4 | # SPDX-License-Identifier: (Apache-2.0 OR MIT) 5 | 6 | from spack.package import * 7 | 8 | 9 | class CudaDolfinx(CMakePackage): 10 | """CUDA accelerated extension of DOLFINx from the FEniCS project.""" 11 | 12 | homepage = "https://github.com/bpachev/cuda-dolfinx" 13 | git = "https://github.com/bpachev/cuda-dolfinx.git" 14 | url = "https://github.com/bpachev/cuda-dolfinx/archive/refs/tags/v0.9.0.zip" 15 | 16 | maintainers("bpachev") 17 | license("LGPL-3.0-or-later", checked_by="bpachev") 18 | 19 | version("main", branch="main") 20 | version("0.9.0", sha256="5c93155e58eee139985e9e9341cf7d8b29f8c9cbc51ccdf05134cdfb70ae105d") 21 | 22 | depends_on("cxx", type="build") 23 | 24 | depends_on("fenics-dolfinx@0.9+petsc+adios2", when="@0.9:") 25 | depends_on("py-fenics-dolfinx@0.9", when="@0.9:") 26 | depends_on("petsc+shared+mpi+cuda") 27 | 28 | root_cmakelists_dir = "cpp" 29 | 30 | def cmake_args(self): 31 | return [self.define("CUDOLFINX_SKIP_BUILD_TESTS", True)] 32 | -------------------------------------------------------------------------------- /spack/packages/py-cuda-dolfinx/package.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2024 Lawrence Livermore National Security, LLC and other 2 | # Spack Project Developers. See the top-level COPYRIGHT file for details. 3 | # 4 | # SPDX-License-Identifier: (Apache-2.0 OR MIT) 5 | 6 | from spack.package import * 7 | 8 | 9 | class PyCudaDolfinx(PythonPackage): 10 | """Python interface for CUDA acceleration of DOLFINx in the FEniCS project.""" 11 | 12 | homepage = "https://github.com/bpachev/cuda-dolfinx" 13 | url = "https://github.com/bpachev/cuda-dolfinx/archive/refs/tags/v0.9.0.zip" 14 | git = "https://github.com/bpachev/cuda-dolfinx.git" 15 | 16 | maintainers("bpachev") 17 | 18 | license("LGPL-3.0-only") 19 | version("main", branch="main") 20 | version("0.9.0", sha256="5c93155e58eee139985e9e9341cf7d8b29f8c9cbc51ccdf05134cdfb70ae105d") 21 | 22 | depends_on("cxx", type="build") 23 | depends_on("cmake@3.21:", when="@0.9:", type="build") 24 | depends_on("cuda-dolfinx@main", when="@main") 25 | depends_on("cuda-dolfinx@0.9.0", when="@0.9.0") 26 | depends_on("pkgconfig", type="build") 27 | depends_on("py-nanobind@2:", when="@0.9:", type="build") 28 | depends_on("py-scikit-build-core+pyproject@0.5:", when="@0.9:", type="build") 29 | 30 | build_directory = "python" 31 | 32 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/petsc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "caster_petsc.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace 23 | { 24 | 25 | void petsc_fem_module(nb::module_& m) 26 | { 27 | m.def("create_cuda_matrix", dolfinx::fem::petsc::create_cuda_matrix, 28 | nb::rv_policy::take_ownership, nb::arg("a"), 29 | "Create a PETSc CUDA Mat for a bilinear form."); 30 | m.def("create_cuda_matrix_block", dolfinx::fem::petsc::create_cuda_matrix_block, 31 | nb::rv_policy::take_ownership, nb::arg("forms"), 32 | "Create a monolithic PETSc CUDA Mat from a list of lists of bilinear forms."); 33 | } 34 | 35 | } // namespace 36 | 37 | namespace cudolfinx_wrappers 38 | { 39 | void petsc(nb::module_& m_fem) 40 | { 41 | nb::module_ petsc_fem_mod 42 | = m_fem.def_submodule("petsc", "PETSc-specific finite element module"); 43 | petsc_fem_module(petsc_fem_mod); 44 | } 45 | } // namespace cudolfinx_wrappers 46 | 47 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDACoefficient.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | 12 | namespace dolfinx::fem 13 | { 14 | /// @brief a wrapper around a Function 15 | template > 17 | class CUDACoefficient 18 | { 19 | public: 20 | 21 | /// @brief Construct a new CUDACoefficient 22 | CUDACoefficient(std::shared_ptr> f) { 23 | _f = f; 24 | _x = f->x(); 25 | _dvalues_size = _x->bs() * (_x->index_map()->size_local()+_x->index_map()->num_ghosts()) * sizeof(T); 26 | CUDA::safeMemAlloc(&_dvalues, _dvalues_size); 27 | copy_host_values_to_device(); 28 | } 29 | 30 | /// Copy to device, allocating GPU memory if required 31 | void copy_host_values_to_device() 32 | { 33 | CUDA::safeMemcpyHtoD(_dvalues, (void*)(_x->array().data()), _dvalues_size); 34 | } 35 | 36 | /// Get pointer to vector data on device 37 | CUdeviceptr device_values() const 38 | { 39 | return _dvalues; 40 | } 41 | 42 | ~CUDACoefficient() 43 | { 44 | if (_dvalues) 45 | cuMemFree(_dvalues); 46 | } 47 | 48 | private: 49 | 50 | // Device-side coefficient array 51 | CUdeviceptr _dvalues; 52 | // Size of coefficient array 53 | size_t _dvalues_size; 54 | // Pointer to host-side Function 55 | std::shared_ptr> _f; 56 | // Pointer to host-side coefficient vector 57 | std::shared_ptr> _x; 58 | }; 59 | 60 | } 61 | -------------------------------------------------------------------------------- /python/cudolfinx/la.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from cudolfinx import cpp as _cucpp 8 | 9 | class CUDAVector: 10 | """Vector on device 11 | """ 12 | 13 | def __init__(self, ctx, vec): 14 | """Initialize the vector 15 | """ 16 | 17 | self._petsc_vec = vec 18 | self._ctx = ctx 19 | self._cpp_object = _cucpp.fem.CUDAVector(ctx, self._petsc_vec) 20 | 21 | @property 22 | def vector(self): 23 | """Return underlying PETSc vector 24 | """ 25 | 26 | return self._petsc_vec 27 | 28 | def to_host(self): 29 | """Copy device-side 
values to host 30 | """ 31 | 32 | self._cpp_object.to_host(self._ctx) 33 | 34 | def __del__(self): 35 | """Delete the vector and free up GPU resources 36 | """ 37 | 38 | # Ensure that the cpp CUDAVector is taken care of BEFORE the petsc vector. . . . 39 | del self._cpp_object 40 | 41 | class CUDAMatrix: 42 | """Matrix on device 43 | """ 44 | 45 | def __init__(self, ctx, petsc_mat): 46 | """Initialize the matrix 47 | """ 48 | 49 | self._petsc_mat = petsc_mat 50 | self._ctx = ctx 51 | self._cpp_object = _cucpp.fem.CUDAMatrix(ctx, petsc_mat) 52 | 53 | @property 54 | def mat(self): 55 | """Return underlying CUDA matrix 56 | """ 57 | 58 | return self._petsc_mat 59 | 60 | def assemble(self): 61 | """Call assemble on the underlying PETSc matrix. 62 | 63 | If the PETSc matrix is not a CUDA matrix, then matrix 64 | values will be explicitly copied to the host. 65 | """ 66 | 67 | self._cpp_object.to_host(self._ctx) 68 | 69 | def __del__(self): 70 | """Delete the matrix and free up GPU resources 71 | """ 72 | 73 | # make sure we delete the CUDAMatrix before the petsc matrix 74 | del self._cpp_object 75 | 76 | 77 | -------------------------------------------------------------------------------- /cpp/cmake/templates/CUDOLFINXConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # * Build details for CUDOLFINx: CUDA extension for DOLFINX 2 | # 3 | # This file has been automatically generated. 4 | 5 | # FIXME: Check that naming conforms to CMake standards 6 | 7 | @PACKAGE_INIT@ 8 | include(CMakeFindDependencyMacro) 9 | 10 | find_dependency(MPI REQUIRED) 11 | find_dependency(pugixml) 12 | 13 | # Check for Boost 14 | if(DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) 15 | set(Boost_NO_SYSTEM_PATHS on) 16 | endif() 17 | set(Boost_USE_MULTITHREADED $ENV{BOOST_USE_MULTITHREADED}) 18 | set(Boost_VERBOSE TRUE) 19 | find_dependency(Boost 1.70 REQUIRED COMPONENTS timer filesystem) 20 | 21 | if(@ufcx_FOUND@) 22 | find_dependency(ufcx) 23 | endif() 24 | 25 | # Basix 26 | find_package(Python3 COMPONENTS Interpreter) 27 | if(Python3_Interpreter_FOUND) 28 | execute_process( 29 | COMMAND 30 | ${Python3_EXECUTABLE} -c 31 | "import basix, os, sys; sys.stdout.write(os.path.dirname(basix.__file__))" 32 | OUTPUT_VARIABLE BASIX_PY_DIR 33 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 34 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 35 | ) 36 | endif() 37 | if(BASIX_PY_DIR) 38 | message(STATUS "Adding ${BASIX_PY_DIR} to Basix search hints") 39 | endif() 40 | find_dependency(Basix CONFIG HINTS ${BASIX_PY_DIR}) 41 | 42 | # HDF5 43 | if(NOT TARGET hdf5::hdf5) 44 | set(HDF5_PREFER_PARALLEL TRUE) 45 | set(HDF5_FIND_DEBUG TRUE) 46 | find_dependency(HDF5 COMPONENTS C) 47 | if(HDF5_FOUND AND NOT HDF5_IS_PARALLEL) 48 | message(FATAL_ERROR "Found serial HDF5 build, MPI HDF5 build required") 49 | endif() 50 | endif() 51 | 52 | if(@PETSC_FOUND@) 53 | if(NOT TARGET PkgConfig::PETSC) 54 | find_package(PkgConfig REQUIRED) 55 | set(ENV{PKG_CONFIG_PATH} 56 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 57 | ) 58 | pkg_search_module(PETSC REQUIRED IMPORTED_TARGET PETSc petsc) 59 | endif() 60 | endif() 61 | 62 | if(@SLEPC_FOUND@) 63 | if(NOT TARGET PkgConfig::SLEPC) 64 | find_package(PkgConfig REQUIRED) 65 | set(ENV{PKG_CONFIG_PATH} 66 | "$ENV{SLEPC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{SLEPC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 67 | ) 68 | set(ENV{PKG_CONFIG_PATH} 69 | 
"$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 70 | ) 71 | set(ENV{PKG_CONFIG_PATH} 72 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}:$ENV{PETSC_DIR}:$ENV{PKG_CONFIG_PATH}" 73 | ) 74 | pkg_search_module(SLEPC REQUIRED IMPORTED_TARGET SLEPc slepc) 75 | endif() 76 | endif() 77 | 78 | if(NOT TARGET cudolfinx) 79 | include("${CMAKE_CURRENT_LIST_DIR}/CUDOLFINXTargets.cmake") 80 | endif() 81 | 82 | check_required_components(CUDOLFINX) 83 | -------------------------------------------------------------------------------- /python/cudolfinx/bcs.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from cudolfinx import cpp as _cucpp 8 | from dolfinx import cpp as _cpp 9 | from dolfinx.fem.bcs import DirichletBC 10 | import typing 11 | 12 | class CUDADirichletBC: 13 | """Represents a collection of boundary conditions 14 | """ 15 | 16 | def __init__(self, ctx, bcs: typing.List[DirichletBC]): 17 | """Initialize a collection of boundary conditions 18 | """ 19 | 20 | self.bcs = bcs 21 | self._function_spaces = [] 22 | self._bc_lists = [] 23 | self._device_bcs = [] 24 | self._ctx = ctx 25 | 26 | for bc in bcs: 27 | V = bc.function_space 28 | try: 29 | i = self._function_spaces.index(V) 30 | except ValueError: 31 | self._function_spaces.append(V) 32 | self._bc_lists.append([]) 33 | i = -1 34 | self._bc_lists[i].append(bc._cpp_object) 35 | 36 | for V, cpp_bcs in zip(self._function_spaces, self._bc_lists): 37 | _cpp_bc_obj = self._make_device_bc(V, cpp_bcs) 38 | self._device_bcs.append(_cpp_bc_obj) 39 | 40 | def _make_device_bc(self, 41 | V: typing.Union[_cpp.fem.FunctionSpace_float32, _cpp.fem.FunctionSpace_float64], 42 | cpp_bcs: typing.List[typing.Union[_cpp.fem.DirichletBC_float32, _cpp.fem.DirichletBC_float64]] 43 | ): 44 | """Create device bc object wrapping a list of bcs for the same function space""" 45 | 46 | if type(V) is _cpp.fem.FunctionSpace_float32: 47 | return _cucpp.fem.CUDADirichletBC_float32(self._ctx, V, cpp_bcs) 48 | elif type(V) is _cpp.fem.FunctionSpace_float64: 49 | return _cucpp.fem.CUDADirichletBC_float64(self._ctx, V, cpp_bcs) 50 | else: 51 | raise TypeError(f"Invalid type for cpp FunctionSpace object '{type(V)}'") 52 | 53 | def _get_cpp_bcs(self, V: typing.Union[_cpp.fem.FunctionSpace_float32, _cpp.fem.FunctionSpace_float64]): 54 | """Get cpp CUDADirichletBC object 55 | """ 56 | 57 | # Use this to avoid needing hashes (which might not be supported) 58 | # Usually there will be a max of two function spaces associated with a set of bcs 59 | try: 60 | i = self._function_spaces.index(V) 61 | return self._device_bcs[i] 62 | except ValueError: 63 | # return empty collection 64 | return self._make_device_bc(V, []) 65 | 66 | def update(self, bcs: typing.Optional[typing.List[DirichletBC]] = None): 67 | """Update a subset of the boundary conditions. 68 | 69 | Used for cases with time-varying boundary conditions whose device-side values 70 | need to be updated. 
By default all boundary conditions are updated 71 | """ 72 | 73 | if bcs is None: 74 | bcs = self.bcs 75 | _bcs_to_update = [bc._cpp_object for bc in bcs] 76 | 77 | for _cpp_bc, V in zip(self._device_bcs, self._function_spaces): 78 | # filter out anything not contained in the right function space 79 | _cpp_bc.update([_bc for _bc in _bcs_to_update if V.contains(_bc.function_space)]) 80 | 81 | -------------------------------------------------------------------------------- /cpp/cmake/modules/FindUFCx.cmake: -------------------------------------------------------------------------------- 1 | #============================================================================= 2 | # - Try to find UFCx by interrogating the Python module FFCx 3 | # Once done this will define 4 | # 5 | # UFCX_FOUND - system has UFCx 6 | # UFCX_INCLUDE_DIRS - include directories for UFCx 7 | # UFCX_SIGNATURE - signature for UFCx 8 | # UFCX_VERSION - version for UFCx 9 | # 10 | #============================================================================= 11 | # Copyright (C) 2010-2021 Johannes Ring and Garth N. Wells 12 | # All rights reserved. 13 | # 14 | # Redistribution and use in source and binary forms, with or without 15 | # modification, are permitted provided that the following conditions 16 | # are met: 17 | # 18 | # 1. Redistributions of source code must retain the above copyright 19 | # notice, this list of conditions and the following disclaimer. 20 | # 2. Redistributions in binary form must reproduce the above copyright 21 | # notice, this list of conditions and the following disclaimer in 22 | # the documentation and/or other materials provided with the 23 | # distribution. 24 | # 25 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 28 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 29 | # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 30 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 31 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 32 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 33 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 35 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 | # POSSIBILITY OF SUCH DAMAGE. 37 | #============================================================================= 38 | 39 | message( 40 | STATUS 41 | "Asking Python module FFCx for location of UFC... 
(Python executable: ${Python3_EXECUTABLE})" 42 | ) 43 | 44 | # Get include path 45 | execute_process( 46 | COMMAND 47 | ${Python3_EXECUTABLE} -c 48 | "import ffcx.codegeneration, sys; sys.stdout.write(ffcx.codegeneration.get_include_path())" 49 | OUTPUT_VARIABLE UFCX_INCLUDE_DIR 50 | ) 51 | 52 | # Get ufcx.h version 53 | if(UFCX_INCLUDE_DIR) 54 | set(UFCX_INCLUDE_DIRS 55 | ${UFCX_INCLUDE_DIR} 56 | CACHE STRING "Where to find ufcx.h" 57 | ) 58 | execute_process( 59 | COMMAND ${Python3_EXECUTABLE} -c 60 | "import ffcx, sys; sys.stdout.write(ffcx.__version__)" 61 | OUTPUT_VARIABLE UFCX_VERSION 62 | ) 63 | endif() 64 | 65 | # Compute hash of ufcx.h 66 | find_file(_UFCX_HEADER "ufcx.h" ${UFCX_INCLUDE_DIR}) 67 | if(_UFCX_HEADER) 68 | file(SHA1 ${_UFCX_HEADER} UFCX_SIGNATURE) 69 | endif() 70 | 71 | mark_as_advanced(UFCX_VERSION UFCX_INCLUDE_DIRS UFCX_SIGNATURE) 72 | find_package_handle_standard_args( 73 | UFCx 74 | REQUIRED_VARS UFCX_INCLUDE_DIRS UFCX_SIGNATURE UFCX_VERSION 75 | VERSION_VAR UFCX_VERSION HANDLE_VERSION_RANGE REASON_FAILURE_MESSAGE 76 | "UFCx could not be found." 77 | ) 78 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | # The CUDOLFINx Python interface must be built without build isolation (PEP517) 2 | # due to its runtime and build time dependency on system built petsc4py and 3 | # mpi4py. 4 | # pip install -r build-requirements.txt 5 | [build-system] 6 | requires = [ 7 | "scikit-build-core[pyproject]>=0.5", 8 | "nanobind>=1.8.0", 9 | "petsc4py", 10 | "mpi4py", 11 | ] 12 | build-backend = "scikit_build_core.build" 13 | 14 | [project] 15 | name = "fenics-cudolfinx" 16 | version = "0.9.0" 17 | description = "CUDA DOLFINx Python interface" 18 | readme = "../README.md" 19 | requires-python = ">=3.9.0" 20 | license = { file = "../COPYING.LESSER" } 21 | authors = [ 22 | { email = "benjaminpachev@gmail.com" }, 23 | { name = "Benjamin Pachev" }, 24 | ] 25 | dependencies = [ 26 | "numpy>=1.21", 27 | "cffi", 28 | "petsc4py", 29 | "mpi4py", 30 | "fenics-basix>=0.9.0,<0.10.0", 31 | "fenics-dolfinx>=0.9.0,<0.10.0", 32 | "fenics-ffcx>=0.9.0,<0.10.0", 33 | "fenics-ufl>=2024.2.0,<2024.3.0", 34 | ] 35 | 36 | [project.optional-dependencies] 37 | docs = ["markdown", "pyyaml", "sphinx", "sphinx_rtd_theme"] 38 | lint = ["ruff"] 39 | optional = ["numba"] 40 | test = ["pytest", "sympy", "scipy", "matplotlib", "fenics-dolfinx[optional]"] 41 | ci = [ 42 | "mypy", 43 | "pytest-xdist", 44 | "types-setuptools", 45 | "fenics-dolfinx[build]", 46 | "fenics-dolfinx[docs]", 47 | "fenics-dolfinx[lint]", 48 | "fenics-dolfinx[optional]", 49 | "fenics-dolfinx[test]", 50 | ] 51 | 52 | [tool.scikit-build] 53 | wheel.packages = ["cudolfinx"] 54 | sdist.exclude = ["*.cpp"] 55 | cmake.build-type = "Release" 56 | wheel.license-files = ["../COPYING*"] 57 | 58 | [tool.pytest] 59 | junit_family = "xunit2" 60 | 61 | [tool.pytest.ini_options] 62 | markers = ["skip_in_parallel: marks tests that should be run in serial only."] 63 | 64 | [tool.mypy] 65 | # Suggested at https://blog.wolt.com/engineering/2021/09/30/professional-grade-mypy-configuration/ 66 | # Goal would be to make all of the below True long-term 67 | disallow_untyped_defs = false 68 | disallow_any_unimported = false 69 | no_implicit_optional = false 70 | check_untyped_defs = false 71 | warn_return_any = false 72 | warn_unused_ignores = false 73 | show_error_codes = true 74 | ignore_missing_imports = true 75 | 76 | 77 | 
[tool.ruff] 78 | line-length = 100 79 | indent-width = 4 80 | 81 | [tool.ruff.lint] 82 | select = [ 83 | "E", # pycodestyle 84 | "W", # pycodestyle 85 | "F", # pyflakes 86 | "I", # isort - use standalone isort 87 | "RUF", # Ruff-specific rules 88 | "UP", # pyupgrade 89 | "ICN", # flake8-import-conventions 90 | "NPY", # numpy-specific rules 91 | "FLY", # use f-string not static joins 92 | ] 93 | ignore = ["UP007", "RUF012"] 94 | allowed-confusables = ["σ"] 95 | 96 | [tool.ruff.lint.isort] 97 | known-first-party = ["basix", "dolfinx", "ffcx", "ufl", "cudolfinx"] 98 | known-third-party = ["gmsh", "numba", "numpy", "pytest", "pyvista"] 99 | section-order = [ 100 | "future", 101 | "standard-library", 102 | "mpi", 103 | "third-party", 104 | "first-party", 105 | "local-folder", 106 | ] 107 | 108 | [tool.ruff.lint.isort.sections] 109 | "mpi" = ["mpi4py", "petsc4py"] 110 | -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(cudolfinx_nanobind) 4 | 5 | # Set C++ standard 6 | set(CMAKE_CXX_STANDARD 20) 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 8 | set(CMAKE_CXX_EXTENSIONS OFF) 9 | 10 | find_package( 11 | Python 12 | COMPONENTS Interpreter Development 13 | REQUIRED 14 | ) 15 | 16 | # Detect the installed nanobind package and import it into CMake 17 | execute_process( 18 | COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir 19 | OUTPUT_STRIP_TRAILING_WHITESPACE 20 | OUTPUT_VARIABLE NB_DIR 21 | ) 22 | list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}") 23 | find_package(nanobind CONFIG REQUIRED) 24 | 25 | execute_process( 26 | COMMAND 27 | ${Python3_EXECUTABLE} -c 28 | "import os, sys, basix; sys.stdout.write(os.path.dirname(basix.__file__))" 29 | OUTPUT_VARIABLE BASIX_PY_DIR 30 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 31 | ERROR_VARIABLE BASIX_ERROR_OUT 32 | OUTPUT_STRIP_TRAILING_WHITESPACE 33 | ) 34 | find_package(Basix REQUIRED CONFIG HINTS ${BASIX_PY_DIR}) 35 | 36 | if(Basix_FOUND) 37 | message(STATUS "Found Basix at ${Basix_DIR}") 38 | endif() 39 | 40 | find_package(DOLFINX REQUIRED CONFIG) 41 | 42 | if(DOLFINX_FOUND) 43 | message(STATUS "Found DOLFINx at ${DOLFINX_DIR}") 44 | endif() 45 | 46 | find_package(CUDOLFINX REQUIRED CONFIG) 47 | 48 | if(CUDOLFINX_FOUND) 49 | message(STATUS "Found CUDOLFINx at ${CUDOLFINX_DIR}") 50 | endif() 51 | 52 | find_package(CUDAToolkit REQUIRED) 53 | 54 | # Create the binding library nanobind handles its own calls to 55 | # target_link_libraries 56 | nanobind_add_module( 57 | cpp 58 | NOMINSIZE 59 | cudolfinx/wrappers/cudolfinx.cpp 60 | cudolfinx/wrappers/fem.cpp 61 | cudolfinx/wrappers/petsc.cpp 62 | ) 63 | 64 | # Add strict compiler flags include(CheckCXXCompilerFlag) 65 | # check_cxx_compiler_flag("-Wall -Werror -pedantic" HAVE_PEDANTIC) 66 | 67 | # if(HAVE_PEDANTIC) # target_compile_options(cpp PRIVATE 68 | # -Wall;-Werror;-pedantic) endif() 69 | 70 | # Add DOLFINx libraries 71 | target_link_libraries(cpp PRIVATE dolfinx) 72 | target_link_libraries(cpp PRIVATE cudolfinx) 73 | target_link_libraries(cpp PRIVATE CUDA::cuda_driver CUDA::nvrtc CUDA::cupti) 74 | target_include_directories(cpp SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 75 | 76 | # Check for petsc4py 77 | execute_process( 78 | COMMAND ${Python_EXECUTABLE} -c 79 | "import petsc4py; print(petsc4py.get_include())" 80 | OUTPUT_VARIABLE PETSC4PY_INCLUDE_DIR 81 | RESULT_VARIABLE PETSC4PY_COMMAND_RESULT 82 | ERROR_QUIET 
OUTPUT_STRIP_TRAILING_WHITESPACE 83 | ) 84 | 85 | if(NOT PETSC4PY_COMMAND_RESULT) 86 | message(STATUS "Found petsc4py include directory at ${PETSC4PY_INCLUDE_DIR}") 87 | target_include_directories(cpp PRIVATE ${PETSC4PY_INCLUDE_DIR}) 88 | else() 89 | message(FATAL_ERROR "petsc4py could not be found.") 90 | endif() 91 | 92 | # Check for mpi4py 93 | execute_process( 94 | COMMAND "${Python_EXECUTABLE}" -c "import mpi4py; print(mpi4py.get_include())" 95 | OUTPUT_VARIABLE MPI4PY_INCLUDE_DIR 96 | RESULT_VARIABLE MPI4PY_COMMAND_RESULT 97 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 98 | ) 99 | 100 | if(NOT MPI4PY_COMMAND_RESULT) 101 | message(STATUS "Found mpi4py include directory at ${MPI4PY_INCLUDE_DIR}") 102 | target_include_directories(cpp PRIVATE ${MPI4PY_INCLUDE_DIR}) 103 | else() 104 | message(FATAL_ERROR "mpi4py could not be found.") 105 | endif() 106 | 107 | set_target_properties(cpp PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) 108 | 109 | install(TARGETS cpp DESTINATION cudolfinx) 110 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDADofMap.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace dolfinx { 14 | namespace fem { 15 | class DofMap; 16 | 17 | /// A wrapper for a cellwise-to-global mapping of degres of freedom 18 | /// that is stored in the device memory of a CUDA device. 19 | class CUDADofMap 20 | { 21 | public: 22 | /// Create an empty dofmap 23 | CUDADofMap(); 24 | 25 | /// Create a dofmap 26 | /// 27 | /// @param[in] dofmap The dofmap to copy to device memory 28 | CUDADofMap( 29 | const dolfinx::fem::DofMap& dofmap, 30 | std::int32_t offset, 31 | std::int32_t ghost_offset, 32 | std::map* restriction 33 | ); 34 | 35 | // constructors without restriction 36 | CUDADofMap(const dolfinx::fem::DofMap* dofmap); 37 | 38 | CUDADofMap(const dolfinx::fem::DofMap& dofmap); 39 | 40 | /// Alternate constructor 41 | CUDADofMap( 42 | const dolfinx::fem::DofMap* dofmap, 43 | std::int32_t offset, 44 | std::int32_t ghost_offset, 45 | std::map* restriction 46 | ); 47 | 48 | /// Destructor 49 | ~CUDADofMap(); 50 | 51 | /// Copy constructor 52 | /// @param[in] dofmap The object to be copied 53 | CUDADofMap(const CUDADofMap& dofmap) = delete; 54 | 55 | /// Move constructor 56 | /// @param[in] dofmap The object to be moved 57 | CUDADofMap(CUDADofMap&& dofmap); 58 | 59 | /// Assignment operator 60 | /// @param[in] dofmap Another CUDADofMap object 61 | CUDADofMap& operator=(const CUDADofMap& dofmap) = delete; 62 | 63 | /// Move assignment operator 64 | /// @param[in] dofmap Another CUDADofMap object 65 | CUDADofMap& operator=(CUDADofMap&& dofmap); 66 | 67 | /// Update the dofmap on the device, possibly with a new restriction 68 | void update(std::int32_t offset, std::int32_t ghost_offset, std::map* restriction); 69 | 70 | /// Get the underlying dofmap on the host 71 | const dolfinx::fem::DofMap* dofmap() const { return _dofmap; } 72 | 73 | /// Get the number of degrees of freedom 74 | int32_t num_dofs() const { return _num_dofs; } 75 | 76 | /// Get the number of cells 77 | int32_t num_cells() const { return _num_cells; } 78 | 79 | /// Get the number of dofs per cell 80 | int32_t num_dofs_per_cell() const { 81 | return _num_dofs_per_cell; } 82 | 83 | /// Get a handle to 
the device-side dofs of each cell
 84 |   CUdeviceptr dofs_per_cell() const {
 85 |     return _ddofs_per_cell; }
 86 | 
 87 |   /// Get the offsets to the first cell containing each degree of freedom
 88 |   CUdeviceptr cells_per_dof_ptr() const {
 89 |     return _dcells_per_dof_ptr; }
 90 | 
 91 |   /// Get the cells containing each degree of freedom
 92 |   CUdeviceptr cells_per_dof() const {
 93 |     return _dcells_per_dof; }
 94 | 
 95 | private:
 96 |   /// The underlying dofmap on the host
 97 |   const dolfinx::fem::DofMap* _dofmap;
 98 | 
 99 |   /// The number of degrees of freedom
100 |   int32_t _num_dofs;
101 | 
102 |   /// The number of cells in the mesh
103 |   int32_t _num_cells;
104 | 
105 |   /// The number of degrees of freedom in each cell
106 |   int32_t _num_dofs_per_cell;
107 | 
108 |   /// The block size
109 |   int32_t _block_size;
110 | 
111 |   /// The degrees of freedom of each cell
112 |   CUdeviceptr _ddofs_per_cell;
113 | 
114 |   /// Offsets to the first cell containing each degree of freedom
115 |   CUdeviceptr _dcells_per_dof_ptr;
116 | 
117 |   /// The cells containing each degree of freedom
118 |   CUdeviceptr _dcells_per_dof;
119 | };
120 | 
121 | } // namespace fem
122 | } // namespace dolfinx
123 | 
124 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # About
 2 | 
 3 | This repository is an add-on extension to the DOLFINx library providing CUDA-accelerated assembly routines. It complements the CUDA linear solvers in PETSc to enable fully GPU-accelerated DOLFINx codes, and is designed to bring GPU acceleration to existing codes with minimal changes.
 4 | 
 5 | # Basic Usage
 6 | 
 7 | ```
 8 | import cudolfinx as cufem
 9 | 
10 | # given UFL forms A and L representing a stiffness matrix and right-hand side
11 | cuda_A = cufem.form(A)
12 | cuda_L = cufem.form(L)
13 | asm = cufem.CUDAAssembler()
14 | # returns a custom type CUDAMatrix
15 | mat = asm.assemble_matrix(cuda_A)
16 | mat.assemble()
17 | # get the underlying PETSc matrix
18 | petsc_mat = mat.mat
19 | # returns a custom type CUDAVector
20 | vec = asm.assemble_vector(cuda_L)
21 | # get the underlying PETSc vector
22 | petsc_vec = vec.vector
23 | ```
24 | 
25 | # Dependencies
26 | 
27 | - dolfinx 0.9.0
28 | - PETSc with CUDA support
29 | - CUDA Toolkit 12.x
30 | 
31 | # Installation
32 | 
33 | There are three ways to do the install, in increasing order of difficulty. Currently, it is not possible to use `cudolfinx` with the existing Conda and Docker distributions of `dolfinx`, because these force installation of PETSc without CUDA support. Consequently, installing `cudolfinx` requires a custom build of the `dolfinx` dependency stack with CUDA-enabled PETSc.
34 | 
35 | ## Docker
36 | 
37 | Using Docker is by far the easiest approach.
38 | 
39 | ```
40 | docker run --gpus all -it benpachev/cudolfinx:v0.9.0-cuda12.6
41 | ```
42 | You may experience errors with the prebuilt container due to a CUDA Toolkit or MPI version mismatch between the host and container. In this case, the Dockerfiles in `docker/` can be modified to use a different CUDA Toolkit or MPI version to build a container that will work with your system. Note that Docker is typically not available on HPC systems, but Docker containers can be converted to Apptainer/Singularity containers.
43 | 
44 | ```
45 | apptainer pull docker://benpachev/cudolfinx:v0.9.0-cuda12.6
46 | apptainer run --nv cudolfinx_v0.9.0-cuda12.6.sif
47 | ```
48 | 
49 | ## Spack
50 | 
51 | Spack is a package-management tool for HPC software that allows a great deal of flexibility in how code and its dependencies are compiled. It has somewhat of a learning curve and typically doesn't work out of the box without some manual configuration, but it can be a good choice for HPC systems without Apptainer installed, or when more control over the compilation process and dependencies is desired. To install with Spack:
52 | 
53 | ```
54 | git clone https://github.com/spack/spack.git
55 | . spack/share/spack/setup-env.sh
56 | spack env create cudolfinx-env
57 | spacktivate cudolfinx-env
58 | git clone https://github.com/bpachev/cuda-dolfinx.git
59 | spack repo add cuda-dolfinx/spack
60 | spack add cuda-dolfinx py-cuda-dolfinx
61 | spack install
62 | ```
63 | 
64 | If this leads to errors, it is likely because either (a) Spack is unable to find a suitable compiler or to properly configure your existing compiler, or (b) Spack is trying to build a poorly supported low-level package from source. To resolve (a), you can usually do `spack compiler add`. Especially on HPC systems, [additional configuration](https://spack-tutorial.readthedocs.io/en/latest/tutorial_configuration.html#compiler-configuration) is usually needed. To resolve (b), you will often need to [force Spack to use existing](https://spack-tutorial.readthedocs.io/en/latest/tutorial_configuration.html#external-packages) low-level software on your system instead of trying to install it from source.
65 | 
66 | ## From Source
67 | 
68 | The difficult part of the install is the dependencies. The Dockerfiles under `docker/` provide a template for how to install them on Debian-based systems (and using Docker is by far the easiest way to get a development environment). Once the dependencies are taken care of, the installation of `cuda-dolfinx` itself is simple.
69 | 
70 | ### C++ Core
71 | ```
72 | cd cpp
73 | mkdir build && cd build
74 | cmake .. -DCUDOLFINX_SKIP_BUILD_TESTS=YES
75 | make install
76 | ```
77 | 
78 | ### Python Bindings
79 | ```
80 | cd python
81 | pip install --check-build-dependencies --no-build-isolation .
82 | ```
83 | 
84 | For help with installing or using the library, feel free to contact me at benjaminpachev@gmail.com. A minimal end-to-end check of the install is sketched below.
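## Verifying the Install

The following is a minimal sketch of an end-to-end check that GPU assembly works, modeled on the comparisons in `python/test/test_cuda_assembly.py`. The mesh, resolution, and function space are arbitrary illustrative choices; run it on a single process with a visible GPU.

```
from mpi4py import MPI
import ufl
from dolfinx import fem, mesh
from dolfinx.fem import petsc as fe_petsc
import cudolfinx as cufem

# Poisson stiffness matrix on a small unit square mesh
domain = mesh.create_unit_square(MPI.COMM_WORLD, 16, 16)
V = fem.functionspace(domain, ("Lagrange", 1))
u, v = ufl.TrialFunction(V), ufl.TestFunction(V)
a = ufl.inner(ufl.grad(u), ufl.grad(v)) * ufl.dx

# Assemble on the GPU
cuda_a = cufem.form(a)
asm = cufem.CUDAAssembler()
A = asm.assemble_matrix(cuda_a)
A.assemble()

# Assemble the same form on the CPU and compare
a_form = fem.form(a)
A_ref = fe_petsc.create_matrix(a_form)
fe_petsc.assemble_matrix(A_ref, a_form)
A_ref.assemble()
print("GPU norm:", A.mat.norm(), "CPU norm:", A_ref.norm())
```

The two Frobenius norms should agree to machine precision.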
85 | -------------------------------------------------------------------------------- /python/test/test_multigpu_assembly.py: -------------------------------------------------------------------------------- 1 | from test_cuda_assembly import make_test_domain, make_ufl 2 | from mpi4py import MPI 3 | import cudolfinx as cufem 4 | from dolfinx import fem as fe 5 | from dolfinx.fem import petsc as fe_petsc 6 | import numpy as np 7 | from petsc4py import PETSc 8 | import json 9 | 10 | def compute_universal_dofmap(mesh, V, res=1000): 11 | """Map the global array of dofs to unique geometric information 12 | 13 | This is needed to compute maps between DG dofs on meshes with different partitioning schemes 14 | """ 15 | 16 | num_local_dofs = V.dofmap.index_map.size_local 17 | 18 | c_to_dofs = V.dofmap.map() 19 | dofs_to_cells = np.zeros(num_local_dofs, dtype=int) 20 | for i, cell in enumerate(c_to_dofs): 21 | for dof in cell: 22 | if dof >= num_local_dofs: continue 23 | dofs_to_cells[dof] = i 24 | dof_coords = V.tabulate_dof_coordinates()[:num_local_dofs] 25 | cell_coords = mesh.geometry.x[mesh.geometry.dofmap] 26 | dof_cell_coords = cell_coords[dofs_to_cells] 27 | dof_coords = mesh.comm.gather(dof_coords, root=0) 28 | dof_cell_coords = mesh.comm.gather(dof_cell_coords, root=0) 29 | if mesh.comm.rank == 0: 30 | dof_coords = (res*np.concat(dof_coords)).astype(int) 31 | dof_cell_coords = (res*np.concat(dof_cell_coords)).astype(int) 32 | i = 0 33 | keys_to_dofs = {} 34 | keys = [] 35 | for d_coords, d_cell_coords in zip(dof_coords, dof_cell_coords): 36 | k = (tuple(d_coords.tolist()), tuple(sorted([tuple(arr.tolist()) for arr in d_cell_coords]))) 37 | keys_to_dofs[k] = i 38 | keys.append(k) 39 | i += 1 40 | 41 | return keys, keys_to_dofs 42 | 43 | def compare_parallel_matrices(mat1, mat2): 44 | """Compare two distributed PETSc matrices 45 | """ 46 | 47 | _, _, data1 = mat1.getValuesCSR() 48 | _, _, data2 = mat2.getValuesCSR() 49 | sum1 = MPI.COMM_WORLD.gather(data1.sum(), root=0) 50 | sum2 = MPI.COMM_WORLD.gather(data2.sum(), root=0) 51 | if MPI.COMM_WORLD.rank == 0: 52 | sum1, sum2 = sum(sum1), sum(sum2) 53 | print(sum1, sum2, np.allclose(sum1, sum2)) 54 | return np.allclose(sum1, sum2) 55 | 56 | def compare_parallel_vectors(vec1, vec2): 57 | """Compare two distributed PETSc vectors 58 | """ 59 | 60 | sum1 = MPI.COMM_WORLD.gather(vec1.array[:].sum(), root=0) 61 | sum2 = MPI.COMM_WORLD.gather(vec2.array[:].sum(), root=0) 62 | if MPI.COMM_WORLD.rank == 0: 63 | sum1, sum2 = sum(sum1), sum(sum2) 64 | print(sum1, sum2, np.allclose(sum1, sum2)) 65 | return np.allclose(sum1, sum2) 66 | 67 | def test_multigpu_assembly(): 68 | """Check assembly operations across multiple GPUs 69 | """ 70 | 71 | domain = make_test_domain() 72 | regular_ufl = make_ufl() 73 | ghosted_domain = cufem.ghost_layer_mesh(domain) 74 | ghosted_ufl = make_ufl(ghosted_domain) 75 | asm = cufem.CUDAAssembler() 76 | for form1, form2 in zip(regular_ufl['matrix'], ghosted_ufl['matrix']): 77 | form1 = fe.form(form1) 78 | form2 = cufem.form(form2) 79 | regular_mat = fe_petsc.create_matrix(form1) 80 | regular_mat.zeroEntries() 81 | fe_petsc.assemble_matrix(regular_mat, form1, bcs=regular_ufl['bcs']) 82 | regular_mat.assemble() 83 | cuda_mat = asm.assemble_matrix(form2, bcs=ghosted_ufl['bcs']) 84 | cuda_mat.assemble() 85 | compare_parallel_matrices(regular_mat, cuda_mat.mat) 86 | 87 | for form1, form2 in zip(regular_ufl['vector'], ghosted_ufl['vector']): 88 | form1 = fe.form(form1) 89 | form2 = cufem.form(form2) 90 | regular_vec = 
fe_petsc.create_vector(form1) 91 | with regular_vec.localForm() as loc: 92 | loc.set(0) 93 | fe_petsc.assemble_vector(regular_vec, form1) 94 | regular_vec.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE) 95 | cuda_vec = asm.assemble_vector(form2) 96 | good = compare_parallel_vectors(regular_vec, cuda_vec.vector) 97 | 98 | if __name__ == "__main__": 99 | 100 | test_multigpu_assembly() 101 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace dolfinx::fem 26 | { 27 | 28 | namespace petsc 29 | { 30 | 31 | template 32 | Mat create_cuda_matrix(const Form& a) 33 | { 34 | la::SparsityPattern pattern = fem::create_sparsity_pattern(a); 35 | pattern.finalize(); 36 | return la::petsc::create_cuda_matrix(a.mesh()->comm(), pattern); 37 | } 38 | 39 | template 40 | Mat create_cuda_matrix_block(std::vector>>>& forms) 41 | { 42 | int rows = forms.size(); 43 | int cols = (rows) ? forms[0].size() : 0; 44 | std::vector*>> a(rows); 45 | std::vector>> patterns(rows); 46 | std::shared_ptr> mesh; 47 | std::array, int>>, 49 | 2> 50 | maps; 51 | std::vector> restricted_row_maps(rows, nullptr); 52 | std::vector> restricted_col_maps(cols, nullptr); 53 | for (int row = 0; row < rows; row++) { 54 | for (int col = 0; col < cols; col++) { 55 | if (auto cuda_form = forms[row][col]; cuda_form) { 56 | auto form = cuda_form->form(); 57 | if (cuda_form->restricted()) { 58 | patterns[row].push_back( 59 | std::make_unique(compute_restricted_sparsity_pattern(cuda_form)) 60 | ); 61 | if (!restricted_row_maps[row]) restricted_row_maps[row] = cuda_form->restriction_index_map(0); 62 | if (!restricted_col_maps[col]) restricted_col_maps[col] = cuda_form->restriction_index_map(1); 63 | } 64 | else { 65 | patterns[row].push_back(std::make_unique( 66 | create_sparsity_pattern(*cuda_form->form()))); 67 | } 68 | a[row].push_back(form); 69 | if (!mesh) mesh = form->mesh(); 70 | } 71 | else { 72 | patterns[row].push_back(nullptr); 73 | a[row].push_back(nullptr); 74 | } 75 | } 76 | } 77 | 78 | std::array>>, 2> V 79 | = fem::common_function_spaces(extract_function_spaces(a)); 80 | std::array, 2> bs_dofs; 81 | std::array>, 2> restricted_maps 82 | = {restricted_row_maps, restricted_col_maps}; 83 | for (std::size_t d = 0; d < 2; ++d) 84 | { 85 | for (std::size_t i = 0; i < V[d].size(); i++) 86 | { 87 | auto& space = V[d][i]; 88 | std::shared_ptr imap = (restricted_maps[d][i]) 89 | ? restricted_maps[d][i] : space->dofmap()->index_map; 90 | 91 | maps[d].emplace_back(*imap, 92 | space->dofmap()->index_map_bs()); 93 | // is dofmap bs is different than indexmap bs? 94 | bs_dofs[d].push_back(space->dofmap()->bs()); 95 | } 96 | } 97 | 98 | // OK now figure out how to build a matrix with this. . . 
99 | std::vector> p(rows); 100 | for (std::size_t row = 0; row < rows; ++row) 101 | for (std::size_t col = 0; col < cols; ++col) 102 | p[row].push_back(patterns[row][col].get()); 103 | 104 | la::SparsityPattern pattern(mesh->comm(), p, maps, bs_dofs); 105 | pattern.finalize(); 106 | return la::petsc::create_cuda_matrix(mesh->comm(), pattern); 107 | } 108 | 109 | } // namespace petsc 110 | } // namespace dolfinx::fem 111 | -------------------------------------------------------------------------------- /docker/Dockerfile.end-user: -------------------------------------------------------------------------------- 1 | # Dockerfile describing end-user CUDA-accelerated FEniCS environments 2 | # Modified version of the DOLFINx end user Docker file 3 | # 4 | # Authors: 5 | # Benjamin Pachev 6 | # 7 | 8 | ARG PYVISTA_VERSION=0.44.2 9 | 10 | # Used to set the correct PYTHONPATH for the real and complex install of 11 | # DOLFINx 12 | ARG PYTHON_VERSION=3.12 13 | # Base image for end-user images 14 | ARG BASEIMAGE=benpachev/cudolfinx:dev-env-v0.9.0 15 | ARG CUDOLFINX_TAG=v0.9.0 16 | 17 | FROM ${BASEIMAGE} as cudolfinx 18 | LABEL description="cuDOLFINx (onbuild)" 19 | 20 | ARG PYTHON_VERSION 21 | 22 | WORKDIR /src 23 | 24 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/dolfinx.git 25 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/ffcx.git 26 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/basix.git 27 | RUN git clone --depth 1 --branch 2024.2.0 https://github.com/FEniCS/ufl.git 28 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/bpachev/cuda-dolfinx.git 29 | 30 | RUN cp dolfinx/docker/dolfinx-real-mode /usr/local/bin/dolfinx-real-mode 31 | RUN chmod +x /usr/local/bin/dolfinx-*-mode 32 | 33 | # CMake build type for DOLFINx C++ build. See CMake documentation. 34 | ARG DOLFINX_CMAKE_BUILD_TYPE="Release" 35 | 36 | # Using pip install `.[test]` with --no-dependencies and --no-build-isolation 37 | # does not install necessary packages, hence install build and optional 38 | # dependencies manually here. 39 | RUN pip install --no-cache-dir -r dolfinx/python/build-requirements.txt && \ 40 | pip install --no-cache-dir pyamg pytest scipy matplotlib numba # test + optional set 41 | 42 | RUN cd basix && cmake -G Ninja -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} -B build-dir -S ./cpp && \ 43 | cmake --build build-dir && \ 44 | cmake --install build-dir && \ 45 | pip install ./python && \ 46 | cd ../ufl && pip install --no-cache-dir . && \ 47 | cd ../ffcx && pip install --no-cache-dir . && \ 48 | cd ../ && pip install --no-cache-dir ipython 49 | 50 | RUN apt-get -qq update && \ 51 | apt-get install -y libboost-timer-dev libboost-filesystem-dev 52 | 53 | # --no-dependencies necessary as --target does not contain any dependencies e.g. 54 | # mpi4py - leading to unwanted rebuild. 55 | RUN cd dolfinx && \ 56 | mkdir -p build-real && \ 57 | cd build-real && \ 58 | PETSC_ARCH=linux-gnu-real64-32-cuda cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local/dolfinx-real -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} ../cpp && \ 59 | ninja install && \ 60 | cd ../python && \ 61 | PETSC_ARCH=linux-gnu-real64-32-cuda pip -v install \ 62 | --config-settings=cmake.build-type="${DOLFINX_CMAKE_BUILD_TYPE}" --config-settings=install.strip=false --no-build-isolation --check-build-dependencies \ 63 | --target /usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages --no-dependencies --no-cache-dir '.' 
64 | 65 | # Currently cuDOLFINX only supports real mode, as the CUDA version of PETSc appears to not compile with complex types . . . . 66 | ENV PKG_CONFIG_PATH=/usr/local/dolfinx-real/lib/pkgconfig:$PKG_CONFIG_PATH \ 67 | CMAKE_PREFIX_PATH=/usr/local/dolfinx-real/lib/cmake:$CMAKE_PREFIX_PATH \ 68 | PETSC_ARCH=linux-gnu-real64-32-cuda \ 69 | PYTHONPATH=/usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages:$PYTHONPATH \ 70 | LD_LIBRARY_PATH=/usr/local/dolfinx-real/lib:$LD_LIBRARY_PATH 71 | 72 | RUN cd cuda-dolfinx && \ 73 | mkdir -p build-real && \ 74 | cd build-real && \ 75 | PETSC_ARCH=linux-gnu-real64-32-cuda cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local/dolfinx-real -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} ../cpp && \ 76 | ninja install && \ 77 | cd ../python && \ 78 | PETSC_ARCH=linux-gnu-real64-32-cuda pip -v install \ 79 | --config-settings=cmake.build-type="${DOLFINX_CMAKE_BUILD_TYPE}" --config-settings=install.strip=false --no-build-isolation --check-build-dependencies \ 80 | --target /usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages --no-dependencies --no-cache-dir '.' 81 | 82 | # Prepending /usr/local to paths is needed to make the correct version of MPI be used (not the one that comes with NVHPC) 83 | # Since this container doesn't currently install GPU aware MPI, PETSc needs the gpu aware MPI option turned off 84 | # TODO: fix the base container to install GPU-aware MPI 85 | ENV PETSC_OPTIONS="-use_gpu_aware_mpi 0" \ 86 | LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \ 87 | PATH=/usr/local/bin:$PATH 88 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | namespace dolfinx::fem { 8 | 9 | // Create a restricted sparsity pattern 10 | // This emulates the functionality in multiphenicsx 11 | // However we don't want to depend on multiphenicsx just for this function 12 | // TODO accelerate sparsity pattern computation with a CUDA kernel 13 | template 14 | dolfinx::la::SparsityPattern compute_restricted_sparsity_pattern(std::shared_ptr> cuda_form) 15 | { 16 | auto form = cuda_form->form(); 17 | auto restriction = cuda_form->get_restriction(); 18 | 19 | std::shared_ptr dofmap0 = form->function_spaces()[0]->dofmap(); 20 | std::shared_ptr dofmap1 = form->function_spaces()[1]->dofmap(); 21 | 22 | int num_cell_dofs = dofmap0->map().extent(1); 23 | std::shared_ptr mesh = form->mesh(); 24 | const std::array index_maps{dofmap0->index_map, 25 | dofmap1->index_map}; 26 | const std::array bs 27 | = {dofmap0->index_map_bs(), dofmap1->index_map_bs()}; 28 | const std::array dofmaps = {dofmap0, dofmap1}; 29 | std::array, 2> restricted_cell_dofs; 30 | std::array, 2> restricted_cell_bounds; 31 | std::array, 2> insertion_dofs; 32 | for (std::size_t d = 0; d < 2; d++) { 33 | auto cell_map = dofmaps[d]->map(); 34 | int num_cells = cell_map.extent(0); 35 | const auto& restriction_map = *restriction[d]; 36 | restricted_cell_bounds[d].reserve(num_cells+1); 37 | restricted_cell_bounds[d].push_back(0); 38 | for (int cell = 0; cell < num_cells; cell++) { 39 | for (auto dof: dofmaps[d]->cell_dofs(cell)) { 40 | if (restriction_map.find(dof) != restriction_map.end()) { 41 | restricted_cell_dofs[d].push_back(restriction_map.at(dof)); 42 | } 43 | } 44 | restricted_cell_bounds[d].push_back(restricted_cell_dofs[d].size()); 45 | } 46 | } 47 | 48 | // Create and build 
sparsity pattern
49 |   la::SparsityPattern pattern(mesh->comm(), index_maps, bs);
50 |
51 |   for (auto integral_type : form->integral_types()) {
52 |     std::vector ids = form->integral_ids(integral_type);
53 |     if (integral_type == dolfinx::fem::IntegralType::interior_facet) {
54 |       for (auto id: ids) {
55 |         auto entities = form->domain(integral_type, id);
56 |         for (std::size_t i = 0; i < entities.size(); i+=4) {
57 |           int cell0 = entities[i];
58 |           int cell1 = entities[i+2];
59 |           for (std::size_t d = 0; d < 2; d++) {
60 |             auto cell_dofs0 = std::span(restricted_cell_dofs[d].data() + restricted_cell_bounds[d][cell0],
61 |                 restricted_cell_bounds[d][cell0+1] - restricted_cell_bounds[d][cell0]);
62 |             auto cell_dofs1 = std::span(restricted_cell_dofs[d].data() + restricted_cell_bounds[d][cell1],
63 |                 restricted_cell_bounds[d][cell1+1] - restricted_cell_bounds[d][cell1]);
64 |             insertion_dofs[d].resize(cell_dofs0.size() + cell_dofs1.size());
65 |             std::copy(cell_dofs0.begin(), cell_dofs0.end(), insertion_dofs[d].begin());
66 |             std::copy(cell_dofs1.begin(), cell_dofs1.end(),
67 |                 std::next(insertion_dofs[d].begin(), cell_dofs0.size()));
68 |           }
69 |           pattern.insert(insertion_dofs[0], insertion_dofs[1]);
70 |         }
71 |       }
72 |     }
73 |     else {
74 |       int increment = (integral_type == dolfinx::fem::IntegralType::exterior_facet) ? 2 : 1;
75 |       for (auto id: ids) {
76 |         auto entities = form->domain(integral_type, id);
77 |         for (std::size_t i = 0; i < entities.size(); i+=increment) {
78 |           int cell = entities[i];
79 |           auto cell_dofs0 = std::span(restricted_cell_dofs[0].data() + restricted_cell_bounds[0][cell],
80 |               restricted_cell_bounds[0][cell+1] - restricted_cell_bounds[0][cell]);
81 |           auto cell_dofs1 = std::span(restricted_cell_dofs[1].data() + restricted_cell_bounds[1][cell],
82 |               restricted_cell_bounds[1][cell+1] - restricted_cell_bounds[1][cell]);
83 |           pattern.insert(cell_dofs0, cell_dofs1);
84 |         }
85 |       }
86 |     }
87 |   }
88 |   return pattern;
89 | }
90 |
91 | } // end namespace dolfinx::fem
92 |
--------------------------------------------------------------------------------
/python/examples/poisson_sum_factorization.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2024 Benjamin Pachev
2 | #
3 | # This file is part of cuDOLFINX
4 | #
5 | # SPDX-License-Identifier: LGPL-3.0-or-later
6 |
7 | import argparse as ap
8 | from mpi4py import MPI
9 | from petsc4py import PETSc
10 | try:
11 |     import cudolfinx as cufem
12 | except ImportError:
13 |     print("Must have cudolfinx to test CUDA assembly.")
14 |
15 | from dolfinx import fem as fe, mesh
16 | from dolfinx.fem import petsc as fe_petsc
17 | import numpy as np
18 | import ufl
19 | import time
20 | from ufl import dx, ds, grad, inner
21 | import basix
22 |
23 | def create_mesh(res: int = 10):
24 |     """Create a uniform hexahedral mesh on the unit cube.
25 |
26 |     Parameters
27 |     ----------
28 |     res - Number of subdivisions along each dimension
29 |
30 |     Returns
31 |     ----------
32 |     mesh - The mesh object.
33 |     """
34 |
35 |     return mesh.create_box(
36 |         comm = MPI.COMM_WORLD,
37 |         points = ((0,0,0), (1, 1, 1)),
38 |         n = (res, res, res),
39 |         cell_type = mesh.CellType.hexahedron,
40 |         ghost_mode = mesh.GhostMode.none,
41 |         dtype = np.float64
42 |     )
43 |
44 | def main(res, cuda=True, sum_factorization=True, degree=1):
45 |     """Assembles a stiffness matrix for the Poisson problem with the given resolution.
46 | """ 47 | 48 | domain = create_mesh(res) 49 | # Tensor product element 50 | family = basix.ElementFamily.P 51 | variant = basix.LagrangeVariant.gll_warped 52 | cell_type = domain.basix_cell() 53 | 54 | basix_element = basix.create_tp_element( 55 | family, cell_type, degree, variant 56 | ) # doesn't work with tp element, why? 57 | element = basix.ufl._BasixElement(basix_element) # basix ufl element 58 | V = fe.functionspace(domain, element) 59 | u = ufl.TrialFunction(V) 60 | v = ufl.TestFunction(V) 61 | x = ufl.SpatialCoordinate(domain) 62 | f = 10*ufl.exp(-((x[0]-.5)**2 + (x[1]-.5)**2 + (x[2]-.5)**2) / .02) 63 | g = ufl.sin(5*x[0])*ufl.sin(5*x[1]) 64 | a = inner(grad(u), grad(v)) * dx 65 | L = inner(f, v) * dx + inner(g, v) * ds 66 | 67 | facets = mesh.locate_entities_boundary( 68 | domain, 69 | dim=(domain.topology.dim - 1), 70 | marker=lambda x: np.isclose(x[0], 0.0) | np.isclose(x[0], 2.0), 71 | ) 72 | 73 | dofs = fe.locate_dofs_topological(V=V, entity_dim=domain.topology.dim-1, entities=facets) 74 | bc = fe.dirichletbc(value=PETSc.ScalarType(0), dofs=dofs, V=V) 75 | 76 | form_compiler_options = {"sum_factorization": sum_factorization} 77 | 78 | if cuda: 79 | a = cufem.form(a, form_compiler_options=form_compiler_options) 80 | asm = cufem.CUDAAssembler() 81 | A = asm.create_matrix(a) 82 | device_bcs = asm.pack_bcs([bc]) 83 | else: 84 | a = fe.form( 85 | a, 86 | form_compiler_options=form_compiler_options, 87 | jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]} 88 | ) 89 | A = fe_petsc.create_matrix(a) 90 | 91 | start = time.time() 92 | if cuda: 93 | asm.assemble_matrix(a, A, bcs=device_bcs) 94 | else: 95 | fe_petsc.assemble_matrix(A, a, bcs=[bc]) 96 | A.assemble() 97 | elapsed = time.time()-start 98 | 99 | timing = MPI.COMM_WORLD.gather(elapsed, root=0) 100 | if MPI.COMM_WORLD.rank == 0: 101 | timing = np.asarray(timing) 102 | timing = np.max(timing) 103 | # show max over all MPI processes, as that's the rate-limiter 104 | print(f"Res={res}, Num cells", domain.topology.index_map(domain.topology.dim).size_global) 105 | print(f"Assembly timing: {timing}, Dofs: {V.dofmap.index_map.size_global}") 106 | 107 | if __name__ == "__main__": 108 | parser = ap.ArgumentParser() 109 | parser.add_argument("--res", default=10, type=int, help="Number of subdivisions in each dimension.") 110 | parser.add_argument("--degree", default=1, type=int, help="Polynomial degree.") 111 | parser.add_argument("--no-sum-factorization", default=False, action="store_true", help="Disable sum factorization") 112 | parser.add_argument("--no-cuda", default=False, action="store_true", help="Disable GPU acceleration.") 113 | args = parser.parse_args() 114 | 115 | main( 116 | res = args.res, 117 | cuda = not args.no_cuda, 118 | sum_factorization = not args.no_sum_factorization, 119 | degree = args.degree 120 | ) 121 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CUDA.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace dolfinx 15 | { 16 | 17 | namespace CUDA 18 | { 19 | class Module; 20 | class Kernel; 21 | 22 | /// This class is a wrapper around a CUDA device context 23 | class Context 24 | { 25 | public: 26 | /// Create a CUDA device context 27 | Context(); 28 | 29 | /// Destructor 30 | ~Context(); 31 | 32 | /// Copy constructor 33 | /// @param[in] context The object to be copied 34 | Context(const Context& context) = delete; 35 | 36 | /// Move constructor 37 | /// @param[in] context The object to be moved 38 | Context(Context&& context) = delete; 39 | 40 | /// Assignment operator 41 | /// @param[in] context The object to assign from 42 | Context& operator=(const Context& context) = delete; 43 | 44 | /// Move assignment operator 45 | /// @param[in] context The object to assign from 46 | Context& operator=(Context&& context) = delete; 47 | 48 | /// Return underlying CUDA device 49 | const CUdevice& device() const; 50 | 51 | /// Return underlying CUDA context 52 | CUcontext& context(); 53 | 54 | private: 55 | CUdevice _device; 56 | CUcontext _context; 57 | }; 58 | 59 | /// This class is a wrapper around a module, which is obtained by 60 | /// compiling PTX assembly to CUDA device code. 61 | class Module 62 | { 63 | public: 64 | /// Create an empty module 65 | Module(); 66 | 67 | /// Create a module 68 | Module( 69 | const CUDA::Context& cuda_context, 70 | const std::string& ptx, 71 | CUjit_target target, 72 | int num_module_load_options, 73 | CUjit_option* module_load_options, 74 | void** module_load_option_values, 75 | bool verbose, 76 | bool debug); 77 | 78 | /// Destructor 79 | ~Module(); 80 | 81 | /// Copy constructor 82 | /// @param[in] module The object to be copied 83 | Module(const Module& module) = delete; 84 | 85 | /// Move constructor 86 | /// @param[in] module The object to be moved 87 | Module(Module&& module); 88 | 89 | /// Assignment operator 90 | /// @param[in] module The object to assign from 91 | Module& operator=(const Module& module) = delete; 92 | 93 | /// Move assignment operator 94 | /// @param[in] module The object to assign from 95 | Module& operator=(Module&& module); 96 | 97 | /// Get a device-side function from a loaded module 98 | CUfunction get_device_function( 99 | const std::string& device_function_name) const; 100 | 101 | /// Get info log for a loaded module 102 | const char* info_log() const { 103 | return _info_log; } 104 | 105 | /// Get error log for a loaded module 106 | const char* error_log() const { 107 | return _error_log; } 108 | 109 | private: 110 | /// Handle to the CUDA module 111 | CUmodule _module; 112 | 113 | /// Size of the buffer for informational log messages 114 | size_t _info_log_size; 115 | 116 | /// Informational log messages related to loading the module 117 | char* _info_log; 118 | 119 | /// Size of the buffer for error log messages 120 | size_t _error_log_size; 121 | 122 | /// Error log messages related to loading the module 123 | char* _error_log; 124 | }; 125 | 126 | /// Use the NVIDIA CUDA Runtime Compilation (nvrtc) library to compile 127 | /// device-side code for a given CUDA program. 
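/// A minimal sketch of the intended round trip, assuming a valid Context
/// `ctx` and a null-terminated kernel source string `src` (all header and
/// option arrays left empty):
///
///   std::string ptx = compile_cuda_cpp_to_ptx(
///       "poisson", 0, nullptr, nullptr, 0, nullptr, src, nullptr, false);
///   CUDA::Module mod(ctx, ptx, get_cujit_target(ctx),
///                    0, nullptr, nullptr, false, false);
///   CUfunction kernel = mod.get_device_function("tabulate_tensor_integral_...");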
128 | std::string compile_cuda_cpp_to_ptx(
129 |   const char* program_name,
130 |   int num_program_headers,
131 |   const char** program_headers,
132 |   const char** program_include_names,
133 |   int num_compile_options,
134 |   const char** compile_options,
135 |   const char* program_src,
136 |   const char* cudasrcdir,
137 |   bool verbose);
138 |
139 | void safeMemAlloc(CUdeviceptr* dptr, size_t bytesize);
140 | void safeMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount);
141 | void safeMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount);
142 | void safeDeviceGetAttribute(int * res, CUdevice_attribute attr, CUdevice dev);
143 | void safeCtxSynchronize();
144 | void safeStreamCreate(CUstream* streamptr, unsigned int flags);
145 |
146 | template <typename T> void safeVectorCreate(CUdeviceptr* dptr, std::vector<T> arr) {
147 |   size_t bytesize = sizeof(T) * arr.size();
148 |   safeMemAlloc(dptr, bytesize);
149 |   safeMemcpyHtoD(*dptr, (void *)arr.data(), bytesize);
150 | }
151 |
152 | CUjit_target get_cujit_target(const Context& cuda_context);
153 |
154 | } // namespace CUDA
155 |
156 |
157 | } // namespace dolfinx
158 |
--------------------------------------------------------------------------------
/python/cudolfinx/jit.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2024 Benjamin Pachev
2 | #
3 | # This file is part of cuDOLFINX
4 | #
5 | # SPDX-License-Identifier: LGPL-3.0-or-later
6 |
7 | """Routines for manipulating generated FFCX code
8 | """
9 |
10 | from dolfinx import fem, cpp
11 | import numpy as np
12 | import pathlib
13 | from typing import Any
14 |
15 | def get_tabulate_tensor_sources(form: fem.Form):
16 |     """Given a compiled fem.Form, extract the C source code of the tabulate tensors
17 |     """
18 |
19 |     module_file = pathlib.Path(form.module.__file__)
20 |     source_filename = module_file.name.split(".")[0] + ".c"
21 |     source_file = module_file.parent.joinpath(source_filename)
22 |     if not source_file.is_file():
23 |         raise IOError(f"Could not find generated ffcx source file '{source_file}'!")
24 |
25 |     tabulate_tensors = []
26 |     parsing_tabulate = False
27 |     parsing_header = False
28 |     bracket_count = 0
29 |     with open(source_file) as fp:
30 |         for line in fp:
31 |             if "tabulate_tensor_integral" in line and line.strip().startswith("void"):
32 |                 parsing_tabulate = True
33 |                 parsing_header = True
34 |                 tabulate_id = line.strip().split()[1].split("_")[-1].split("(")[0]
35 |                 tabulate_body = []
36 |             elif parsing_header:
37 |                 if line.startswith("{"):
38 |                     parsing_header = False
39 |                     bracket_count += 1
40 |             elif parsing_tabulate:
41 |                 if line.startswith("{"): bracket_count += 1
42 |                 elif line.startswith("}"): bracket_count -= 1
43 |                 if not bracket_count:
44 |                     tabulate_tensors.append((tabulate_id, "".join(tabulate_body)))
45 |                     parsing_tabulate = False
46 |                 else:
47 |                     tabulate_body.append(line)
48 |             elif "form_integrals_form" in line:
49 |                 if "{" in line:
50 |                     arr = line.split("{")[-1].split("}")[0]
51 |                     ordered_integral_ids = [
52 |                         part.strip().split("_")[-1] for part in arr.split(",")
53 |                     ]
54 |
55 |     # map ids to order of appearance in tensor list
56 |     id_order = {tabulate[0]: i for i, tabulate in enumerate(tabulate_tensors)}
57 |     integral_tensor_indices = [id_order[integral_id] for integral_id in ordered_integral_ids]
58 |     return tabulate_tensors, integral_tensor_indices
59 |
60 | cuda_tabulate_tensor_header = """
61 | #define alignas(x)
62 | #define restrict __restrict__
63 |
64 | typedef unsigned char uint8_t;
65 | typedef
unsigned int uint32_t; 66 | typedef double ufc_scalar_t; 67 | 68 | extern "C" __global__ 69 | void tabulate_tensor_{factory_name}({scalar_type}* restrict A, 70 | const {scalar_type}* restrict w, 71 | const {scalar_type}* restrict c, 72 | const {geom_type}* restrict coordinate_dofs, 73 | const int* restrict entity_local_index, 74 | const uint8_t* restrict quadrature_permutation 75 | ) 76 | """ 77 | 78 | def _convert_dtype_to_str(dtype: Any): 79 | """Convert numpy dtype to named C type 80 | """ 81 | 82 | if dtype == np.float32: 83 | return "float" 84 | elif dtype == np.float64: 85 | return "double" 86 | else: 87 | raise TypeError(f"Unsupported dtype: '{dtype}'") 88 | 89 | def get_wrapped_tabulate_tensors(form: fem.Form, backend="cuda"): 90 | """Given a fem.Form, wrap the tabulate tensors for use on a GPU 91 | """ 92 | 93 | if backend != "cuda": 94 | raise NotImplementedError(f"Backend '{backend}' not yet supported.") 95 | 96 | # for now assume same type for form and mesh 97 | # this is typically the default 98 | geom_type = scalar_type = _convert_dtype_to_str(form.dtype) 99 | 100 | res = [] 101 | sources, integral_tensor_indices = get_tabulate_tensor_sources(form) 102 | for id, body in sources: 103 | factory_name = "integral_" + id 104 | name = "tabulate_tensor_" + factory_name 105 | header = cuda_tabulate_tensor_header.format( 106 | scalar_type=scalar_type, 107 | geom_type=geom_type, 108 | factory_name=factory_name 109 | ) 110 | wrapped_source = header + "{\n" + body + "}\n" 111 | res.append((name, wrapped_source)) 112 | 113 | return res, integral_tensor_indices 114 | 115 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/caster_petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #ifdef HAS_PETSC 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // nanobind casters for PETSc/petsc4py objects 17 | 18 | namespace nb = nanobind; 19 | 20 | // Import petsc4py on demand 21 | #define VERIFY_PETSC4PY_FROMPY(func) \ 22 | if (!func) \ 23 | { \ 24 | if (import_petsc4py() != 0) \ 25 | return false; \ 26 | } 27 | 28 | #define VERIFY_PETSC4PY_FROMCPP(func) \ 29 | if (!func) \ 30 | { \ 31 | if (import_petsc4py() != 0) \ 32 | return {}; \ 33 | } 34 | 35 | // Macro for casting between PETSc and petsc4py objects 36 | #define PETSC_CASTER_MACRO(TYPE, P4PYTYPE, NAME) \ 37 | template <> \ 38 | class type_caster<_p_##TYPE> \ 39 | { \ 40 | public: \ 41 | NB_TYPE_CASTER(TYPE, const_name(#NAME)) \ 42 | bool from_python(handle src, uint8_t, cleanup_list*) noexcept \ 43 | { \ 44 | VERIFY_PETSC4PY_FROMPY(PyPetsc##P4PYTYPE##_Get); \ 45 | if (PyObject_TypeCheck(src.ptr(), &PyPetsc##P4PYTYPE##_Type) != 0) \ 46 | { \ 47 | value = PyPetsc##P4PYTYPE##_Get(src.ptr()); \ 48 | return true; \ 49 | } \ 50 | else \ 51 | return false; \ 52 | } \ 53 | \ 54 | static handle from_cpp(TYPE src, rv_policy policy, \ 55 | cleanup_list* /*cleanup*/) noexcept \ 56 | { \ 57 | VERIFY_PETSC4PY_FROMCPP(PyPetsc##P4PYTYPE##_New); \ 58 | if (policy == rv_policy::take_ownership) \ 59 | { \ 60 | PyObject* obj = PyPetsc##P4PYTYPE##_New(src); \ 61 | PetscObjectDereference((PetscObject)src); \ 62 | return nb::handle(obj); \ 63 | } \ 64 | else if (policy == rv_policy::automatic_reference \ 65 | or policy == rv_policy::reference) \ 66 | { \ 67 | PyObject* obj = PyPetsc##P4PYTYPE##_New(src); \ 68 | return nb::handle(obj); \ 69 | } \ 70 | else \ 71 | { \ 72 | return {}; \ 73 | } \ 74 | } \ 75 | \ 76 | operator TYPE() { return value; } \ 77 | } 78 | 79 | namespace nanobind::detail 80 | { 81 | PETSC_CASTER_MACRO(Mat, Mat, mat); 82 | PETSC_CASTER_MACRO(Vec, Vec, vec); 83 | } // namespace nanobind::detail 84 | #endif 85 | -------------------------------------------------------------------------------- /python/examples/poisson.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import argparse as ap 8 | from mpi4py import MPI 9 | from petsc4py import PETSc 10 | try: 11 | import cudolfinx as cufem 12 | except ImportError: 13 | pass 14 | from dolfinx import fem as fe, mesh 15 | from dolfinx.fem import petsc as fe_petsc 16 | import numpy as np 17 | import ufl 18 | import time 19 | from ufl import dx, ds, grad, inner 20 | 21 | def create_mesh(res: int = 10, dim: int = 3): 22 | """Create a uniform tetrahedral mesh on the unit cube. 23 | 24 | Parameters 25 | ---------- 26 | res - Number of subdivisions along each dimension 27 | dim - Geometric dimension of mesh 28 | 29 | Returns 30 | ---------- 31 | mesh - The mesh object. 32 | """ 33 | 34 | if dim == 3: 35 | return mesh.create_box( 36 | comm = MPI.COMM_WORLD, 37 | points = ((0,0,0), (1, 1, 1)), 38 | n = (res, res, res), 39 | cell_type = mesh.CellType.tetrahedron 40 | ) 41 | elif dim == 2: 42 | return mesh.create_unit_square(MPI.COMM_WORLD, res, res) 43 | 44 | def main(res, cuda=True, degree=1, dim=3, repeats=1): 45 | """Assembles a stiffness matrix for the Poisson problem with the given resolution. 
46 | """ 47 | 48 | domain = create_mesh(res, dim=dim) 49 | comm = domain.comm 50 | if cuda and comm.size > 1: 51 | if comm.rank == 0: 52 | print("Using ghost layer mesh for CUDA Assembly") 53 | domain = cufem.ghost_layer_mesh(domain) 54 | 55 | V = fe.functionspace(domain, ("Lagrange", degree)) 56 | u = ufl.TrialFunction(V) 57 | v = ufl.TestFunction(V) 58 | x = ufl.SpatialCoordinate(domain) 59 | if dim == 3: 60 | f = 10*ufl.exp(-((x[0]-.05)**2 + (x[1]-.05)**2 + (x[2]-.05)**2) / .02) 61 | elif dim == 2: 62 | f = 10*ufl.exp(-((x[0]-.05)**2 + (x[1]-.05)**2) / .02) 63 | g = ufl.sin(5*x[0])*ufl.sin(5*x[1]) 64 | a = inner(grad(u), grad(v)) * dx 65 | L = inner(f, v) * dx + inner(g, v) * ds 66 | 67 | facets = mesh.locate_entities_boundary( 68 | domain, 69 | dim=(domain.topology.dim - 1), 70 | marker=lambda x: np.isclose(x[0], 0.0) | np.isclose(x[0], 2.0), 71 | ) 72 | 73 | dofs = fe.locate_dofs_topological(V=V, entity_dim=domain.topology.dim-1, entities=facets) 74 | bc = fe.dirichletbc(value=PETSc.ScalarType(0), dofs=dofs, V=V) 75 | 76 | timing = {"mat_assemble":0.0, "vec_assemble": 0.0, "solve": 0.0} 77 | 78 | if cuda: 79 | a = cufem.form(a) 80 | L = cufem.form(L) 81 | asm = cufem.CUDAAssembler() 82 | cuda_A = asm.create_matrix(a) 83 | cuda_b = asm.create_vector(L) 84 | b = cuda_b.vector 85 | device_bcs = asm.pack_bcs([bc]) 86 | else: 87 | a = fe.form(a, jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]}) 88 | L = fe.form(L, jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]}) 89 | A = fe_petsc.create_matrix(a) 90 | b = fe_petsc.create_vector(L) 91 | for i in range(repeats): 92 | start = time.time() 93 | if cuda: 94 | asm.assemble_matrix(a, cuda_A, bcs=device_bcs) 95 | cuda_A.assemble() 96 | A = cuda_A.mat 97 | else: 98 | A.zeroEntries() 99 | fe_petsc.assemble_matrix(A, a, bcs=[bc]) 100 | A.assemble() 101 | timing["mat_assemble"] += time.time()-start 102 | 103 | start = time.time() 104 | if cuda: 105 | asm.assemble_vector(L, cuda_b) 106 | asm.apply_lifting(cuda_b, [a], [device_bcs]) 107 | asm.set_bc(cuda_b, bcs=device_bcs, V=V) 108 | else: 109 | with b.localForm() as b_local: 110 | b_local.set(0) 111 | fe_petsc.assemble_vector(b, L) 112 | fe_petsc.apply_lifting(b, [a], [[bc]]) 113 | fe_petsc.set_bc(b, bcs=[bc]) 114 | timing["vec_assemble"] += time.time() - start 115 | 116 | ksp = PETSc.KSP().create(comm) 117 | ksp.setOperators(A) 118 | ksp.setType("cg") 119 | pc = ksp.getPC() 120 | pc.setType("jacobi") 121 | 122 | start = time.time() 123 | out = b.copy() 124 | ksp.solve(b, out) 125 | timing["solve"] += time.time() - start 126 | 127 | max_timings = {} 128 | for metric in sorted(timing.keys()): 129 | max_timings[metric] = comm.gather(timing[metric]/repeats, root=0) 130 | sol_norm = out.norm() 131 | if comm.rank == 0: 132 | # show max over all MPI processes, as that's the rate-limiter 133 | print(f"Res={res}, Num cells", domain.topology.index_map(domain.topology.dim).size_global) 134 | print(f"Dofs: {V.dofmap.index_map.size_global}") 135 | print(f"Average timing ({repeats} trials):") 136 | for k, v in max_timings.items(): 137 | print(f"\t{k}: {max(v)}s") 138 | 139 | if __name__ == "__main__": 140 | parser = ap.ArgumentParser() 141 | parser.add_argument("--res", default=10, type=int, help="Number of subdivisions in each dimension.") 142 | parser.add_argument("--repeats", default=1, type=int, help="Number of times to repeat the experiment.") 143 | parser.add_argument("--degree", default=1, type=int, help="Polynomial degree.") 144 | parser.add_argument("--dim", 
default=3, type=int, help="Geometric dimension.") 145 | parser.add_argument("--no-cuda", default=False, action="store_true", help="Disable GPU acceleration.") 146 | args = parser.parse_args() 147 | 148 | main(res=args.res, cuda = not args.no_cuda, degree=args.degree, dim=args.dim, repeats=args.repeats) 149 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAMatrix.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace dolfinx::la 15 | { 16 | 17 | class CUDASeqMatrix; 18 | 19 | /// A wrapper for a matrix in the compressed sparse row (CSR) format 20 | /// that is stored in the device memory of a CUDA device. 21 | 22 | class CUDAMatrix 23 | { 24 | public: 25 | /// Create an empty CUDA matrix 26 | CUDAMatrix(); 27 | 28 | /// Create a matrix from a PETSc Mat object 29 | /// 30 | /// @param[in] cuda_context A context for a CUDA device 31 | /// @param[in] A PETSc matrix to copy to the device 32 | /// @param[in] page_lock_values Whether or not to use page-locked 33 | /// memory for the host-side array of 34 | /// non-zero values. 35 | /// @param[in] use_seqaijcusparsegetarray Whether or not to use the 36 | /// function MatSeqAIJCUSPARSEGetArray(),which is only 37 | /// available in a custom-built version of PETSc. If it 38 | /// is set, this will avoid unnecessary copying of data 39 | /// between host and device for matrices of type 40 | /// MATSEQAIJCUSPARSE whenever a CUDA-based assembler 41 | /// is used. 42 | CUDAMatrix( 43 | const CUDA::Context& cuda_context, 44 | Mat A, 45 | bool page_lock_values, 46 | bool use_seqaijcusparsegetarray); 47 | 48 | /// Destructor 49 | ~CUDAMatrix(); 50 | 51 | /// Copy constructor 52 | /// @param[in] matrix The object to be copied 53 | CUDAMatrix(const CUDAMatrix& matrix) = delete; 54 | 55 | /// Move constructor 56 | /// @param[in] matrix The object to be moved 57 | CUDAMatrix(CUDAMatrix&& matrix); 58 | 59 | /// Assignment operator 60 | /// @param[in] matrix Another CUDAMatrix object 61 | CUDAMatrix& operator=(const CUDAMatrix& matrix) = delete; 62 | 63 | /// Move assignment operator 64 | /// @param[in] matrix Another CUDAMatrix object 65 | CUDAMatrix& operator=(CUDAMatrix&& matrix); 66 | 67 | /// Get the underlying PETSc matrix object 68 | Mat mat() { return _A; } 69 | 70 | /// Get the diagonal block of the local part of the matrix 71 | const CUDASeqMatrix * diag() const { return _diag.get(); } 72 | CUDASeqMatrix * diag() { return _diag.get(); } 73 | 74 | /// Get the off-diagonal block of the local part of the matrix 75 | const CUDASeqMatrix * offdiag() const { return _offdiag.get(); } 76 | CUDASeqMatrix * offdiag() { return _offdiag.get(); } 77 | 78 | /// Methods to get off diagonal column mapping 79 | CUdeviceptr colmap() const { return _dcolmap; } 80 | CUdeviceptr colmap_sorted() const { return _dcolmap_sorted; } 81 | CUdeviceptr colmap_sorted_indices() const { return _dcolmap_sorted_indices; } 82 | 83 | /// Get the number of matrix rows 84 | int32_t num_rows() const { return _num_rows; } 85 | 86 | /// Get the number of matrix columns 87 | int32_t num_columns() const { return _num_columns; } 88 | 89 | /// Get the global index of the first row 90 | int32_t local_row_start() const { return _local_row_start; } 91 | 92 | /// 
Get the global index of the last row 93 | int32_t local_row_end() const { return _local_row_end; } 94 | 95 | /// Get the number of local matrix rows 96 | int32_t num_local_rows() const { return _num_local_rows; } 97 | 98 | /// Get the number of local matrix columns 99 | int32_t num_local_columns() const { return _num_local_columns; } 100 | 101 | /// Get the number of local matrix columns in the off-diagonal part 102 | int32_t num_local_offdiag_columns() const { return _num_local_offdiag_columns; } 103 | 104 | /// Update the values of the underlying PETSc matrix by copying 105 | /// values from device memory to host memory. 106 | /// 107 | /// @param[in] cuda_context A context for a CUDA device 108 | void copy_matrix_values_to_host( 109 | const CUDA::Context& cuda_context); 110 | 111 | /// Finalize matrix assembly by calling PETSc's MatAssemblyBegin() 112 | /// and MatAssemblyEnd(). 113 | /// 114 | /// @param[in] type MAT_FLUSH_ASSEMBLY or MAT_FINAL_ASSEMBLY 115 | void apply(MatAssemblyType type); 116 | 117 | void debug_dump(); 118 | 119 | private: 120 | /// Handle to the corresponding PETSc matrix object 121 | Mat _A; 122 | 123 | /// The diagonal block of the local part of the matrix 124 | std::unique_ptr _diag; 125 | 126 | /// The off-diagonal block of the local part of the matrix. 127 | /// This is only used if the matrix is distributed. 128 | std::unique_ptr _offdiag; 129 | 130 | /// Device-side mapping from columns of the local, off-diagonal 131 | /// block of the matrix to columns of the global matrix. 132 | CUdeviceptr _dcolmap; 133 | CUdeviceptr _dcolmap_sorted; 134 | CUdeviceptr _dcolmap_sorted_indices; 135 | 136 | /// The number of rows in the global matrix 137 | int32_t _num_rows; 138 | 139 | /// The number of columns in the global matrix 140 | int32_t _num_columns; 141 | 142 | /// The first row owned by the current MPI process 143 | int32_t _local_row_start; 144 | 145 | /// The last row owned by the current MPI process 146 | int32_t _local_row_end; 147 | 148 | /// The number of rows owned by the current MPI process 149 | int32_t _num_local_rows; 150 | 151 | /// The number of columns owned by the current MPI process 152 | int32_t _num_local_columns; 153 | 154 | /// The number of columns in the off-diagonal part of the local 155 | /// matrix owned by the current MPI process 156 | int32_t _num_local_offdiag_columns; 157 | }; 158 | 159 | } // namespace dolfinx::la 160 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDASeqMatrix.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace dolfinx::la 14 | { 15 | 16 | /// A wrapper for a matrix in the compressed sparse row (CSR) format 17 | /// that is stored in the device memory of a CUDA device. 18 | 19 | class CUDASeqMatrix 20 | { 21 | public: 22 | /// Create an empty CUDA matrix 23 | CUDASeqMatrix(); 24 | 25 | /// Create a matrix from a PETSc Mat object. Note that the Mat must 26 | /// be of type MATSEQAIJ. 27 | /// 28 | /// @param[in] cuda_context A context for a CUDA device 29 | /// @param[in] A PETSc matrix to copy to the device 30 | /// @param[in] page_lock_values Whether or not to use page-locked 31 | /// memory for the host-side array of 32 | /// non-zero values. 
33 | /// @param[in] use_seqaijcusparsegetarray Whether or not to use the 34 | /// function MatSeqAIJCUSPARSEGetArray(),which is only 35 | /// available in a custom-built version of PETSc. If it 36 | /// is set, this will avoid unnecessary copying of data 37 | /// between host and device for matrices of type 38 | /// MATSEQAIJCUSPARSE whenever a CUDA-based assembler 39 | /// is used. 40 | CUDASeqMatrix( 41 | const CUDA::Context& cuda_context, 42 | Mat A, 43 | bool page_lock_values, 44 | bool use_seqaijcusparsegetarray); 45 | 46 | /// Destructor 47 | ~CUDASeqMatrix(); 48 | 49 | /// Copy constructor 50 | /// @param[in] matrix The object to be copied 51 | CUDASeqMatrix(const CUDASeqMatrix& matrix) = delete; 52 | 53 | /// Move constructor 54 | /// @param[in] matrix The object to be moved 55 | CUDASeqMatrix(CUDASeqMatrix&& matrix); 56 | 57 | /// Assignment operator 58 | /// @param[in] matrix Another CUDASeqMatrix object 59 | CUDASeqMatrix& operator=(const CUDASeqMatrix& matrix) = delete; 60 | 61 | /// Move assignment operator 62 | /// @param[in] matrix Another CUDASeqMatrix object 63 | CUDASeqMatrix& operator=(CUDASeqMatrix&& matrix); 64 | 65 | /// Get the underlying PETSc matrix object 66 | Mat mat() { return _A; } 67 | 68 | /// Get the number of matrix rows 69 | int32_t num_rows() const { return _num_rows; } 70 | 71 | /// Get the number of matrix columns 72 | int32_t num_columns() const { return _num_columns; } 73 | 74 | /// Get the global index of the first row 75 | int32_t local_row_start() const { return _local_row_start; } 76 | 77 | /// Get the global index of the last row 78 | int32_t local_row_end() const { return _local_row_end; } 79 | 80 | /// Get the number of local matrix rows 81 | int32_t num_local_rows() const { return _num_local_rows; } 82 | 83 | /// Get the number of local matrix columns 84 | int32_t num_local_columns() const { return _num_local_columns; } 85 | 86 | /// Get a handle to the device-side row pointers 87 | CUdeviceptr row_ptr() const { return _drow_ptr; } 88 | 89 | /// Get the number of local non-zeros 90 | int32_t num_local_nonzeros() const { return _num_local_nonzeros; } 91 | 92 | /// Get a handle to the device-side column indices 93 | CUdeviceptr column_indices() const { return _dcolumn_indices; } 94 | 95 | /// Get a handle to the device-side non-zero values 96 | CUdeviceptr values() const; 97 | 98 | /// Update the values of the underlying PETSc matrix by copying 99 | /// values from device memory to host memory. 100 | /// 101 | /// @param[in] cuda_context A context for a CUDA device 102 | void copy_matrix_values_to_host( 103 | const CUDA::Context& cuda_context); 104 | 105 | /// Finalize matrix assembly by calling PETSc's MatAssemblyBegin() 106 | /// and MatAssemblyEnd(). 
107 | /// 108 | /// @param[in] type MAT_FLUSH_ASSEMBLY or MAT_FINAL_ASSEMBLY 109 | void apply(MatAssemblyType type); 110 | 111 | void debug_dump(); 112 | 113 | private: 114 | /// Handle to the corresponding PETSc matrix object 115 | Mat _A; 116 | 117 | /// Whether or not the host-side array of non-zero values uses 118 | /// page-locked or pinned memory 119 | bool _values_page_locked; 120 | 121 | /// The number of rows in the global matrix 122 | int32_t _num_rows; 123 | 124 | /// The number of columns in the global matrix 125 | int32_t _num_columns; 126 | 127 | /// The first row owned by the current MPI process 128 | int32_t _local_row_start; 129 | 130 | /// The last row owned by the current MPI process 131 | int32_t _local_row_end; 132 | 133 | /// The number of rows owned by the current MPI process 134 | int32_t _num_local_rows; 135 | 136 | /// The number of columns owned by the current MPI process 137 | int32_t _num_local_columns; 138 | 139 | /// Device-side storage for row pointers 140 | CUdeviceptr _drow_ptr; 141 | 142 | /// The number of non-zeros in the global matrix 143 | int32_t _num_local_nonzeros; 144 | 145 | /// Device-side storage for column indices 146 | CUdeviceptr _dcolumn_indices; 147 | 148 | /// Device-side storage for non-zero values 149 | CUdeviceptr _dvalues; 150 | 151 | /// Whether or not the device-side pointer is owned by PETSc and 152 | /// needs to be returned when we are done, or if it was allocated 153 | /// with cuMemAlloc() and needs to be freed with cuMemFree(). 154 | /// 155 | /// For now, PETSc does not provide access to device-side non-zero 156 | /// values, even for matrices that are stored on a CUDA 157 | /// device. Consequently, `_dvalues_petsc_owned` is always false, 158 | /// and there is potentially some unnecessary copying between the 159 | /// host and device. 160 | bool _dvalues_petsc_owned; 161 | }; 162 | 163 | } // namespace dolfinx::la 164 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDAFormConstants.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | namespace dolfinx { 15 | namespace fem { 16 | 17 | /// A wrapper for a form constant with data that is stored in the 18 | /// device memory of a CUDA device. 
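/// Constant values are packed on the host with pack_constants() and
/// mirrored into device memory once at construction; after modifying a
/// constant on the host, call update_constant_values() to refresh the
/// device-side copy.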
19 | template 20 | class CUDAFormConstants 21 | { 22 | public: 23 | 24 | /// Create an empty collection constant values 25 | CUDAFormConstants() 26 | : _form(nullptr) 27 | , _num_constant_values() 28 | , _dconstant_values(0) 29 | { 30 | } 31 | //----------------------------------------------------------------------------- 32 | /// Create a collection constant values from a given form 33 | /// 34 | /// @param[in] cuda_context A context for a CUDA device 35 | /// @param[in] form The variational form whose constants are used 36 | CUDAFormConstants( 37 | const CUDA::Context& cuda_context, 38 | const Form* form) 39 | : _form(form) 40 | , _num_constant_values() 41 | , _dconstant_values(0) 42 | { 43 | CUresult cuda_err; 44 | const char * cuda_err_description; 45 | 46 | const std::vector 47 | constant_values = pack_constants(*_form); 48 | 49 | // Allocate device-side storage for constant values 50 | _num_constant_values = constant_values.size(); 51 | if (_num_constant_values > 0) { 52 | size_t dconstant_values_size = 53 | _num_constant_values * sizeof(T); 54 | cuda_err = cuMemAlloc( 55 | &_dconstant_values, dconstant_values_size); 56 | if (cuda_err != CUDA_SUCCESS) { 57 | cuGetErrorString(cuda_err, &cuda_err_description); 58 | throw std::runtime_error( 59 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 60 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 61 | } 62 | 63 | // Copy constant values to device 64 | cuda_err = cuMemcpyHtoD( 65 | _dconstant_values, constant_values.data(), dconstant_values_size); 66 | if (cuda_err != CUDA_SUCCESS) { 67 | cuMemFree(_dconstant_values); 68 | cuGetErrorString(cuda_err, &cuda_err_description); 69 | throw std::runtime_error( 70 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 71 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 72 | } 73 | } 74 | } 75 | //----------------------------------------------------------------------------- 76 | /// Destructor 77 | ~CUDAFormConstants() 78 | { 79 | if (_dconstant_values) 80 | cuMemFree(_dconstant_values); 81 | } 82 | //----------------------------------------------------------------------------- 83 | /// Copy constructor 84 | /// @param[in] form_constant The object to be copied 85 | CUDAFormConstants(const CUDAFormConstants& form_constant) = delete; 86 | 87 | /// Move constructor 88 | /// @param[in] form_constant The object to be moved 89 | CUDAFormConstants(CUDAFormConstants&& constants) 90 | : _form(constants._form) 91 | , _num_constant_values(constants._num_constant_values) 92 | , _dconstant_values(constants._dconstant_values) 93 | { 94 | constants._form = nullptr; 95 | constants._num_constant_values = 0; 96 | constants._dconstant_values = 0; 97 | } 98 | //----------------------------------------------------------------------------- 99 | /// Assignment operator 100 | /// @param[in] form_constant Another CUDAFormConstants object 101 | CUDAFormConstants& operator=(const CUDAFormConstants& form_constant) = delete; 102 | 103 | /// Move assignment operator 104 | /// @param[in] form_constant Another CUDAFormConstants object 105 | CUDAFormConstants& operator=(CUDAFormConstants&& constants) 106 | { 107 | _form = constants._form; 108 | _num_constant_values = constants._num_constant_values; 109 | _dconstant_values = constants._dconstant_values; 110 | constants._form = nullptr; 111 | constants._num_constant_values = 0; 112 | constants._dconstant_values = 0; 113 | return *this; 114 | } 115 | 
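  // Note: the moved-from object is zeroed above so that its destructor does
  // not cuMemFree() device memory now owned by the destination object.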
//----------------------------------------------------------------------------- 116 | /// Get the number of constant values that the constant applies to 117 | int32_t num_constant_values() const { return _num_constant_values; } 118 | 119 | /// Get the constant values that the constant applies to 120 | CUdeviceptr constant_values() const { return _dconstant_values; } 121 | 122 | /// Update the constant values by copying values from host to device 123 | void update_constant_values() const 124 | { 125 | CUresult cuda_err; 126 | const char * cuda_err_description; 127 | 128 | // Pack constants into an array 129 | const std::vector 130 | constant_values = pack_constants(*_form); 131 | assert(_num_constant_values == constant_values.size()); 132 | 133 | // Copy constant values to device 134 | if (_num_constant_values > 0) { 135 | size_t dconstant_values_size = 136 | _num_constant_values * sizeof(T); 137 | cuda_err = cuMemcpyHtoD( 138 | _dconstant_values, constant_values.data(), dconstant_values_size); 139 | if (cuda_err != CUDA_SUCCESS) { 140 | cuMemFree(_dconstant_values); 141 | cuGetErrorString(cuda_err, &cuda_err_description); 142 | throw std::runtime_error( 143 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 144 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 145 | } 146 | } 147 | } 148 | //----------------------------------------------------------------------------- 149 | 150 | 151 | private: 152 | // The form that the constant applies to 153 | const Form* _form; 154 | 155 | /// The number of constant values 156 | int32_t _num_constant_values; 157 | 158 | /// The constant values 159 | CUdeviceptr _dconstant_values; 160 | }; 161 | 162 | } // namespace fem 163 | } // namespace dolfinx 164 | 165 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAVector.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | namespace dolfinx::la 14 | { 15 | 16 | /// A wrapper for a dense vector that is stored in the device memory 17 | /// of a CUDA device. 18 | 19 | class CUDAVector 20 | { 21 | public: 22 | /// Create an empty CUDA vector 23 | CUDAVector(); 24 | 25 | /// Create a vector from a PETSc Vec object 26 | /// 27 | /// @param[in] cuda_context A context for a CUDA device 28 | /// @param[in] x PETSc vector to copy to the device 29 | /// @param[in] page_lock_values Whether or not to use page-locked 30 | /// memory for the host-side array of 31 | /// values. 
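  /// @param[in] include_ghosts Whether or not ghost values are included
  ///                           in the device-side array of values.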
32 | CUDAVector( 33 | const CUDA::Context& cuda_context, 34 | Vec x, 35 | bool page_lock_values = true, 36 | bool include_ghosts = true); 37 | 38 | 39 | /*template 40 | CUDAVector(const CUDA::Context& cuda_context, std::shared_ptr> x) 41 | : CUDAVector(cuda_context, la::petsc::create_vector_wrap(*x)) 42 | { 43 | }*/ 44 | 45 | /// Destructor 46 | ~CUDAVector(); 47 | 48 | /// Copy constructor 49 | /// @param[in] vector The object to be copied 50 | CUDAVector(const CUDAVector& vector) = delete; 51 | 52 | /// Move constructor 53 | /// @param[in] vector The object to be moved 54 | CUDAVector(CUDAVector&& vector); 55 | 56 | /// Assignment operator 57 | /// @param[in] vector Another CUDAVector object 58 | CUDAVector& operator=(const CUDAVector& vector) = delete; 59 | 60 | /// Move assignment operator 61 | /// @param[in] vector Another CUDAVector object 62 | CUDAVector& operator=(CUDAVector&& vector); 63 | 64 | /// Get a handle to the underlying PETSc vector object 65 | const Vec vector() const { return _x; } 66 | 67 | /// Get the number of vector values 68 | int32_t num_values() const { return _num_values; } 69 | 70 | /// Get the number of local vector values 71 | int32_t num_local_values() const { return _num_local_values; } 72 | 73 | /// Get the number of local vector values 74 | int32_t num_local_ghosted_values() const { return _num_local_ghosted_values; } 75 | 76 | bool ghosted() const; 77 | 78 | /// Get a handle to the device-side non-zero values 79 | CUdeviceptr values() const; 80 | 81 | /// Return a handle to the device-side non-zero values 82 | void restore_values() const; 83 | 84 | /// Get a handle to the device-side non-zero values 85 | CUdeviceptr values_write() const; 86 | 87 | /// Return a handle to the device-side non-zero values 88 | void restore_values_write() const; 89 | 90 | /// Update the device-side vector values from the underlying PETSc 91 | /// vector. If the PETSc vector resides in host memory, then the 92 | /// values are copied from host memory to device memory. This does 93 | /// nothing if the PETSc vector is already held in device memory. 94 | /// 95 | /// @param[in] cuda_context A context for a CUDA device 96 | void copy_vector_values_to_device( 97 | const CUDA::Context& cuda_context); 98 | 99 | /// Update the values of the underlying PETSc vector. If the PETSc 100 | /// vector resides in host memory, then the values are copied from 101 | /// device memory to host memory. This does nothing if the PETSc 102 | /// vector is already held in device memory. 103 | /// 104 | /// @param[in] cuda_context A context for a CUDA device 105 | void copy_vector_values_to_host( 106 | const CUDA::Context& cuda_context); 107 | 108 | /// Update the device-side values of ghost nodes from the underlying 109 | /// PETSc vector. If the PETSc vector resides in host memory, then 110 | /// values are copied from host memory to device memory. This does 111 | /// nothing if the PETSc vector is already held in device memory. 112 | /// 113 | /// @param[in] cuda_context A context for a CUDA device 114 | void copy_ghost_values_to_device( 115 | const CUDA::Context& cuda_context); 116 | 117 | /// Update the values of ghost nodes of the underlying PETSc vector. 118 | /// If the PETSc vector resides in host memory, then ghost values 119 | /// are copied from device memory to host memory. This does nothing 120 | /// if the PETSc vector is already held in device memory. 
121 | /// 122 | /// @param[in] cuda_context A context for a CUDA device 123 | void copy_ghost_values_to_host( 124 | const CUDA::Context& cuda_context); 125 | 126 | /// Update vector entries that are owned by this process, but are 127 | /// represented as ghost values on other processes. 128 | void apply_ghosts( 129 | const CUDA::Context& cuda_context); 130 | 131 | /// Update vector entries corresponding to ghost values, meaning 132 | /// that ghost values are gathered from other processes that own 133 | /// them. 134 | bool update_ghosts( 135 | const CUDA::Context& cuda_context); 136 | 137 | private: 138 | /// Handle to the corresponding PETSc vector object 139 | Vec _x; 140 | 141 | /// Handle to the corresponding local PETSc vector object, if the 142 | /// vector is distributed. 143 | Vec _x_local; 144 | 145 | /// Whether or not the host-side array of values uses page-locked or 146 | /// pinned memory 147 | bool _values_page_locked; 148 | 149 | bool _include_ghosts; 150 | 151 | /// The number of values in the global vector 152 | int32_t _num_values; 153 | 154 | /// The number of values owned by the current MPI rank 155 | int32_t _num_local_values; 156 | 157 | /// The number of values owned by the current MPI rank 158 | int32_t _num_local_ghosted_values; 159 | 160 | /// The first value owned by the current MPI rank 161 | int32_t _local_values_start; 162 | 163 | /// The last value owned by the current MPI rank 164 | int32_t _local_values_end; 165 | 166 | /// Device-side storage for non-zero values 167 | mutable CUdeviceptr _dvalues; 168 | 169 | /// Whether or not the device-side pointer is owned by PETSc and 170 | /// needs to be returned when we are done, or if it was allocated 171 | /// with cuMemAlloc() and needs to be freed with cuMemFree(). 172 | bool _dvalues_petsc_owned; 173 | 174 | public: 175 | bool debug; 176 | }; 177 | 178 | } // namespace dolfinx::la 179 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/petsc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | Mat la::petsc::create_cuda_matrix(MPI_Comm comm, const SparsityPattern& sp) 15 | { 16 | PetscErrorCode ierr; 17 | Mat A; 18 | 19 | // Get IndexMaps from sparsity patterm, and block size 20 | std::array maps = {sp.index_map(0), sp.index_map(1)}; 21 | const std::array bs = {sp.block_size(0), sp.block_size(1)}; 22 | dolfinx::common::IndexMap col_map = sp.column_index_map(); 23 | 24 | // Get global and local dimensions 25 | const std::int64_t M = bs[0] * maps[0]->size_global(); 26 | const std::int64_t N = bs[1] * maps[1]->size_global(); 27 | const std::int32_t m = bs[0] * maps[0]->size_local(); 28 | const std::int32_t n = bs[1] * maps[1]->size_local(); 29 | 30 | // Build data to initialise sparsity pattern (modify for block size) 31 | std::vector _row_ptr; 32 | // Need to ensure correct int type. . . 
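  // (PetscInt can be 32- or 64-bit depending on the PETSc build, while the
  // sparsity pattern's graph uses DOLFINx's own index types.)
  // Unrolling example for the loop below: with bs = {2, 2}, a blocked edge
  // (row 0 -> column 3) expands to scalar rows 0 and 1, each holding
  // columns 6 and 7.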
33 | std::vector _column_indices; 34 | auto [_edges, _offsets] = sp.graph(); 35 | 36 | // The CUDA assembly kernels aren't currently robust to matrices with variable block size 37 | // So for now always unroll to 1 38 | _row_ptr.resize(m+1); 39 | _row_ptr[0] = 0; 40 | _column_indices.resize(_edges.size()*bs[0]*bs[1]); 41 | // index indicating where we are in _edges 42 | std::size_t edge_index = 0; 43 | std::size_t unrolled_edge_index = 0; 44 | // Iterate over (blocked) rows 45 | for (std::size_t row = 0; row < maps[0]->size_local(); row++) { 46 | // TODO test with differing block sizes to ensure this is still valid 47 | PetscInt row_nnz = _offsets[row+1] - _offsets[row]; 48 | PetscInt unrolled_row_nnz = row_nnz * bs[1]; 49 | 50 | // row ptr 51 | for (std::size_t unrolled_row = bs[0]*row; unrolled_row < bs[0]*(row+1); unrolled_row++) 52 | _row_ptr[unrolled_row+1] = _row_ptr[unrolled_row] + unrolled_row_nnz; 53 | 54 | for (std::size_t j = 0; j < row_nnz; j++) { 55 | for (std::size_t k = 0; k < bs[1]; k++) 56 | _column_indices[unrolled_edge_index + j*bs[1] + k] = bs[1]*_edges[edge_index+j] + k; 57 | } 58 | // Unroll row block 59 | for (std::size_t l = 1; l < bs[0]; l++) 60 | std::copy_n(std::next(_column_indices.begin(), unrolled_edge_index), unrolled_row_nnz, 61 | std::next(_column_indices.begin(), unrolled_edge_index + l*unrolled_row_nnz)); 62 | 63 | edge_index += row_nnz; 64 | unrolled_edge_index += bs[0] * unrolled_row_nnz; 65 | } 66 | 67 | 68 | // convert local column indices to global ones (unrolling blocked indices) 69 | std::vector global_column_indices(_column_indices.size()); 70 | auto col_local_size = bs[1]*col_map.size_local(); 71 | auto col_ghosts = col_map.ghosts(); 72 | auto col_local_range = bs[1]*col_map.local_range()[0]; 73 | for (std::size_t i = 0; i < _column_indices.size(); i++) { 74 | 75 | if (_column_indices[i] < col_local_size) 76 | global_column_indices[i] = _column_indices[i] + col_local_range; 77 | else { 78 | int diff = _column_indices[i] - col_local_size; 79 | global_column_indices[i] = bs[1] * col_ghosts[diff / bs[1]] + diff % bs[1]; 80 | } 81 | } 82 | MatCreateMPIAIJWithArrays(comm, m, n, M, N, _row_ptr.data(), global_column_indices.data(), nullptr, &A); 83 | // Change matrix type to CUDA 84 | ierr = MatSetType(A, MATMPIAIJCUSPARSE); 85 | if (ierr != 0) 86 | petsc::error(ierr, __FILE__, "MatSetType"); 87 | 88 | // Set block sizes 89 | ierr = MatSetBlockSizes(A, 1, 1); 90 | if (ierr != 0) 91 | petsc::error(ierr, __FILE__, "MatSetBlockSizes"); 92 | 93 | // Create PETSc local-to-global map/index sets 94 | ISLocalToGlobalMapping local_to_global0; 95 | // create unrolled global indices 96 | const std::vector map0 = maps[0]->global_indices(); 97 | std::vector _map0; 98 | _map0.resize(map0.size() * bs[0]); 99 | for (size_t i = 0; i < map0.size(); i++) 100 | for (size_t j = 0; j < bs[0]; j++) 101 | _map0[i*bs[0] + j] = map0[i]*bs[0] + j; 102 | //const std::vector _map0(map0.begin(), map0.end()); 103 | ierr = ISLocalToGlobalMappingCreate(MPI_COMM_SELF, 1, _map0.size(), 104 | _map0.data(), PETSC_COPY_VALUES, 105 | &local_to_global0); 106 | 107 | if (ierr != 0) 108 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingCreate"); 109 | 110 | // Check for common index maps 111 | if (maps[0] == maps[1] and bs[0] == bs[1]) 112 | { 113 | ierr = MatSetLocalToGlobalMapping(A, local_to_global0, local_to_global0); 114 | if (ierr != 0) 115 | petsc::error(ierr, __FILE__, "MatSetLocalToGlobalMapping"); 116 | } 117 | else 118 | { 119 | ISLocalToGlobalMapping local_to_global1; 120 | 
const std::vector map1 = maps[1]->global_indices(); 121 | std::vector _map1; 122 | _map1.resize(map1.size() * bs[1]); 123 | for (size_t i = 0; i < map1.size(); i++) 124 | for (size_t j = 0; j < bs[1]; j++) 125 | _map1[i*bs[1] + j] = map1[i]*bs[1] + j; 126 | //const std::vector _map1(map1.begin(), map1.end()); 127 | ierr = ISLocalToGlobalMappingCreate(MPI_COMM_SELF, 1, _map1.size(), 128 | _map1.data(), PETSC_COPY_VALUES, 129 | &local_to_global1); 130 | if (ierr != 0) 131 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingCreate"); 132 | ierr = MatSetLocalToGlobalMapping(A, local_to_global0, local_to_global1); 133 | if (ierr != 0) 134 | petsc::error(ierr, __FILE__, "MatSetLocalToGlobalMapping"); 135 | ierr = ISLocalToGlobalMappingDestroy(&local_to_global1); 136 | if (ierr != 0) 137 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingDestroy"); 138 | } 139 | 140 | // Clean up local-to-global 0 141 | ierr = ISLocalToGlobalMappingDestroy(&local_to_global0); 142 | if (ierr != 0) 143 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingDestroy"); 144 | 145 | // Set some options on Mat object 146 | ierr = MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE); 147 | if (ierr != 0) 148 | petsc::error(ierr, __FILE__, "MatSetOption"); 149 | ierr = MatSetOption(A, MAT_KEEP_NONZERO_PATTERN, PETSC_TRUE); 150 | if (ierr != 0) 151 | petsc::error(ierr, __FILE__, "MatSetOption"); 152 | return A; 153 | } 154 | 155 | -------------------------------------------------------------------------------- /python/test/test_cuda_assembly.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import petsc4py 8 | from petsc4py import PETSc 9 | from mpi4py import MPI 10 | from dolfinx import fem as fe, mesh 11 | from dolfinx.fem import petsc 12 | import ufl 13 | import numpy as np 14 | import cudolfinx as cufem 15 | from cudolfinx.form import BlockCUDAForm 16 | from basix.ufl import element, mixed_element 17 | 18 | """ 19 | @author Benjamin Pachev 20 | @copyright 2024 21 | 22 | A set of simple variational forms to test the correctness of CUDA-accelerated assembly. 23 | """ 24 | 25 | 26 | def make_mixed_form(): 27 | """Test compilation of a mixed form. 
28 | """ 29 | 30 | domain = mesh.create_unit_square(MPI.COMM_WORLD, 10, 10, mesh.CellType.triangle) 31 | el = element("P", domain.basix_cell(), 1) 32 | 33 | V = fe.functionspace(domain, el) 34 | u = ufl.TrialFunction(V) 35 | p = ufl.TestFunction(V) 36 | A = ufl.dot(ufl.grad(u), ufl.grad(p)) * ufl.dx 37 | F = fe.form(A) 38 | mat = fe.assemble_matrix(F) 39 | 40 | def make_test_domain(): 41 | """Make a test domain 42 | """ 43 | 44 | n = 19 45 | m = 27 46 | return mesh.create_unit_square(MPI.COMM_WORLD, n, m, mesh.CellType.triangle) 47 | 48 | def make_ufl(domain=None): 49 | """Create the UFL needed for making the forms 50 | """ 51 | 52 | if domain is None: 53 | domain = make_test_domain() 54 | 55 | V = fe.functionspace(domain, ("P", 1)) 56 | V_dg = fe.functionspace(domain, ("DG", 1)) 57 | u = fe.Function(V) 58 | p = ufl.TestFunction(V) 59 | p_dg = ufl.TestFunction(V_dg) 60 | n = ufl.FacetNormal(domain) 61 | u.interpolate(lambda x: x[0]**2 + x[1]) 62 | u_dg = fe.Function(V_dg) 63 | u_dg.interpolate(lambda x: x[0]**2 + x[1]) 64 | kappa = fe.Function(V) 65 | kappa.interpolate(lambda x: np.sin(x[0])*np.cos(x[1])) 66 | 67 | functional = ( 68 | ufl.exp(u)*kappa * ufl.dx + 69 | u*kappa * ufl.ds + 70 | ufl.avg(u_dg**2) * ufl.avg(kappa) * ufl.dS 71 | ) 72 | 73 | cell_residual = (ufl.exp(u)*p*kappa + ufl.dot(ufl.grad(u), ufl.grad(p))) * ufl.dx 74 | exterior_facet_residual = u*kappa*p * ufl.dot(ufl.grad(u), n) * ufl.ds 75 | interior_facet_residual = ufl.avg(p_dg) * ufl.avg(kappa) * ufl.avg(u_dg**2) * ufl.dS 76 | 77 | cell_jac = ufl.derivative(cell_residual, u) 78 | exterior_jac = ufl.derivative(exterior_facet_residual, u) 79 | interior_jac = ufl.derivative(interior_facet_residual, u_dg) 80 | 81 | f = fe.Function(V) 82 | f.interpolate(lambda x: x[0] +x[1]) 83 | dofs = fe.locate_dofs_geometrical(V, lambda x: np.isclose(x[0], 0)) 84 | bc = fe.dirichletbc(f, dofs) 85 | 86 | return { 87 | "coeff": kappa, 88 | "bcs": [bc], 89 | "scalar": [functional], 90 | "vector": [cell_residual, exterior_facet_residual, interior_facet_residual], 91 | "matrix": [cell_jac, exterior_jac, interior_jac]} 92 | 93 | def test_assembly(): 94 | """Test correctness of assembly 95 | """ 96 | 97 | ufl_forms = make_ufl() 98 | 99 | 100 | for i, form in enumerate(ufl_forms["vector"]): 101 | fenics_form = fe.form(form) 102 | vec = petsc.create_vector(fenics_form) 103 | petsc.assemble_vector(vec, fenics_form) 104 | 105 | for i, form in enumerate(ufl_forms["matrix"]): 106 | fenics_form = fe.form(form) 107 | mat = petsc.create_matrix(fenics_form) 108 | mat.zeroEntries() 109 | petsc.assemble_matrix(mat, fenics_form) 110 | mat.assemble() 111 | 112 | def compare_mats(matcsr, matpetsc): 113 | """Compare a native FEniCS MatrixCSR to a PETSc matrix 114 | """ 115 | 116 | indptr, indices, data = matpetsc.getValuesCSR() 117 | bad = np.where(~np.isclose(matcsr.data, data))[0] 118 | assert np.allclose(matcsr.data, data) 119 | 120 | def compare_vecs(vecfenics, vecpetsc): 121 | assert np.allclose(vecfenics.array, vecpetsc.array) 122 | 123 | def test_cuda_assembly(): 124 | """Check assembly on GPU 125 | """ 126 | 127 | 128 | ufl_forms = make_ufl() 129 | asm = cufem.CUDAAssembler() 130 | 131 | for i, form in enumerate(ufl_forms["scalar"]): 132 | fenics_form = fe.form(form) 133 | cuda_form = cufem.form(form) 134 | value1 = fe.assemble_scalar(fenics_form) 135 | value2 = asm.assemble_scalar(cuda_form) 136 | assert np.allclose(value1, value2) 137 | 138 | for i, form in enumerate(ufl_forms['vector']): 139 | f = fe.form(form) 140 | vec1 = fe.assemble_vector(f) 141 
| vec2 = asm.assemble_vector(cufem.form(form)) 142 | compare_vecs(vec1, vec2.vector) 143 | 144 | for i, form in enumerate(ufl_forms['matrix']): 145 | f = fe.form(form) 146 | Mat1 = fe.assemble_matrix(f, bcs=ufl_forms['bcs']) 147 | Mat2 = asm.assemble_matrix(cufem.form(form), bcs=ufl_forms['bcs']) 148 | Mat2.assemble() 149 | # now we need to compare the two 150 | compare_mats(Mat1, Mat2.mat) 151 | 152 | def test_reassembly(): 153 | """Ensure correct assembly when coefficients are updated 154 | """ 155 | 156 | ufl_forms = make_ufl() 157 | coeff = ufl_forms["coeff"] 158 | cuda_vec_form = cufem.form(ufl_forms["vector"][0]) 159 | vec_form = cuda_vec_form.dolfinx_form 160 | #mat_form = fe.form(ufl_forms["matrix"][0]) 161 | asm = cufem.CUDAAssembler() 162 | vec_cuda = asm.assemble_vector(cuda_vec_form) 163 | vec_fe = fe.assemble_vector(vec_form) 164 | compare_vecs(vec_fe, vec_cuda.vector) 165 | 166 | for d in [2,3]: 167 | coeff.interpolate(lambda x: x[0]**d + x[1]**d) 168 | vec_fe.array[:] = 0 169 | cuda_vec_form.to_device() 170 | fe.assemble_vector(vec_fe.array, vec_form) 171 | asm.assemble_vector(cuda_vec_form, vec_cuda) 172 | 173 | compare_vecs(vec_fe, vec_cuda.vector) 174 | 175 | def test_lifting(): 176 | """Ensure lifting and bc setting work correctly 177 | """ 178 | 179 | ufl_forms = make_ufl() 180 | asm = cufem.CUDAAssembler() 181 | for vec_form, mat_form in zip(ufl_forms['vector'][1:2], ufl_forms['matrix'][1:2]): 182 | L = fe.form(vec_form) 183 | vec_cuda = asm.assemble_vector(cufem.form(vec_form)) 184 | vec_fe = fe.assemble_vector(L) 185 | cuda_a = cufem.form(mat_form) 186 | a = cuda_a.dolfinx_form 187 | compare_vecs(vec_fe, vec_cuda.vector) 188 | fe.set_bc(vec_fe.array, ufl_forms['bcs']) 189 | asm.set_bc(vec_cuda, ufl_forms['bcs'], L.function_spaces[0]) 190 | compare_vecs(vec_fe, vec_cuda.vector) 191 | fe.apply_lifting(vec_fe.array, [a], [ufl_forms['bcs']]) 192 | asm.apply_lifting(vec_cuda, [cuda_a], [ufl_forms['bcs']]) 193 | compare_vecs(vec_fe, vec_cuda.vector) 194 | 195 | def test_block_assembly(): 196 | """Test that basic block assembly works properly.""" 197 | 198 | domain = make_test_domain() 199 | V1 = fe.functionspace(domain, ("P", 1)) 200 | V2 = fe.functionspace(domain, ("P", 1)) 201 | p1, p2 = ufl.TestFunction(V1), ufl.TestFunction(V2) 202 | 203 | u1, u2 = fe.Function(V1), fe.Function(V2) 204 | u1.interpolate(lambda x: x[0]**2 + x[1]**3) 205 | u2.interpolate(lambda x: 1 + x[0] + x[1]**2) 206 | b1 = ufl.dot(ufl.grad(u1), ufl.grad(p1)) * ufl.dx 207 | b2 = ufl.dot(ufl.grad(u2), ufl.grad(p2)) * ufl.dx 208 | 209 | asm = cufem.CUDAAssembler() 210 | cuda_L = cufem.form([b1,b2]) 211 | 212 | vec_cuda = asm.create_vector_block(cuda_L) 213 | asm.assemble_vector_block(cuda_L, vec_cuda) 214 | 215 | vec_fe = fe.petsc.create_vector_block(cuda_L.dolfinx_forms) 216 | # TODO - update this when switching to DOLFINx v0.10.0 217 | fe.petsc.assemble_vector_block(vec_fe, cuda_L.dolfinx_forms, [[None], [None]]) 218 | compare_vecs(vec_fe, vec_cuda.vector) 219 | 220 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDAForm.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | namespace dolfinx { 26 | 27 | namespace fem { 28 | 29 | /// Consolidates all form classes into one 30 | template > 32 | class CUDAForm 33 | { 34 | 35 | public: 36 | /// Create GPU copies of data needed for assembly 37 | /// 38 | /// @param[in] cuda_context A context for a CUDA device 39 | /// @param[in] form Pointer to the variational form 40 | CUDAForm( 41 | const CUDA::Context& cuda_context, 42 | Form* form, 43 | ufcx_form* ufcx_form, 44 | std::vector& tabulate_tensor_names, 45 | std::vector& tabulate_tensor_sources, 46 | std::vector& integral_tensor_indices 47 | ) 48 | : _coefficients(cuda_context, form, _dofmap_store) 49 | , _constants(cuda_context, form) 50 | , _form(form) 51 | , _ufcx_form(ufcx_form) 52 | , _compiled(false) 53 | { 54 | _coefficients = CUDAFormCoefficients(cuda_context, form, _dofmap_store); 55 | const int* integral_offsets = ufcx_form->form_integral_offsets; 56 | if (integral_offsets[3] != integral_tensor_indices.size()) { 57 | throw std::runtime_error("UFCx form has " + std::to_string(integral_offsets[3]) 58 | + " integrals, but only " + std::to_string(tabulate_tensor_names.size()) 59 | + " tabulate tensor sources provided to CUDAForm!" 60 | ); 61 | } 62 | for (int i = 0; i < 3; i++) { 63 | for (int offset = integral_offsets[i]; offset < integral_offsets[i+1]; offset++) { 64 | int id = ufcx_form->form_integral_ids[offset]; 65 | int tensor_offset = integral_tensor_indices[offset]; 66 | _cuda_integrals[i].insert({id, {tabulate_tensor_names[tensor_offset], tabulate_tensor_sources[tensor_offset]}}); 67 | } 68 | } 69 | } 70 | 71 | /// Compile form on GPU 72 | /// Under the hood, this creates the integrals 73 | void compile( 74 | const CUDA::Context& cuda_context, 75 | int32_t max_threads_per_block, 76 | int32_t min_blocks_per_multiprocessor, 77 | enum assembly_kernel_type assembly_kernel_type) 78 | { 79 | auto cujit_target = CUDA::get_cujit_target(cuda_context); 80 | _integrals = cuda_form_integrals( 81 | cuda_context, cujit_target, *_form, _cuda_integrals, assembly_kernel_type, 82 | max_threads_per_block, min_blocks_per_multiprocessor, false, NULL, false); 83 | _compiled = true; 84 | } 85 | 86 | /// Copy constructor 87 | CUDAForm(const CUDAForm& form) = delete; 88 | 89 | /// Move constructor 90 | CUDAForm(CUDAForm&& form) = default; 91 | 92 | /// Destructor 93 | virtual ~CUDAForm() = default; 94 | 95 | bool compiled() { return _compiled; } 96 | 97 | bool restricted() { return _restricted_dofmaps.size() > 0; } 98 | 99 | std::map>>& integrals() { 100 | if (!_compiled) { 101 | throw std::runtime_error("Cannot access integrals for uncompiled cuda form!"); 102 | } 103 | return _integrals; 104 | } 105 | 106 | CUDAFormCoefficients& coefficients() { return _coefficients; } 107 | 108 | const CUDAFormConstants& constants() { return _constants; } 109 | 110 | std::shared_ptr unrestricted_dofmap(size_t i) { 111 | if (i >= _form->function_spaces().size()) throw std::runtime_error("Dofmap index out of bounds!"); 112 | return _dofmap_store.get_device_object(_form->function_spaces()[i]->dofmap().get()); 113 | } 114 | 115 | std::shared_ptr dofmap(size_t i) { 116 | if (!restricted()) return unrestricted_dofmap(i); 117 | if (i >= 
_restricted_dofmaps.size()) throw std::runtime_error("Dofmap index out of bounds!"); 118 | return _restricted_dofmaps[i]; 119 | } 120 | 121 | Form* form() { return _form; } 122 | 123 | CUDADirichletBC bc( 124 | const CUDA::Context& cuda_context, size_t i, 125 | std::vector>> bcs) 126 | { 127 | return CUDADirichletBC(cuda_context, *_form->function_spaces()[i], bcs); 128 | } 129 | 130 | /// Copy the coefficient and constant data to the device 131 | /// This can be necessary if either changes on the host 132 | void to_device(const CUDA::Context& cuda_context) 133 | { 134 | _coefficients.copy_coefficients_to_device(cuda_context); 135 | _constants.update_constant_values(); 136 | } 137 | 138 | void set_restriction( 139 | std::vector offsets, 140 | std::vector ghost_offsets, 141 | std::vector>> restriction) 142 | { 143 | if (restriction.size() != _form->function_spaces().size()) { 144 | throw std::runtime_error("Number of restrictions must equal arity of form (1 for vector, 2 for matrix)!"); 145 | } 146 | _restriction = restriction; 147 | if (_restricted_dofmaps.size()) { 148 | // need to update the restriction 149 | for (int i = 0; i < _restricted_dofmaps.size(); i++) { 150 | _restricted_dofmaps[i]->update( 151 | offsets[i], 152 | ghost_offsets[i], 153 | restriction[i].get() 154 | ); 155 | } 156 | } 157 | else { 158 | for (int i = 0; i < restriction.size(); i++) { 159 | _restricted_dofmaps.push_back( 160 | std::make_shared( 161 | _form->function_spaces()[i]->dofmap().get(), 162 | offsets[i], 163 | ghost_offsets[i], 164 | restriction[i].get() 165 | ) 166 | ); 167 | } 168 | } 169 | } 170 | 171 | const std::vector>> get_restriction() 172 | { 173 | return _restriction; 174 | } 175 | 176 | std::shared_ptr restriction_index_map(size_t i) { 177 | std::vector restricted_inds; 178 | for (auto const& pair: *_restriction[i]) restricted_inds.push_back(pair.first); 179 | auto [sub_imap, inds] = dolfinx::common::create_sub_index_map( 180 | *_form->function_spaces()[0]->dofmap()->index_map, 181 | restricted_inds, 182 | dolfinx::common::IndexMapOrder::preserve, false 183 | ); 184 | return std::make_shared(std::move(sub_imap)); 185 | } 186 | 187 | private: 188 | // Cache of CUDADofMaps 189 | common::CUDAStore _dofmap_store; 190 | // Restricted dofmaps 191 | std::vector> _restricted_dofmaps; 192 | // Restriction 193 | std::vector>> _restriction; 194 | // Form coefficients 195 | CUDAFormCoefficients _coefficients; 196 | // Form Constants 197 | CUDAFormConstants _constants; 198 | // Compiled CUDA kernels 199 | std::map>> _integrals; 200 | // CUDA tabulate tensors 201 | std::array>, 4> _cuda_integrals; 202 | bool _compiled; 203 | Form* _form; 204 | ufcx_form* _ufcx_form; 205 | }; 206 | 207 | } // end namespace fem 208 | 209 | } // end namespace dolfinx 210 | -------------------------------------------------------------------------------- /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 
14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 
80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 
150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAMatrix.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "CUDAMatrix.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | 17 | using namespace dolfinx; 18 | using namespace dolfinx::la; 19 | 20 | //----------------------------------------------------------------------------- 21 | CUDAMatrix::CUDAMatrix() 22 | : _A(nullptr) 23 | , _diag() 24 | , _offdiag() 25 | , _dcolmap(0) 26 | , _num_rows() 27 | , _num_columns() 28 | , _local_row_start() 29 | , _local_row_end() 30 | , _num_local_rows() 31 | , _num_local_columns() 32 | , _num_local_offdiag_columns() 33 | { 34 | } 35 | //----------------------------------------------------------------------------- 36 | CUDAMatrix::CUDAMatrix( 37 | const CUDA::Context& cuda_context, 38 | Mat A, 39 | bool page_lock_values, 40 | bool use_seqaijcusparsegetarray) 41 | : _A(A) 42 | , _diag() 43 | , _offdiag() 44 | , _dcolmap(0) 45 | , _num_rows() 46 | , _num_columns() 47 | , _local_row_start() 48 | , _local_row_end() 49 | , _num_local_rows() 50 | , _num_local_columns() 51 | , _num_local_offdiag_columns() 52 | { 53 | PetscErrorCode ierr; 54 | CUresult cuda_err; 55 | const char * cuda_err_description; 56 | 57 | // Check the type of matrix 58 | MatType matrix_type; 59 | ierr = MatGetType(A, &matrix_type); 60 | if (ierr != 0) 61 | la::petsc::error(ierr, __FILE__, "MatGetType"); 62 | 63 | // Get the number of matrix rows and columns 64 | ierr = MatGetSize(A, &_num_rows, &_num_columns); 65 | if (ierr != 0) 66 | la::petsc::error(ierr, __FILE__, "MatGetSize"); 67 | 68 | // Get the number of rows and columns owned by the current MPI process 69 | ierr = MatGetLocalSize(A, &_num_local_rows, &_num_local_columns); 70 | if (ierr != 0) 71 | la::petsc::error(ierr, __FILE__, "MatGetLocalSize"); 72 | 73 | // TODO: We might need to do some additional work to handle non-zero 74 | // local_row_start. 
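// MatGetOwnershipRange() returns the half-open interval [local_row_start,
// local_row_end) of global rows owned by this MPI rank; PETSc AIJ matrices
// distribute contiguous blocks of rows across ranks.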
75 | ierr = MatGetOwnershipRange(A, &_local_row_start, &_local_row_end);
76 | if (ierr != 0)
77 | la::petsc::error(ierr, __FILE__, "MatGetOwnershipRange");
78 | 
79 | if (strcmp(matrix_type, MATSEQAIJ) == 0 ||
80 | strcmp(matrix_type, MATSEQAIJCUSPARSE) == 0)
81 | {
82 | // A non-distributed matrix only has a diagonal part
83 | _diag = std::make_unique<CUDASeqMatrix>(
84 | cuda_context, A, page_lock_values, use_seqaijcusparsegetarray);
85 | } else if (strcmp(matrix_type, MATMPIAIJ) == 0 ||
86 | strcmp(matrix_type, MATMPIAIJCUSPARSE) == 0)
87 | {
88 | // For a distributed matrix, we obtain local diagonal and
89 | // off-diagonal blocks using MatMPIAIJGetSeqAIJ().
90 | Mat Ad, Ao;
91 | const int * colmap;
92 | ierr = MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap);
93 | if (ierr != 0)
94 | la::petsc::error(ierr, __FILE__, "MatMPIAIJGetSeqAIJ");
95 | _diag = std::make_unique<CUDASeqMatrix>(
96 | cuda_context, Ad, page_lock_values, use_seqaijcusparsegetarray);
97 | _offdiag = std::make_unique<CUDASeqMatrix>(
98 | cuda_context, Ao, page_lock_values, use_seqaijcusparsegetarray);
99 | 
100 | // Get the number of columns in the off-diagonal part of the local
101 | // matrix.
102 | ierr = MatGetLocalSize(Ao, NULL, &_num_local_offdiag_columns);
103 | if (ierr != 0)
104 | la::petsc::error(ierr, __FILE__, "MatGetLocalSize");
105 | 
106 | // Convert the column map from global numbering to the
107 | // process-local numbering
108 | ISLocalToGlobalMapping cmapping;
109 | ierr = MatGetLocalToGlobalMapping(A, NULL, &cmapping);
110 | if (ierr != 0)
111 | la::petsc::error(ierr, __FILE__, "MatGetLocalToGlobalMapping");
112 | 
113 | std::vector<PetscInt> colmap_local(_num_local_offdiag_columns);
114 | ierr = ISGlobalToLocalMappingApply(
115 | cmapping, IS_GTOLM_MASK, _num_local_offdiag_columns, colmap,
116 | NULL, colmap_local.data());
117 | if (ierr != 0)
118 | la::petsc::error(ierr, __FILE__, "ISGlobalToLocalMappingApply");
119 | 
120 | // Allocate device-side storage for off-diagonal column map
121 | if (_num_local_offdiag_columns > 0) {
122 | std::vector<std::pair<PetscInt, PetscInt>> combined;
123 | for (int i = 0; i < colmap_local.size(); i++) {
124 | combined.emplace_back(colmap_local[i], i);
125 | }
126 | std::sort(combined.begin(), combined.end(),
127 | [](const std::pair<PetscInt, PetscInt>& a, const std::pair<PetscInt, PetscInt>& b) {
128 | return a.first < b.first;
129 | });
130 | std::vector<PetscInt> colmap_sorted(combined.size());
131 | std::vector<PetscInt> colmap_sorted_indices(combined.size());
132 | 
133 | for (int i = 0; i < combined.size(); i++) {
134 | colmap_sorted[i] = combined[i].first;
135 | colmap_sorted_indices[i] = combined[i].second;
136 | }
137 | 
138 | dolfinx::CUDA::safeVectorCreate(&_dcolmap, colmap_local);
139 | dolfinx::CUDA::safeVectorCreate(&_dcolmap_sorted, colmap_sorted);
140 | dolfinx::CUDA::safeVectorCreate(&_dcolmap_sorted_indices, colmap_sorted_indices);
141 | }
142 | 
143 | } else {
144 | throw std::runtime_error(
145 | "Unsupported matrix type '" + std::string(matrix_type) + "' "
146 | "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
147 | }
148 | }
149 | //-----------------------------------------------------------------------------
150 | CUDAMatrix::~CUDAMatrix()
151 | {
152 | }
153 | //-----------------------------------------------------------------------------
154 | CUDAMatrix::CUDAMatrix(CUDAMatrix&& matrix)
155 | : _A(matrix._A)
156 | , _diag(std::move(matrix._diag))
157 | , _offdiag(std::move(matrix._offdiag))
158 | , _dcolmap(matrix._dcolmap)
159 | , _num_rows(matrix._num_rows)
160 | , _num_columns(matrix._num_columns)
161 | , _local_row_start(matrix._local_row_start)
162 | ,
_local_row_end(matrix._local_row_end) 163 | , _num_local_rows(matrix._num_local_rows) 164 | , _num_local_columns(matrix._num_local_columns) 165 | , _num_local_offdiag_columns(matrix._num_local_offdiag_columns) 166 | { 167 | matrix._A = nullptr; 168 | matrix._diag = nullptr; 169 | matrix._offdiag = nullptr; 170 | matrix._dcolmap = 0; 171 | matrix._num_rows = 0; 172 | matrix._num_columns = 0; 173 | matrix._local_row_start = 0; 174 | matrix._local_row_end = 0; 175 | matrix._num_local_rows = 0; 176 | matrix._num_local_columns = 0; 177 | matrix._num_local_offdiag_columns = 0; 178 | } 179 | //----------------------------------------------------------------------------- 180 | CUDAMatrix& CUDAMatrix::operator=(CUDAMatrix&& matrix) 181 | { 182 | _A = matrix._A; 183 | _diag = std::move(matrix._diag); 184 | _offdiag = std::move(matrix._offdiag); 185 | _dcolmap = matrix._dcolmap; 186 | _num_rows = matrix._num_rows; 187 | _num_columns = matrix._num_columns; 188 | _local_row_start = matrix._local_row_start; 189 | _local_row_end = matrix._local_row_end; 190 | _num_local_rows = matrix._num_local_rows; 191 | _num_local_columns = matrix._num_local_columns; 192 | _num_local_offdiag_columns = matrix._num_local_offdiag_columns; 193 | matrix._A = nullptr; 194 | matrix._diag = nullptr; 195 | matrix._offdiag = nullptr; 196 | matrix._dcolmap = 0; 197 | matrix._num_rows = 0; 198 | matrix._num_columns = 0; 199 | matrix._local_row_start = 0; 200 | matrix._local_row_end = 0; 201 | matrix._num_local_rows = 0; 202 | matrix._num_local_columns = 0; 203 | matrix._num_local_offdiag_columns = 0; 204 | return *this; 205 | } 206 | //----------------------------------------------------------------------------- 207 | void CUDAMatrix::copy_matrix_values_to_host( 208 | const CUDA::Context& cuda_context) 209 | { 210 | if (_diag) 211 | _diag->copy_matrix_values_to_host(cuda_context); 212 | if (_offdiag) 213 | _offdiag->copy_matrix_values_to_host(cuda_context); 214 | } 215 | //----------------------------------------------------------------------------- 216 | void CUDAMatrix::apply(MatAssemblyType type) 217 | { 218 | PetscErrorCode ierr; 219 | ierr = MatAssemblyBegin(_A, type); 220 | if (ierr != 0) 221 | petsc::error(ierr, __FILE__, "MatAssemblyBegin"); 222 | ierr = MatAssemblyEnd(_A, type); 223 | if (ierr != 0) 224 | petsc::error(ierr, __FILE__, "MatAssemblyEnd"); 225 | } 226 | //----------------------------------------------------------------------------- 227 | void CUDAMatrix::debug_dump() 228 | { 229 | if (_diag) { 230 | std::cout << "Dumping diag matrix." << std::endl; 231 | _diag->debug_dump(); 232 | } 233 | if (_offdiag) { 234 | std::cout << "Dumping offdiag matrix." 
<< std::endl; 235 | _offdiag->debug_dump(); 236 | } 237 | } 238 | 239 | -------------------------------------------------------------------------------- /python/cudolfinx/form.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import collections 8 | from cudolfinx.context import get_cuda_context 9 | from cudolfinx import cpp as _cucpp, jit 10 | from dolfinx import fem as fe 11 | from dolfinx import cpp as _cpp 12 | import functools 13 | import numpy as np 14 | import typing 15 | import ufl 16 | 17 | class CUDAForm: 18 | """CUDA wrapper class for a dolfinx.fem.Form 19 | """ 20 | 21 | def __init__(self, form: fe.Form): 22 | """Initialize the wrapper 23 | """ 24 | 25 | self._ctx = get_cuda_context() 26 | self._cuda_mesh = _create_mesh_on_device(form.mesh) 27 | 28 | self._dolfinx_form = form 29 | self._wrapped_tabulate_tensors, self._integral_tensor_indices = jit.get_wrapped_tabulate_tensors(form) 30 | ufcx_form_addr = form.module.ffi.cast("uintptr_t", form.module.ffi.addressof(form.ufcx_form)) 31 | 32 | cpp_form = form._cpp_object 33 | if type(cpp_form) is _cpp.fem.Form_float32: 34 | form_cls = _cucpp.fem.CUDAForm_float32 35 | elif type(cpp_form) is _cpp.fem.Form_float64: 36 | form_cls = _cucpp.fem.CUDAForm_float64 37 | else: 38 | raise ValueError(f"Cannot instantiate CUDAForm for Form of type {type(cpp_form)}!") 39 | 40 | _tabulate_tensor_names = [] 41 | _tabulate_tensor_sources = [] 42 | for name, source in self._wrapped_tabulate_tensors: 43 | _tabulate_tensor_names.append(name) 44 | _tabulate_tensor_sources.append(source) 45 | 46 | self._cuda_form = form_cls( 47 | self._ctx, 48 | cpp_form, 49 | ufcx_form_addr, 50 | _tabulate_tensor_names, 51 | _tabulate_tensor_sources, 52 | self._integral_tensor_indices 53 | ) 54 | 55 | # TODO expose these parameters to the user 56 | self._cuda_form.compile(self._ctx, max_threads_per_block=1024, min_blocks_per_multiprocessor=1) 57 | 58 | def to_device(self): 59 | """Copy host-side coefficients and constants to the device 60 | """ 61 | 62 | self._cuda_form.to_device(self._ctx) 63 | 64 | @property 65 | def cuda_form(self): 66 | """Return the underlying cpp CUDAForm 67 | """ 68 | 69 | return self._cuda_form 70 | 71 | @property 72 | def cuda_mesh(self): 73 | """Return the underlying cpp CUDAMesh""" 74 | 75 | return self._cuda_mesh 76 | 77 | @property 78 | def dolfinx_form(self): 79 | """Return the underlying Dolfinx form 80 | """ 81 | 82 | return self._dolfinx_form 83 | 84 | @property 85 | def function_spaces(self): 86 | """Return a list of FunctionSpaces corresponding to the form 87 | """ 88 | 89 | return self._dolfinx_form.function_spaces 90 | 91 | class BlockCUDAForm: 92 | """Data structure containing multiple CUDA forms to be used in block assembly.""" 93 | 94 | def __init__( 95 | self, forms: typing.Union[list[CUDAForm], list[list[CUDAForm]]], 96 | restrictions: typing.Optional[ 97 | typing.Union[ 98 | list[np.typing.NDArray[np.int32]], 99 | tuple[list[np.typing.NDArray[np.int32]], list[np.typing.NDArray[np.int32]]] 100 | ]] = None 101 | ): 102 | """Initialize the data structure.""" 103 | 104 | self._forms = forms 105 | self._restrictions = restrictions 106 | 107 | if not len(forms): raise ValueError("Must provide at least one form!") 108 | if type(forms[0]) is CUDAForm: self._init_vector() 109 | else: self._init_matrix() 110 | 111 | def _get_restriction_offsets(self, forms, 
restrictions=None):
112 | """Get a list of offsets and restriction indices."""
113 | 
114 | offset = 0
115 | ghost_offset = 0
116 | ghost_offsets = [ghost_offset]
117 | offsets = [offset]
118 | restriction_inds_list = []
119 | local_sizes = []
120 | for i, form in enumerate(forms):
121 | dofmap = form.function_spaces[0].dofmap
122 | local_size = dofmap.index_map.size_local
123 | if restrictions is not None:
124 | restriction_inds = restrictions[i]
125 | local_size = len(restriction_inds[restriction_inds < local_size])
126 | num_ghosts = len(restriction_inds) - local_size
127 | else:
128 | num_ghosts = dofmap.index_map.num_ghosts
129 | restriction_inds = np.arange(local_size+num_ghosts, dtype=np.int32)
130 | offset += local_size * dofmap.index_map_bs
131 | ghost_offset += num_ghosts * dofmap.index_map_bs
132 | offsets.append(offset)
133 | ghost_offsets.append(ghost_offset)
134 | local_sizes.append(local_size*dofmap.index_map_bs)
135 | restriction_inds_list.append(restriction_inds)
136 | # create offsets that can be added directly to the local index of each ghost;
137 | # we subtract the local size because the CUDADofMap does not know how many
138 | # restricted dofs are actually local
139 | # TODO: reimplement RestrictedDofMap from multiphenicsx instead of all this dancing around
140 | ghost_offsets = [offsets[-1] + ghost_offset - local_size for ghost_offset,local_size in zip(ghost_offsets, local_sizes)]
141 | return restriction_inds_list, offsets, ghost_offsets
142 | 
143 | 
144 | def _init_vector(self):
145 | """Initialize vector form."""
146 | 
147 | # don't need ghost offsets for vector assembly
148 | restriction_inds_list, self._offsets, ghost_offsets = self._get_restriction_offsets(
149 | self._forms, self._restrictions)
150 | for form, offset, ghost_offset, restriction_inds in zip(self._forms, self._offsets, ghost_offsets, restriction_inds_list):
151 | form.cuda_form.set_restriction(
152 | [offset], [ghost_offset], [restriction_inds]
153 | )
154 | 
155 | comm = self._forms[0].dolfinx_form.mesh.comm
156 | self._global_size = comm.allreduce(self._offsets[-1])
157 | 
158 | def _init_matrix(self):
159 | """Initialize matrix form."""
160 | 
161 | row_forms = [row[0] for row in self._forms]
162 | col_forms = self._forms[0]
163 | 
164 | row_restrictions, row_offsets, row_ghost_offsets = self._get_restriction_offsets(
165 | row_forms, self._restrictions[0] if self._restrictions is not None else None
166 | )
167 | 
168 | col_restrictions, col_offsets, col_ghost_offsets = self._get_restriction_offsets(
169 | col_forms, self._restrictions[1] if self._restrictions is not None else None
170 | )
171 | 
172 | # restrict forms appropriately
173 | for i, row in enumerate(self._forms):
174 | for j, form in enumerate(row):
175 | form.cuda_form.set_restriction(
176 | [row_offsets[i], col_offsets[j]],
177 | [row_ghost_offsets[i], col_ghost_offsets[j]],
178 | [row_restrictions[i], col_restrictions[j]]
179 | )
180 | 
181 | @property
182 | def forms(self):
183 | """Return the list of forms."""
184 | 
185 | return self._forms
186 | 
187 | @property
188 | def dolfinx_forms(self):
189 | """Return list of underlying dolfinx forms."""
190 | 
191 | return [f.dolfinx_form for f in self._forms]
192 | 
193 | @property
194 | def offsets(self):
195 | """Return list of offsets."""
196 | 
197 | return self._offsets
198 | 
199 | @property
200 | def local_size(self):
201 | """Return size of local vector."""
202 | 
203 | return self._offsets[-1]
204 | 
205 | @property
206 | def global_size(self):
207 | """Return size of global vector."""
vector.""" 208 | 209 | return self._global_size 210 | 211 | def form( 212 | form: typing.Union[ufl.Form, typing.Iterable[ufl.Form]], 213 | restriction: typing.Optional[typing.Iterable[np.typing.NDArray[np.int32]]] = None, 214 | **kwargs): 215 | """Create a CUDAForm from a ufl form.""" 216 | 217 | def _create_form(form): 218 | """Recursively convert ufl.Forms to CUDAForm.""" 219 | 220 | if isinstance(form, ufl.Form): 221 | dolfinx_form = fe.form(form, **kwargs) 222 | return CUDAForm(dolfinx_form) 223 | elif isinstance(form, collections.abc.Iterable): 224 | return list(map(lambda sub_form: _create_form(sub_form), form)) 225 | else: 226 | raise TypeError("Expected form to be a ufl.Form or an iterable, got type '{type(form)}'!") 227 | 228 | cuda_form = _create_form(form) 229 | # TODO: properly handle restriction for a single form 230 | if isinstance(form, collections.abc.Iterable): 231 | return BlockCUDAForm(cuda_form, restriction) 232 | else: return cuda_form 233 | 234 | # Cache this so we don't create multiple copies of the same CUDAMesh 235 | @functools.cache 236 | def _create_mesh_on_device(cpp_mesh: typing.Union[_cpp.mesh.Mesh_float32, _cpp.mesh.Mesh_float64]): 237 | """Create device-side mesh data 238 | """ 239 | 240 | if type(cpp_mesh) is _cpp.mesh.Mesh_float32: 241 | return _cucpp.fem.CUDAMesh_float32(cpp_mesh) 242 | elif type(cpp_mesh) is _cpp.mesh.Mesh_float64: 243 | return _cucpp.fem.CUDAMesh_float64(cpp_mesh) 244 | else: 245 | raise ValueError(f"Cannot instantiate CUDAMesh for Mesh of type {type(cpp_mesh)}!") 246 | 247 | -------------------------------------------------------------------------------- /docker/Dockerfile.test-env: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/nvhpc:24.9-devel-cuda12.6-ubuntu24.04 as cudolfinx-dev-env 2 | 3 | ARG ADIOS2_VERSION=2.10.2 4 | ARG DOXYGEN_VERSION=1_13_2 5 | ARG GMSH_VERSION=4_13_1 6 | ARG HDF5_VERSION=1.14.6 7 | ARG KAHIP_VERSION=3.18 8 | # NOTE: The NumPy version (https://pypi.org/project/numpy/#history) 9 | # should be pinned to the most recent NumPy release that is supported by 10 | # the most recent Numba release, see 11 | # https://numba.readthedocs.io/en/stable/user/installing.html#version-support-information 12 | ARG NUMPY_VERSION=2.1.3 13 | ARG PETSC_VERSION=3.22.4 14 | ARG SLEPC_VERSION=3.22.2 15 | ARG SPDLOG_VERSION=1.15.1 16 | 17 | ARG MPICH_VERSION=4.2.3 18 | ARG OPENMPI_SERIES=5.0 19 | ARG OPENMPI_PATCH=7 20 | 21 | ######################################## 22 | 23 | LABEL maintainer="Benjamin Pachev " 24 | LABEL description="Modified FEniCS dev environment with CUDA PETSc installed." 25 | 26 | ARG ADIOS2_VERSION 27 | ARG DOXYGEN_VERSION 28 | ARG GMSH_VERSION 29 | ARG HDF5_VERSION 30 | ARG KAHIP_VERSION 31 | ARG PETSC_VERSION 32 | ARG SLEPC_VERSION 33 | ARG SPDLOG_VERSION 34 | ARG NUMPY_VERSION 35 | ARG MPICH_VERSION 36 | ARG OPENMPI_SERIES 37 | ARG OPENMPI_PATCH 38 | 39 | # The following ARGS are used in the dev-env layer. 40 | # They are safe defaults. They can be overridden by the user. 41 | # Compiler optimisation flags for SLEPc and PETSc, all languages. 42 | ARG PETSC_SLEPC_OPTFLAGS="-O2" 43 | # Turn on PETSc and SLEPc debugging. "yes" or "no". 44 | ARG PETSC_SLEPC_DEBUGGING="no" 45 | 46 | # MPI variant. "mpich" or "openmpi". 
47 | ARG MPI="openmpi" 48 | 49 | # Number of build threads to use with make 50 | ARG BUILD_NP=4 51 | 52 | WORKDIR /tmp 53 | 54 | # Environment variables 55 | ENV OPENBLAS_NUM_THREADS=1 \ 56 | OPENBLAS_VERBOSE=0 57 | 58 | # Install dependencies available via apt-get. 59 | # - First set of packages are required to build and run FEniCS. 60 | # - Second set of packages are recommended and/or required to build 61 | # documentation or tests. 62 | # - Third set of packages are optional, but required to run gmsh 63 | # pre-built binaries. 64 | RUN export DEBIAN_FRONTEND=noninteractive && \ 65 | apt-get -qq update && \ 66 | apt-get -yq --with-new-pkgs -o Dpkg::Options::="--force-confold" upgrade && \ 67 | apt-get -y install \ 68 | cmake \ 69 | g++ \ 70 | gfortran \ 71 | libboost-dev \ 72 | liblapack-dev \ 73 | libopenblas-dev \ 74 | libpugixml-dev \ 75 | ninja-build \ 76 | pkg-config \ 77 | python3-dev \ 78 | python3-pip \ 79 | python3-venv && \ 80 | # 81 | apt-get -y install \ 82 | catch2 \ 83 | git \ 84 | graphviz \ 85 | libeigen3-dev \ 86 | valgrind \ 87 | wget && \ 88 | # 89 | apt-get -y install \ 90 | libglu1 \ 91 | libxcursor-dev \ 92 | libxft2 \ 93 | libxinerama1 \ 94 | libfltk1.3-dev \ 95 | libfreetype6-dev \ 96 | libgl1-mesa-dev \ 97 | libocct-foundation-dev \ 98 | libocct-data-exchange-dev && \ 99 | apt-get clean && \ 100 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 101 | 102 | # Install spdlog from source - Ubuntu version is incompatible with CUDA 12. 103 | RUN wget -nc --quiet https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz && \ 104 | tar xfz v${SPDLOG_VERSION}.tar.gz && \ 105 | cd spdlog-${SPDLOG_VERSION} && \ 106 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DSPDLOG_BUILD_SHARED=ON -DSPDLOG_BUILD_PIC=ON -B build-dir . && \ 107 | cmake --build build-dir && \ 108 | cmake --install build-dir && \ 109 | rm -rf /tmp/* 110 | 111 | # Install Doxygen 112 | RUN apt-get -qq update && \ 113 | apt-get -y install bison flex && \ 114 | wget -nc --quiet https://github.com/doxygen/doxygen/archive/refs/tags/Release_${DOXYGEN_VERSION}.tar.gz && \ 115 | tar xfz Release_${DOXYGEN_VERSION}.tar.gz && \ 116 | cd doxygen-Release_${DOXYGEN_VERSION} && \ 117 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -B build-dir . 
&& \ 118 | cmake --build build-dir && \ 119 | cmake --install build-dir && \ 120 | apt-get -y purge bison flex && \ 121 | apt-get -y autoremove && \ 122 | apt-get clean && \ 123 | rm -rf /tmp/* 124 | 125 | # Install MPI 126 | RUN if [ "$MPI" = "mpich" ]; then \ 127 | wget https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \ 128 | tar xfz mpich-${MPICH_VERSION}.tar.gz && \ 129 | cd mpich-${MPICH_VERSION} && \ 130 | ./configure && \ 131 | make -j${BUILD_NP} install; \ 132 | else \ 133 | wget https://download.open-mpi.org/release/open-mpi/v${OPENMPI_SERIES}/openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH}.tar.gz && \ 134 | tar xfz openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH}.tar.gz && \ 135 | cd openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH} && \ 136 | ./configure && \ 137 | make -j${BUILD_NP} install; \ 138 | fi && \ 139 | ldconfig && \ 140 | rm -rf /tmp/* 141 | 142 | ENV VIRTUAL_ENV=/dolfinx-env 143 | ENV PATH=/dolfinx-env/bin:$PATH 144 | RUN python3 -m venv ${VIRTUAL_ENV} 145 | 146 | # Install Python packages (via pip) 147 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ 148 | pip install --no-cache-dir cython numpy==${NUMPY_VERSION} && \ 149 | CFLAGS="-noswitcherror" pip install --no-cache-dir mpi4py 150 | 151 | # Install KaHIP 152 | RUN wget -nc --quiet https://github.com/kahip/kahip/archive/v${KAHIP_VERSION}.tar.gz && \ 153 | tar -xf v${KAHIP_VERSION}.tar.gz && \ 154 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DNONATIVEOPTIMIZATIONS=on -B build-dir -S KaHIP-${KAHIP_VERSION} && \ 155 | cmake --build build-dir && \ 156 | cmake --install build-dir && \ 157 | rm -rf /tmp/* 158 | 159 | # Install HDF5 160 | # Note: HDF5 CMake install has numerous bugs and inconsistencies. Test carefully. 161 | # HDF5 overrides CMAKE_INSTALL_PREFIX by default, hence it is set 162 | # below to ensure that HDF5 is installed into a path where it can be 163 | # found. 
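# Note: -DHDF5_ENABLE_PARALLEL=on builds HDF5 against whichever MPI
# implementation was installed above, so this step must come after the MPI
# layer.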
164 | RUN wget -nc --quiet https://github.com/HDFGroup/hdf5/archive/refs/tags/hdf5_${HDF5_VERSION}.tar.gz && \ 165 | tar xfz hdf5_${HDF5_VERSION}.tar.gz && \ 166 | cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_BUILD_TYPE=Release -DHDF5_ENABLE_PARALLEL=on -DHDF5_ENABLE_Z_LIB_SUPPORT=on -B build-dir -S hdf5-hdf5_${HDF5_VERSION} && \ 167 | cmake --build build-dir && \ 168 | cmake --install build-dir && \ 169 | rm -rf /tmp/* 170 | 171 | # Install ADIOS2 (Python interface in /usr/local/lib), same as GMSH 172 | RUN wget -nc --quiet https://github.com/ornladios/ADIOS2/archive/v${ADIOS2_VERSION}.tar.gz -O adios2-v${ADIOS2_VERSION}.tar.gz && \ 173 | mkdir -p adios2-v${ADIOS2_VERSION} && \ 174 | tar -xf adios2-v${ADIOS2_VERSION}.tar.gz -C adios2-v${ADIOS2_VERSION} --strip-components 1 && \ 175 | cmake -G Ninja -DADIOS2_USE_HDF5=on -DCMAKE_INSTALL_PYTHONDIR=/usr/local/lib/ -DADIOS2_USE_Fortran=off -DBUILD_TESTING=off -DADIOS2_BUILD_EXAMPLES=off -DADIOS2_USE_ZeroMQ=off -B build-dir -S ./adios2-v${ADIOS2_VERSION} && \ 176 | cmake --build build-dir && \ 177 | cmake --install build-dir && \ 178 | rm -rf /tmp/* 179 | 180 | # Install GMSH 181 | RUN git clone -b gmsh_${GMSH_VERSION} --single-branch --depth 1 https://gitlab.onelab.info/gmsh/gmsh.git && \ 182 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DENABLE_BUILD_DYNAMIC=1 -DENABLE_OPENMP=1 -B build-dir -S gmsh && \ 183 | cmake --build build-dir && \ 184 | cmake --install build-dir && \ 185 | rm -rf /tmp/* 186 | 187 | # GMSH installs python library in /usr/local/lib, see: https://gitlab.onelab.info/gmsh/gmsh/-/issues/1414 188 | ENV PYTHONPATH=/usr/local/lib:$PYTHONPATH 189 | 190 | # Install PETSc and petsc4py with real and complex types 191 | ENV PETSC_DIR=/usr/local/petsc SLEPC_DIR=/usr/local/slepc 192 | RUN ln -sf /opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/libcuda.so /opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/libcuda.so.1 193 | RUN apt-get -qq update && \ 194 | apt-get -y install bison flex && \ 195 | git clone --depth=1 -b v${PETSC_VERSION} https://gitlab.com/petsc/petsc.git ${PETSC_DIR} && \ 196 | cd ${PETSC_DIR} && \ 197 | # Real64, 32-bit int with CUDA 198 | LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/ ./configure \ 199 | PETSC_ARCH=linux-gnu-real64-32-cuda \ 200 | --COPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 201 | --CXXOPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 202 | --FOPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 203 | --with-64-bit-indices=no \ 204 | --with-debugging=${PETSC_SLEPC_DEBUGGING} \ 205 | --with-fortran-bindings=no \ 206 | --with-shared-libraries \ 207 | --download-hypre \ 208 | --download-metis \ 209 | --download-mumps-avoid-mpi-in-place \ 210 | --download-mumps \ 211 | --download-ptscotch \ 212 | --download-scalapack \ 213 | --with-cuda\ 214 | --download-spai \ 215 | --download-suitesparse \ 216 | --with-scalar-type=real \ 217 | --with-precision=double && \ 218 | make PETSC_ARCH=linux-gnu-real64-32-cuda ${MAKEFLAGS} all 219 | 220 | # Install petsc4py 221 | RUN cd ${PETSC_DIR}/src/binding/petsc4py && \ 222 | PETSC_ARCH=linux-gnu-real64-32-cuda CFLAGS="-noswitcherror" pip -v install --no-cache-dir --no-build-isolation . 
&& \
223 | # Cleanup
224 | apt-get -y purge bison flex && \
225 | apt-get -y autoremove && \
226 | apt-get clean && \
227 | rm -rf \
228 | ${PETSC_DIR}/**/tests/ \
229 | ${PETSC_DIR}/**/obj/ \
230 | ${PETSC_DIR}/**/externalpackages/ \
231 | ${PETSC_DIR}/CTAGS \
232 | ${PETSC_DIR}/RDict.log \
233 | ${PETSC_DIR}/TAGS \
234 | ${PETSC_DIR}/docs/ \
235 | ${PETSC_DIR}/share/ \
236 | ${PETSC_DIR}/src/ \
237 | ${PETSC_DIR}/systems/ && \
238 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
239 | 
240 | WORKDIR /root
241 | 
242 | 
-------------------------------------------------------------------------------- /cpp/cudolfinx/CMakeLists.txt: --------------------------------------------------------------------------------
1 | # ------------------------------------------------------------------------------
2 | include(GNUInstallDirs)
3 | 
4 | # ------------------------------------------------------------------------------
5 | # Declare the library (target)
6 | add_library(cudolfinx)
7 | 
8 | # ------------------------------------------------------------------------------
9 | # Add source files to the target
10 | set(CUDOLFINX_DIRS
11 | common
12 | fem
13 | la
14 | mesh
15 | )
16 | 
17 | # Add source to the cudolfinx target, and get sets of header files
18 | foreach(DIR ${CUDOLFINX_DIRS})
19 | add_subdirectory(${DIR})
20 | endforeach()
21 | 
22 | # Set target include location (for build and installed)
23 | target_include_directories(
24 | cudolfinx
25 | PUBLIC
26 | $
27 | "$"
28 | )
29 | 
30 | # ------------------------------------------------------------------------------
31 | # Configure the common/version.h file
32 | configure_file(
33 | ${CMAKE_CURRENT_SOURCE_DIR}/common/version.h.in common/version.h @ONLY
34 | )
35 | 
36 | # ------------------------------------------------------------------------------
37 | # Set target properties
38 | set_target_properties(
39 | cudolfinx
40 | PROPERTIES VERSION ${CUDOLFINX_VERSION}
41 | SOVERSION ${CUDOLFINX_VERSION_MAJOR}.${CUDOLFINX_VERSION_MINOR}
42 | )
43 | 
44 | # Add git revision flag to the one affected file
45 | #set_source_files_properties(
46 | # common/defines.cpp
47 | # PROPERTIES
48 | # COMPILE_DEFINITIONS
49 | # "UFCX_SIGNATURE=\"${UFCX_SIGNATURE}\";CUDOLFINX_GIT_COMMIT_HASH=\"${GIT_COMMIT_HASH}\""
50 | #)
51 | 
52 | # ------------------------------------------------------------------------------
53 | # Set compiler options and definitions
54 | 
55 | # Set 'Developer' build type flags
56 | target_compile_options(
57 | cudolfinx PRIVATE $<$<CONFIG:Developer>:${CUDOLFINX_CXX_DEVELOPER_FLAGS}>
58 | )
59 | 
60 | # Add version to definitions (public)
61 | target_compile_definitions(cudolfinx PUBLIC CUDOLFINX_VERSION="${CUDOLFINX_VERSION}")
62 | 
63 | # ------------------------------------------------------------------------------
64 | # Add include directories and libraries of required packages
65 | 
66 | # UFCx
67 | if(TARGET ufcx::ufcx)
68 | target_link_libraries(cudolfinx PUBLIC ufcx::ufcx)
69 | else()
70 | target_include_directories(cudolfinx SYSTEM PUBLIC ${UFCX_INCLUDE_DIRS})
71 | endif()
72 | 
73 | # Basix
74 | target_link_libraries(cudolfinx PUBLIC Basix::basix)
75 | 
76 | # Boost
77 | target_link_libraries(cudolfinx PUBLIC Boost::headers)
78 | target_link_libraries(cudolfinx PUBLIC Boost::timer)
79 | 
80 | # MPI
81 | target_link_libraries(cudolfinx PUBLIC MPI::MPI_CXX)
82 | 
83 | # HDF5
84 | target_link_libraries(cudolfinx PUBLIC hdf5::hdf5)
85 | 
86 | # CUDA Toolkit
87 | target_link_libraries(cudolfinx PRIVATE CUDA::cuda_driver CUDA::nvrtc CUDA::cupti)
88 | 
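# The driver API (CUDA::cuda_driver) and NVRTC (CUDA::nvrtc) are required
# because assembly kernels are generated and compiled at run time (see
# fem/CUDAForm.h); CUPTI is presumably linked for profiling instrumentation.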
target_include_directories(cudolfinx SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 89 | 90 | # Dolfinx 91 | target_link_libraries(cudolfinx PUBLIC dolfinx) 92 | 93 | # ------------------------------------------------------------------------------ 94 | # Optional packages 95 | 96 | # PETSc 97 | if(CUDOLFINX_ENABLE_PETSC AND PETSC_FOUND) 98 | target_link_libraries(cudolfinx PUBLIC PkgConfig::PETSC) 99 | target_compile_definitions(cudolfinx PUBLIC HAS_PETSC) 100 | endif() 101 | 102 | 103 | # ------------------------------------------------------------------------------ 104 | # Install cudolfinx library and header files 105 | install( 106 | TARGETS cudolfinx 107 | EXPORT CUDOLFINXTargets 108 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT RuntimeExecutables 109 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT RuntimeLibraries 110 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT Development 111 | ) 112 | 113 | # Generate CUDOLFINXTargets.cmake 114 | install(EXPORT CUDOLFINXTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx) 115 | 116 | # Install the header files 117 | install( 118 | FILES cudolfinx.h 119 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 120 | COMPONENT Development 121 | ) 122 | 123 | foreach(DIR ${CUDOLFINX_DIRS}) 124 | install( 125 | FILES ${HEADERS_${DIR}} 126 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudolfinx/${DIR} 127 | COMPONENT Development 128 | ) 129 | endforeach() 130 | 131 | install( 132 | FILES ${CMAKE_CURRENT_BINARY_DIR}/common/version.h 133 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudolfinx/common 134 | COMPONENT Development 135 | ) 136 | 137 | # ------------------------------------------------------------------------------ 138 | # Generate CMake config files (CUDOLFINXConfig{,Version}.cmake) 139 | include(CMakePackageConfigHelpers) 140 | write_basic_package_version_file( 141 | ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfigVersion.cmake 142 | VERSION ${CUDOLFINX_VERSION} 143 | COMPATIBILITY AnyNewerVersion 144 | ) 145 | 146 | configure_package_config_file( 147 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/CUDOLFINXConfig.cmake.in 148 | ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfig.cmake 149 | INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx 150 | ) 151 | 152 | # Install CMake helper files 153 | install( 154 | FILES ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfig.cmake 155 | ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfigVersion.cmake 156 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx 157 | COMPONENT Development 158 | ) 159 | 160 | # ------------------------------------------------------------------------------ 161 | # Generate pkg-config file and install it 162 | 163 | # Define packages that should be required by pkg-config file 164 | set(PKG_REQUIRES "") 165 | 166 | # Get link libraries and includes 167 | get_target_property( 168 | PKGCONFIG_CUDOLFINX_TARGET_LINK_LIBRARIES cudolfinx INTERFACE_LINK_LIBRARIES 169 | ) 170 | get_target_property( 171 | PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES cudolfinx 172 | INTERFACE_SYSTEM_INCLUDE_DIRECTORIES 173 | ) 174 | 175 | # Add imported targets to lists for creating pkg-config file 176 | set(PKGCONFIG_CUDOLFINX_LIBS) 177 | 178 | foreach(_target ${PKGCONFIG_CUDOLFINX_TARGET_LINK_LIBRARIES}) 179 | if("${_target}" MATCHES "^[^<>]+$") # Skip "$", which we get with 180 | # static libs 181 | if("${_target}" MATCHES "^.*::.*$") 182 | # Get include paths 183 | get_target_property(_inc_dirs ${_target} INTERFACE_INCLUDE_DIRECTORIES) 184 | 185 | if(_inc_dirs) 186 | list(APPEND 
PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES ${_inc_dirs})
187 | endif()
188 | 
189 | # Get libraries
190 | get_target_property(_libs ${_target} INTERFACE_LINK_LIBRARIES)
191 | 
192 | if(_libs)
193 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_libs})
194 | endif()
195 | 
196 | else()
197 | # 'regular' libs, i.e. not imported targets
198 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_target})
199 | endif()
200 | 
201 | # Special handling for compiled Boost imported targets
202 | if(("${_target}" MATCHES "^.*Boost::.*$") AND NOT "${_target}" STREQUAL
203 | "Boost::headers"
204 | )
205 | get_target_property(_libs ${_target} IMPORTED_LOCATION_RELEASE)
206 | 
207 | if(_libs)
208 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_libs})
209 | endif()
210 | endif()
211 | endif()
212 | endforeach()
213 | 
214 | # Join include lists and remove duplicates
215 | list(REMOVE_DUPLICATES PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES)
216 | list(REMOVE_DUPLICATES PKGCONFIG_CUDOLFINX_LIBS)
217 | 
218 | # Convert include dirs to -I form
219 | foreach(_inc_dir ${PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES})
220 | set(PKG_INCLUDES "-I${_inc_dir} ${PKG_INCLUDES}")
221 | endforeach()
222 | 
223 | # Get cudolfinx definitions
224 | get_target_property(
225 | PKG_CUDOLFINX_DEFINITIONS cudolfinx INTERFACE_COMPILE_DEFINITIONS
226 | )
227 | set(PKG_DEFINITIONS)
228 | 
229 | foreach(_def ${PKG_CUDOLFINX_DEFINITIONS})
230 | set(PKG_DEFINITIONS "${PKG_DEFINITIONS} -D${_def}")
231 | endforeach()
232 | 
233 | # Get basix definitions (this is required to propagate Basix definition to the
234 | # pkg-config file, in the future Basix should create its own basix.pc file, see
235 | # https://github.com/FEniCS/basix/issues/204)
236 | get_target_property(
237 | PKG_BASIX_DEFINITIONS Basix::basix INTERFACE_COMPILE_DEFINITIONS
238 | )
239 | 
240 | foreach(_def ${PKG_BASIX_DEFINITIONS})
241 | set(PKG_DEFINITIONS "${PKG_DEFINITIONS} -D${_def}")
242 | endforeach()
243 | 
244 | # Convert compiler flags and definitions into space separated strings
245 | string(REPLACE ";" " " PKG_CXXFLAGS "${CMAKE_CXX_FLAGS}")
246 | string(REPLACE ";" " " PKG_LINKFLAGS "${CMAKE_EXE_LINKER_FLAGS}")
247 | 
248 | # Convert libraries to -L -l form
249 | foreach(_lib ${PKGCONFIG_CUDOLFINX_LIBS})
250 | # Add -Wl,option directives
251 | if("${_lib}" MATCHES "-Wl,[^ ]*")
252 | set(PKG_LINKFLAGS "${_lib} ${PKG_LINKFLAGS}")
253 | else()
254 | get_filename_component(_path ${_lib} DIRECTORY)
255 | get_filename_component(_name ${_lib} NAME_WE)
256 | string(REPLACE "lib" "" _name "${_name}")
257 | 
258 | # Add libraries that match the form -L -l
259 | if(NOT "${_path}" STREQUAL "")
260 | set(PKG_LINKFLAGS "-L${_path} -l${_name} ${PKG_LINKFLAGS}")
261 | endif()
262 | endif()
263 | endforeach()
264 | 
265 | # Remove duplicated link flags
266 | separate_arguments(PKG_LINKFLAGS)
267 | list(REMOVE_DUPLICATES PKG_LINKFLAGS)
268 | string(REPLACE ";" " " PKG_LINKFLAGS "${PKG_LINKFLAGS}")
269 | 
270 | # Add additional link flags
271 | foreach(_linkflag ${CUDOLFINX_LINK_FLAGS})
272 | set(PKG_LINKFLAGS "${PKG_LINKFLAGS} ${_linkflag}")
273 | endforeach()
274 | 
275 | # Boost include dir (used as pkg-config variable)
276 | get_target_property(
277 | BOOST_INCLUDE_DIR Boost::headers INTERFACE_INCLUDE_DIRECTORIES
278 | )
279 | 
280 | # Configure and install pkg-config file
281 | configure_file(
282 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/cudolfinx.pc.in
283 | ${CMAKE_BINARY_DIR}/cudolfinx.pc @ONLY
284 | )
285 | install(
286 | FILES ${CMAKE_BINARY_DIR}/cudolfinx.pc
287 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig
288 | 

# ------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/cpp/cudolfinx/mesh/util.cpp:
--------------------------------------------------------------------------------
#include <cudolfinx/mesh/util.h>
#include <dolfinx/fem/utils.h>
#include <algorithm>
#include <numeric>

using namespace dolfinx;

std::vector<std::int32_t> dolfinx::mesh::ghost_exterior_facet_indices(std::shared_ptr<mesh::Topology> topology)
{
  const int tdim = topology->dim();
  auto f_to_c = topology->connectivity(tdim - 1, tdim);
  auto f_to_v = topology->connectivity(tdim - 1, 0);
  if (!f_to_c) {
    topology->create_connectivity(tdim - 1, tdim);
    f_to_c = topology->connectivity(tdim - 1, tdim);
  }
  if (!f_to_v) {
    topology->create_connectivity(tdim - 1, 0);
    f_to_v = topology->connectivity(tdim - 1, 0);
  }
  // Find all ghost facets with only one attached cell
  auto facet_map = topology->index_map(tdim - 1);
  const int num_local_facets = facet_map->size_local();
  const int num_ghost_facets = facet_map->num_ghosts();
  const int num_local_vertices = topology->index_map(0)->size_local();
  std::vector<std::int32_t> facets;
  for (std::int32_t f = num_local_facets; f < num_local_facets + num_ghost_facets; ++f)
  {
    if (f_to_c->num_links(f) == 1) {
      // Check that at least one facet vertex is owned;
      // otherwise this facet is not needed
      auto vertices = f_to_v->links(f);
      bool has_owned_vertex = false;
      for (std::size_t i = 0; i < vertices.size(); i++) {
        if (vertices[i] < num_local_vertices) has_owned_vertex = true;
      }
      if (has_owned_vertex) facets.push_back(f);
    }
  }
  // Remove facets on internal inter-process boundary
  std::vector<std::int32_t> ext_facets;
  std::ranges::set_difference(facets, topology->interprocess_facets(),
                              std::back_inserter(ext_facets));
  return ext_facets;
}
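
// Usage sketch (illustrative only, not called here): the returned values are
// local facet indices in the ghost range [size_local, size_local + num_ghosts)
// of the facet IndexMap. Assuming a distributed mesh is in scope:
//
//   auto topology = mesh.topology_mutable();
//   std::vector<std::int32_t> ghost_facets
//       = dolfinx::mesh::ghost_exterior_facet_indices(topology);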
std::vector<std::int32_t> dolfinx::mesh::ghost_entities(
    dolfinx::fem::IntegralType integral_type,
    std::shared_ptr<mesh::Topology> topology)
{
  std::vector<std::int32_t> ghost_entities;
  int tdim = topology->dim();
  switch (integral_type) {
  case fem::IntegralType::cell:
  {
    auto cell_index_map = topology->index_map(tdim);
    int num_ghost_cells = cell_index_map->num_ghosts();
    int num_owned_cells = cell_index_map->size_local();
    ghost_entities.resize(num_ghost_cells);
    std::iota(ghost_entities.begin(), ghost_entities.end(), num_owned_cells);
  }
  break;
  case fem::IntegralType::exterior_facet:
  {
    auto ghost_exterior_facets = dolfinx::mesh::ghost_exterior_facet_indices(topology);
    ghost_entities.reserve(2*ghost_exterior_facets.size());
    auto c_to_f = topology->connectivity(tdim, tdim-1);
    auto f_to_c = topology->connectivity(tdim-1, tdim);
    for (std::int32_t f : ghost_exterior_facets) {
      auto pair =
        dolfinx::fem::impl::get_cell_facet_pairs<1>(f, f_to_c->links(f), *c_to_f);
      ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end());
    }
  }
  break;
  case fem::IntegralType::interior_facet:
  {
    auto c_to_f = topology->connectivity(tdim, tdim-1);
    auto f_to_c = topology->connectivity(tdim-1, tdim);
    auto facet_map = topology->index_map(tdim-1);
    int num_local_facets = facet_map->size_local();
    int total_facets = num_local_facets + facet_map->num_ghosts();
    for (int f = num_local_facets; f < total_facets; f++) {
      if (f_to_c->num_links(f) == 2) {
        auto pairs =
          fem::impl::get_cell_facet_pairs<2>(f, f_to_c->links(f), *c_to_f);
        ghost_entities.insert(ghost_entities.end(), pairs.begin(), pairs.end());
      }
    }
  }
  break;
  default:
    break;
  }
  return ghost_entities;
}

std::vector<std::int32_t> dolfinx::mesh::active_ghost_entities(
    std::span<const std::int32_t> active_local_entities,
    fem::IntegralType integral_type,
    std::shared_ptr<mesh::Topology> topology)
{
  std::vector<std::int32_t> ghost_entities;
  MPI_Comm comm = topology->comm();
  // No need for ghosting if there is only one process
  if (dolfinx::MPI::size(comm) == 1) return ghost_entities;
  int rank = dolfinx::MPI::rank(comm);
  int tdim = topology->dim();
  int ent_dim = (integral_type == fem::IntegralType::cell) ? tdim : tdim-1;
  // Step 1: determine the active entities which are ghosted on other processes
  std::map<int, std::vector<std::int32_t>> dest_entities;
  auto imap = topology->index_map(ent_dim);
  int num_local_entities = imap->size_local();
  auto entity_ranks = imap->index_to_dest_ranks();
  int facet_increment = (integral_type == fem::IntegralType::interior_facet) ? 4 : 2;
  switch (integral_type) {
  case fem::IntegralType::cell:
    for (auto& cell : active_local_entities) {
      if (cell >= entity_ranks.num_nodes()) continue;
      for (auto& r : entity_ranks.links(cell)) {
        if (dest_entities.find(r) == dest_entities.end()) {
          dest_entities[r] = {cell};
        }
        else dest_entities[r].push_back(cell);
      }
    }
    break;
  case fem::IntegralType::interior_facet:
  case fem::IntegralType::exterior_facet: {
    auto c_to_f = topology->connectivity(tdim, tdim-1);
    if (!c_to_f) {
      topology->create_connectivity(tdim, tdim-1);
      c_to_f = topology->connectivity(tdim, tdim-1);
    }
    for (std::size_t i = 0; i < active_local_entities.size(); i += facet_increment) {
      auto cell = active_local_entities[i];
      auto facet_index = active_local_entities[i+1];
      auto facet = c_to_f->links(cell)[facet_index];
      if (facet >= entity_ranks.num_nodes()) continue;
      for (auto& r : entity_ranks.links(facet)) {
        if (dest_entities.find(r) == dest_entities.end()) {
          dest_entities[r] = {facet};
        }
        else dest_entities[r].push_back(facet);
      }
    }
    break;
  }
  default:
    break;
  }

  // Step 2: send those entities to the other processes
  std::vector<std::int64_t> indices_send_buffer;
  // Construct list of destination MPI ranks
  std::vector<int> dest;
  std::vector<int> send_sizes;
  for (const auto& pair : dest_entities) {
    dest.push_back(pair.first);
    std::size_t num_inds = pair.second.size();
    send_sizes.push_back(num_inds);
    std::vector<std::int64_t> global_inds(num_inds);
    imap->local_to_global(pair.second, global_inds);
    for (const auto& i : global_inds)
      indices_send_buffer.push_back(i);
  }
  // Get source ranks
  std::vector<int> src = dolfinx::MPI::compute_graph_edges_nbx(comm, dest);
  // Create neighbor communicator
  MPI_Comm neigh_comm;
  int ierr = MPI_Dist_graph_create_adjacent(
      comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(),
      dest.data(), MPI_UNWEIGHTED, MPI_INFO_NULL, false, &neigh_comm);
  dolfinx::MPI::check_error(comm, ierr);
  // Share lengths of indices to be sent to each rank
  std::vector<int> recv_sizes(src.size(), 0);
  ierr = MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT,
                               recv_sizes.data(), 1, MPI_INT, neigh_comm);
  dolfinx::MPI::check_error(comm, ierr);
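  // Note on the exchange pattern: MPI_Neighbor_alltoall above first shares one
  // int per neighbor (how many indices each rank will receive); the exclusive
  // prefix sums computed below turn the size arrays into displacements for the
  // variable-length MPI_Neighbor_alltoallv. For example, recv_sizes = {3, 1, 4}
  // gives recv_disp = {0, 3, 4, 8}, so data from src[1] lands at offset 3.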
  // Prepare displacement arrays
  std::vector<int> send_disp(dest.size() + 1, 0);
  std::vector<int> recv_disp(src.size() + 1, 0);
  std::partial_sum(send_sizes.begin(), send_sizes.end(),
                   std::next(send_disp.begin()));
  std::partial_sum(recv_sizes.begin(), recv_sizes.end(),
                   std::next(recv_disp.begin()));
  // Construct receive buffer and perform communication
  std::size_t recv_buf_size = recv_disp.back();
  // Make sure that the buffer pointers actually get allocated
  std::vector<std::int64_t> indices_recv_buffer(recv_buf_size);
  ierr = MPI_Neighbor_alltoallv(indices_send_buffer.data(), send_sizes.data(),
                                send_disp.data(), MPI_INT64_T,
                                indices_recv_buffer.data(), recv_sizes.data(),
                                recv_disp.data(), MPI_INT64_T, neigh_comm);
  dolfinx::MPI::check_error(comm, ierr);
  // Free the neighbor communicator now that communication is complete
  ierr = MPI_Comm_free(&neigh_comm);
  dolfinx::MPI::check_error(comm, ierr);
  // Step 3: convert from global to local indices and do entity computation
  std::vector<std::int32_t> local_recv_indices(indices_recv_buffer.size());
  imap->global_to_local(indices_recv_buffer, local_recv_indices);

  switch (integral_type) {
  case fem::IntegralType::cell:
    return local_recv_indices;
  case fem::IntegralType::exterior_facet: {
    // Remove facets on internal inter-process boundary
    std::vector<std::int32_t> ext_facets;
    std::sort(local_recv_indices.begin(), local_recv_indices.end());
    std::ranges::set_difference(local_recv_indices, topology->interprocess_facets(),
                                std::back_inserter(ext_facets));
    auto c_to_f = topology->connectivity(tdim, tdim-1);
    auto f_to_c = topology->connectivity(tdim-1, tdim);
    ghost_entities.reserve(2*ext_facets.size());
    for (auto& facet : ext_facets) {
      if (f_to_c->num_links(facet) == 1) {
        auto pair =
          dolfinx::fem::impl::get_cell_facet_pairs<1>(facet, f_to_c->links(facet), *c_to_f);
        ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end());
      }
    }
    break;
  }
  case fem::IntegralType::interior_facet: {
    auto c_to_f = topology->connectivity(tdim, tdim-1);
    auto f_to_c = topology->connectivity(tdim-1, tdim);
    for (auto& facet : local_recv_indices) {
      if (f_to_c->num_links(facet) == 2) {
        auto pair =
          dolfinx::fem::impl::get_cell_facet_pairs<2>(facet, f_to_c->links(facet), *c_to_f);
        ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end());
      }
    }
    break;
  }
  default:
    break;
  }

  return ghost_entities;
}
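
// Rough sketch of how the two helpers cooperate during distributed assembly
// (hypothetical driver code; the real call sites live in the assembler, and
// "active_cells" is a hypothetical list of locally active cells):
//
//   // Ghost entities this rank must also assemble over
//   auto ghosts = dolfinx::mesh::ghost_entities(
//       dolfinx::fem::IntegralType::cell, topology);
//   // Local entities whose integrals are active on some other rank
//   auto incoming = dolfinx::mesh::active_ghost_entities(
//       std::span<const std::int32_t>(active_cells),
//       dolfinx::fem::IntegralType::cell, topology);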
--------------------------------------------------------------------------------
/cpp/cudolfinx/fem/CUDADofMap.cpp:
--------------------------------------------------------------------------------
// Copyright (C) 2024 Benjamin Pachev, James D. Trotter
//
// This file is part of cuDOLFINX
//
// SPDX-License-Identifier: LGPL-3.0-or-later

#include "CUDADofMap.h"
#include <cuda.h>
#include <cudolfinx/common/CUDA.h>
#include <dolfinx/fem/DofMap.h>
#include <vector>

using namespace dolfinx;
using namespace dolfinx::fem;

//-----------------------------------------------------------------------------
CUDADofMap::CUDADofMap()
  : _dofmap(nullptr)
  , _num_dofs()
  , _num_cells()
  , _num_dofs_per_cell()
  , _ddofs_per_cell(0)
  , _dcells_per_dof_ptr(0)
  , _dcells_per_dof(0)
{
}

CUDADofMap::CUDADofMap(
  const dolfinx::fem::DofMap* dofmap)
  : CUDADofMap(*dofmap, 0, 0, nullptr)
{
}

CUDADofMap::CUDADofMap(
  const dolfinx::fem::DofMap* dofmap,
  std::int32_t offset,
  std::int32_t ghost_offset,
  std::map<std::int32_t, std::int32_t>* restriction)
  : CUDADofMap(*dofmap, offset, ghost_offset, restriction)
{
}

CUDADofMap::CUDADofMap(
  const dolfinx::fem::DofMap& dofmap)
  : CUDADofMap(dofmap, 0, 0, nullptr)
{
}

//-----------------------------------------------------------------------------
CUDADofMap::CUDADofMap(
  const dolfinx::fem::DofMap& dofmap,
  std::int32_t offset,
  std::int32_t ghost_offset,
  std::map<std::int32_t, std::int32_t>* restriction)
  : _dofmap(&dofmap)
  , _num_dofs()
  , _num_cells()
  , _num_dofs_per_cell()
  , _ddofs_per_cell(0)
  , _dcells_per_dof_ptr(0)
  , _dcells_per_dof(0)
{
  CUresult cuda_err;
  const char * cuda_err_description;

  auto dofs = dofmap.map();
  auto element_dof_layout = dofmap.element_dof_layout();
  // Get block sizes and ensure positivity (sometimes the default is -1)
  std::int32_t element_block_size = element_dof_layout.block_size();
  _block_size = dofmap.bs();
  element_block_size = (element_block_size < 0) ? 1 : element_block_size;
  _block_size = (_block_size < 0) ? 1 : _block_size;
  _num_cells = dofs.extent(0);
  _num_dofs_per_cell = element_dof_layout.num_dofs() * element_block_size;
  _num_dofs = dofs.size() * _block_size;
  if (_num_dofs != _num_cells * _num_dofs_per_cell) {
    throw std::runtime_error(
      "Num dofs " + std::to_string(_num_dofs) + " != " + std::to_string(_num_cells) +
      "*" + std::to_string(_num_dofs_per_cell) + " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
  }
  // Allocate device-side storage for degrees of freedom
  if (_num_cells > 0 && _num_dofs_per_cell > 0) {
    size_t ddofs_per_cell_size = _num_dofs * sizeof(int32_t);
    cuda_err = cuMemAlloc(
      &_ddofs_per_cell,
      ddofs_per_cell_size);
    if (cuda_err != CUDA_SUCCESS) {
      cuGetErrorString(cuda_err, &cuda_err_description);
      throw std::runtime_error(
        "cuMemAlloc() failed with " + std::string(cuda_err_description) +
        " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
    }
  }
  update(offset, ghost_offset, restriction);

  // cells_per_dof_ptr and cells_per_dof are only used for
  // lookup table computations, which currently aren't in use
  /*
  // Compute mapping from degrees of freedom to cells
  std::vector<std::int32_t> cells_per_dof_ptr(_num_dofs+1);

  // Count the number of cells containing each degree of freedom
  for (int32_t i = 0; i < _num_cells; i++) {
    auto cell_dofs = dofmap.cell_dofs(i);
    for (int32_t l = 0; l < cell_dofs.size(); l++) {
      int32_t j = cell_dofs[l];
      cells_per_dof_ptr[j+1]++;
    }
  }

  // Compute offset to the first cell for each degree of freedom
  for (int32_t i = 0; i < _num_dofs; i++)
    cells_per_dof_ptr[i+1] += cells_per_dof_ptr[i];
  int32_t num_dof_cells = cells_per_dof_ptr[_num_dofs];
  if (num_dof_cells != _num_cells * _num_dofs_per_cell) {
    cuMemFree(_ddofs_per_cell);
    throw std::logic_error(
      "Expected " + std::to_string(_num_cells) + " cells, " +
      std::to_string(_num_dofs_per_cell) + " degrees of freedom per cell, "
      "but the mapping from degrees of freedom to cells contains " +
      std::to_string(num_dof_cells) + " values" );
  }

  // Allocate storage for and compute the cells containing each degree
  // of freedom
  std::vector<std::int32_t> cells_per_dof(num_dof_cells);
  for (int32_t i = 0; i < _num_cells; i++) {
    auto cell_dofs = dofmap.cell_dofs(i);
    for (int32_t l = 0; l < cell_dofs.size(); l++) {
      int32_t j = cell_dofs[l];
      int32_t p = cells_per_dof_ptr[j];
      cells_per_dof[p] = i;
      cells_per_dof_ptr[j]++;
    }
  }

  // Adjust offsets to first cell
  for (int32_t i = _num_dofs; i > 0; i--)
    cells_per_dof_ptr[i] = cells_per_dof_ptr[i-1];
  cells_per_dof_ptr[0] = 0;

  // Allocate device-side storage for offsets to the first cell
  // containing each degree of freedom
  if (_num_dofs > 0) {
    size_t dcells_per_dof_ptr_size = (_num_dofs+1) * sizeof(int32_t);
    cuda_err = cuMemAlloc(
      &_dcells_per_dof_ptr, dcells_per_dof_ptr_size);
    if (cuda_err != CUDA_SUCCESS) {
      cuGetErrorString(cuda_err, &cuda_err_description);
      cuMemFree(_ddofs_per_cell);
      throw std::runtime_error(
        "cuMemAlloc() failed with " + std::string(cuda_err_description) +
        " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
    }
    // Copy cell degrees of freedom to device
    cuda_err = cuMemcpyHtoD(
      _dcells_per_dof_ptr, cells_per_dof_ptr.data(), dcells_per_dof_ptr_size);
    if (cuda_err != CUDA_SUCCESS) {
      cuGetErrorString(cuda_err, &cuda_err_description);
      cuMemFree(_dcells_per_dof_ptr);
      cuMemFree(_ddofs_per_cell);
      throw std::runtime_error(
        "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) +
        " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
    }
  }

  // Allocate device-side storage for cells containing each degree of freedom
  if (_num_cells > 0 && _num_dofs_per_cell > 0) {
    size_t dcells_per_dof_size = num_dof_cells * sizeof(int32_t);
    cuda_err = cuMemAlloc(
      &_dcells_per_dof,
      dcells_per_dof_size);
    if (cuda_err != CUDA_SUCCESS) {
      cuGetErrorString(cuda_err, &cuda_err_description);
      cuMemFree(_dcells_per_dof_ptr);
      cuMemFree(_ddofs_per_cell);
      throw std::runtime_error(
        "cuMemAlloc() failed with " + std::string(cuda_err_description) +
        " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
    }

    // Copy cell degrees of freedom to device
    cuda_err = cuMemcpyHtoD(
      _dcells_per_dof, cells_per_dof.data(), dcells_per_dof_size);
    if (cuda_err != CUDA_SUCCESS) {
      cuGetErrorString(cuda_err, &cuda_err_description);
      cuMemFree(_dcells_per_dof);
      cuMemFree(_dcells_per_dof_ptr);
      cuMemFree(_ddofs_per_cell);
      throw std::runtime_error(
        "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) +
        " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
    }
  }*/
}
//-----------------------------------------------------------------------------
void CUDADofMap::update(
  std::int32_t offset,
  std::int32_t ghost_offset,
  std::map<std::int32_t, std::int32_t>* restriction)
{
  std::vector<std::int32_t> unrolled_dofs;
  const std::int32_t* dofs_orig;
  auto dofs = _dofmap->map();
  dofs_orig = dofs.data_handle();
  std::int32_t local_size = _dofmap->index_map->size_local();

  unrolled_dofs.resize(_num_dofs);

  for (std::size_t i = 0; i < dofs.size(); i++) {
    std::int32_t dof = dofs_orig[i];
    std::int32_t offset_for_dof = (dof < local_size) ? offset : ghost_offset;
    if (restriction) {
      if (restriction->find(dof) != restriction->end()) {
        dof = (*restriction)[dof];
      }
      else {
        // This degree of freedom lies outside the restriction and should
        // not be used; mark it with an invalid index
        for (int j = 0; j < _block_size; j++)
          unrolled_dofs[i*_block_size + j] = -1;
        continue;
      }
    }

    for (int j = 0; j < _block_size; j++)
      unrolled_dofs[i*_block_size + j] = offset_for_dof + dof*_block_size + j;
  }

  // Copy cell degrees of freedom to device
  if (_num_cells > 0 && _num_dofs_per_cell > 0) {
    size_t ddofs_per_cell_size = _num_dofs * sizeof(int32_t);
    CUDA::safeMemcpyHtoD(_ddofs_per_cell, unrolled_dofs.data(), ddofs_per_cell_size);
  }

}
//-----------------------------------------------------------------------------
CUDADofMap::~CUDADofMap()
{
  if (_dcells_per_dof)
    cuMemFree(_dcells_per_dof);
  if (_dcells_per_dof_ptr)
    cuMemFree(_dcells_per_dof_ptr);
  if (_ddofs_per_cell)
    cuMemFree(_ddofs_per_cell);
}
//-----------------------------------------------------------------------------
CUDADofMap::CUDADofMap(CUDADofMap&& dofmap)
  : _dofmap(dofmap._dofmap)
  , _num_dofs(dofmap._num_dofs)
  , _num_cells(dofmap._num_cells)
  , _num_dofs_per_cell(dofmap._num_dofs_per_cell)
  , _block_size(dofmap._block_size)
  , _ddofs_per_cell(dofmap._ddofs_per_cell)
  , _dcells_per_dof_ptr(dofmap._dcells_per_dof_ptr)
  , _dcells_per_dof(dofmap._dcells_per_dof)
{
  dofmap._dofmap = nullptr;
  dofmap._num_dofs = 0;
  dofmap._num_cells = 0;
  dofmap._num_dofs_per_cell = 0;
  dofmap._block_size = 0;
  dofmap._ddofs_per_cell = 0;
  dofmap._dcells_per_dof_ptr = 0;
  dofmap._dcells_per_dof = 0;
}
//-----------------------------------------------------------------------------
CUDADofMap& CUDADofMap::operator=(CUDADofMap&& dofmap)
{
  _dofmap = dofmap._dofmap;
  _num_dofs = dofmap._num_dofs;
  _num_cells = dofmap._num_cells;
  _num_dofs_per_cell = dofmap._num_dofs_per_cell;
  _block_size = dofmap._block_size;
  _ddofs_per_cell = dofmap._ddofs_per_cell;
  _dcells_per_dof_ptr = dofmap._dcells_per_dof_ptr;
  _dcells_per_dof = dofmap._dcells_per_dof;
  dofmap._dofmap = nullptr;
  dofmap._num_dofs = 0;
  dofmap._num_cells = 0;
  dofmap._num_dofs_per_cell = 0;
  dofmap._block_size = 0;
  dofmap._ddofs_per_cell = 0;
  dofmap._dcells_per_dof_ptr = 0;
  dofmap._dcells_per_dof = 0;
  return *this;
}
//-----------------------------------------------------------------------------
--------------------------------------------------------------------------------
/cpp/cudolfinx/mesh/CUDAMesh.h:
--------------------------------------------------------------------------------
// Copyright (C) 2024 Benjamin Pachev, James D. Trotter
//
// This file is part of cuDOLFINX
//
// SPDX-License-Identifier: LGPL-3.0-or-later

#pragma once

#include <concepts>
#include <cuda.h>
#include <cudolfinx/mesh/CUDAMeshEntities.h>
#include <dolfinx/mesh/Mesh.h>
#include <vector>

namespace dolfinx {
namespace mesh {

/// A wrapper for mesh data that is stored in the device memory of a
/// CUDA device.
template <std::floating_point T>
class CUDAMesh
{
public:
  /// Create an empty mesh
  CUDAMesh()
    : _tdim()
    , _num_vertices()
    , _num_coordinates_per_vertex()
    , _dvertex_coordinates(0)
    , _num_cells()
    , _num_vertices_per_cell()
    , _dvertex_indices_per_cell(0)
    , _dcell_permutations(0)
    , _mesh_entities()
  {
  }
  //-----------------------------------------------------------------------------
  /// Create a mesh
  ///
  /// @param[in] mesh Data structures for mesh topology and geometry
  CUDAMesh(const dolfinx::mesh::Mesh<T>& mesh)
  {
    CUresult cuda_err;
    const char * cuda_err_description;

    _tdim = mesh.topology()->dim();

    // Allocate device-side storage for vertex coordinates
    auto vertex_coordinates = mesh.geometry().x();
    _num_vertices = vertex_coordinates.size() / 3;
    // TODO figure out how to handle this properly
    // FEniCSx has a dimension of 3 during assembly, but returns a
    // different value for the dim of mesh.geometry
    _num_coordinates_per_vertex = 3;
    //_num_coordinates_per_vertex = mesh.geometry().dim();
    if (_num_vertices > 0 && _num_coordinates_per_vertex > 0) {
      if (_num_coordinates_per_vertex > 3) {
        throw std::runtime_error(
          "Expected at most 3 coordinates per vertex "
          "instead of " + std::to_string(_num_coordinates_per_vertex) + " "
          "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
      }

      size_t dvertex_coordinates_size =
        _num_vertices * 3 * sizeof(T);
      cuda_err = cuMemAlloc(
        &_dvertex_coordinates,
        dvertex_coordinates_size);
      if (cuda_err != CUDA_SUCCESS) {
        cuGetErrorString(cuda_err, &cuda_err_description);
        throw std::runtime_error(
          "cuMemAlloc() failed with " + std::string(cuda_err_description) +
          " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
      }

      // Copy vertex coordinates to device
      cuda_err = cuMemcpyHtoD(
        _dvertex_coordinates,
        vertex_coordinates.data(),
        dvertex_coordinates_size);
      if (cuda_err != CUDA_SUCCESS) {
        cuMemFree(_dvertex_coordinates);
        cuGetErrorString(cuda_err, &cuda_err_description);
        throw std::runtime_error(
          "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) +
          " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
      }
    }

    // Obtain mesh geometry
    auto x_dofmap =
      mesh.geometry().dofmap();

    // Allocate device-side storage for cell vertex indices
    _num_cells = x_dofmap.extent(0);
    _num_vertices_per_cell = x_dofmap.extent(1);
    if (_num_cells > 0 && _num_vertices_per_cell > 0) {
      size_t dvertex_indices_per_cell_size =
        _num_cells * _num_vertices_per_cell * sizeof(int32_t);
      cuda_err = cuMemAlloc(
        &_dvertex_indices_per_cell,
        dvertex_indices_per_cell_size);
      if (cuda_err != CUDA_SUCCESS) {
        cuMemFree(_dvertex_coordinates);
        cuGetErrorString(cuda_err, &cuda_err_description);
        throw std::runtime_error(
          "cuMemAlloc() failed with " + std::string(cuda_err_description) +
          " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
      }

      // Copy cell vertex indices to device
      cuda_err = cuMemcpyHtoD(
        _dvertex_indices_per_cell,
        x_dofmap.data_handle(),
        dvertex_indices_per_cell_size);
      if (cuda_err != CUDA_SUCCESS) {
        cuMemFree(_dvertex_indices_per_cell);
        cuMemFree(_dvertex_coordinates);
        cuGetErrorString(cuda_err, &cuda_err_description);
        throw std::runtime_error(
          "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) +
          " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
      }
    }

    // Obtain cell permutations
    mesh.topology_mutable()->create_entity_permutations();
    auto cell_permutations = mesh.topology()->get_cell_permutation_info();

    // Allocate device-side storage for cell permutations
    if (_num_cells > 0) {
      size_t dcell_permutations_size =
        _num_cells * sizeof(uint32_t);
      cuda_err = cuMemAlloc(
        &_dcell_permutations,
        dcell_permutations_size);
      if (cuda_err != CUDA_SUCCESS) {
        cuMemFree(_dvertex_indices_per_cell);
        cuMemFree(_dvertex_coordinates);
        cuGetErrorString(cuda_err, &cuda_err_description);
        throw std::runtime_error(
          "cuMemAlloc() failed with " + std::string(cuda_err_description) +
          " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
      }

      // Copy cell permutations to device
      cuda_err = cuMemcpyHtoD(
        _dcell_permutations,
        cell_permutations.data(),
        dcell_permutations_size);
      if (cuda_err != CUDA_SUCCESS) {
        cuMemFree(_dcell_permutations);
        cuGetErrorString(cuda_err, &cuda_err_description);
        throw std::runtime_error(
          "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) +
          " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__));
      }
    }

    for (int dim = 0; dim < _tdim; dim++) {
      _mesh_entities.emplace_back(
        mesh, dim);
    }
  }
  //-----------------------------------------------------------------------------
  /// Destructor
  ~CUDAMesh()
  {
    if (_dcell_permutations)
      cuMemFree(_dcell_permutations);
    if (_dvertex_indices_per_cell)
      cuMemFree(_dvertex_indices_per_cell);
    if (_dvertex_coordinates)
      cuMemFree(_dvertex_coordinates);
  }
  //-----------------------------------------------------------------------------
  /// Copy constructor
  /// @param[in] mesh The object to be copied
  CUDAMesh(const CUDAMesh& mesh) = delete;

  /// Move constructor
  /// @param[in] mesh The object to be moved
  CUDAMesh(CUDAMesh&& mesh)
    : _tdim(mesh._tdim)
    , _num_vertices(mesh._num_vertices)
    , _num_coordinates_per_vertex(mesh._num_coordinates_per_vertex)
    , _dvertex_coordinates(mesh._dvertex_coordinates)
    , _num_cells(mesh._num_cells)
    , _num_vertices_per_cell(mesh._num_vertices_per_cell)
    , _dvertex_indices_per_cell(mesh._dvertex_indices_per_cell)
    , _dcell_permutations(mesh._dcell_permutations)
    , _mesh_entities(std::move(mesh._mesh_entities))
  {
    mesh._tdim = 0;
    mesh._num_vertices = 0;
    mesh._num_coordinates_per_vertex = 0;
    mesh._dvertex_coordinates = 0;
    mesh._num_cells = 0;
    mesh._num_vertices_per_cell = 0;
    mesh._dvertex_indices_per_cell = 0;
    mesh._dcell_permutations = 0;
  }
  //-----------------------------------------------------------------------------
  /// Assignment operator
  /// @param[in] mesh Another CUDAMesh object
  CUDAMesh& operator=(const CUDAMesh& mesh) = delete;

  /// Move assignment operator
  /// @param[in] mesh Another CUDAMesh object
  CUDAMesh& operator=(CUDAMesh&& mesh)
  {
    _tdim = mesh._tdim;
    _num_vertices = mesh._num_vertices;
    _num_coordinates_per_vertex = mesh._num_coordinates_per_vertex;
    _dvertex_coordinates = mesh._dvertex_coordinates;
    _num_cells = mesh._num_cells;
    _num_vertices_per_cell = mesh._num_vertices_per_cell;
    _dvertex_indices_per_cell = mesh._dvertex_indices_per_cell;
    _dcell_permutations = mesh._dcell_permutations;
    _mesh_entities = std::move(mesh._mesh_entities);
    mesh._tdim = 0;
    mesh._num_vertices = 0;
    mesh._num_coordinates_per_vertex = 0;
    mesh._dvertex_coordinates = 0;
    mesh._num_cells = 0;
    mesh._num_vertices_per_cell = 0;
    mesh._dvertex_indices_per_cell = 0;
    mesh._dcell_permutations = 0;
    return *this;
  }
  //-----------------------------------------------------------------------------

  /// Get the topological dimension of the mesh
  int32_t tdim() const { return _tdim; }

  /// Get the number of vertices
  int32_t num_vertices() const { return _num_vertices; }

  /// Get the number of coordinates per vertex
  int32_t num_coordinates_per_vertex() const {
    return _num_coordinates_per_vertex; }

  /// Get a handle to the device-side vertex coordinates
  CUdeviceptr vertex_coordinates() const {
    return _dvertex_coordinates; }

  /// Get the number of cells
  int32_t num_cells() const { return _num_cells; }

  /// Get the number of vertices per cell
  int32_t num_vertices_per_cell() const {
    return _num_vertices_per_cell; }

  /// Get a handle to the device-side cell vertex indices
  CUdeviceptr vertex_indices_per_cell() const {
    return _dvertex_indices_per_cell; }

  /// Get a handle to the device-side cell permutations
  CUdeviceptr cell_permutations() const {
    return _dcell_permutations; }

  /// Get the mesh entities of each dimension
  const std::vector<CUDAMeshEntities<T>>& mesh_entities() const {
    return _mesh_entities; }

private:
  /// The topological dimension of the mesh, or the largest dimension
  /// of any of the mesh entities
  int32_t _tdim;

  /// The number of vertices in the mesh
  int32_t _num_vertices;

  /// The number of coordinates for each vertex
  int32_t _num_coordinates_per_vertex;

  /// The coordinates of the mesh vertices
  CUdeviceptr _dvertex_coordinates;

  /// The number of cells in the mesh
  int32_t _num_cells;

  /// The number of vertices in each cell
  int32_t _num_vertices_per_cell;

  /// The vertex indices of each cell
  CUdeviceptr _dvertex_indices_per_cell;

  /// Cell permutations
  CUdeviceptr _dcell_permutations;

  /// The mesh entities of each dimension
  std::vector<CUDAMeshEntities<T>> _mesh_entities;
};

} // namespace mesh
} // namespace dolfinx
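
// Construction sketch (illustrative): the wrapper deep-copies coordinates,
// cell connectivity, and permutations to the device in the constructor and
// frees them in the destructor, so device ownership follows normal RAII:
//
//   dolfinx::mesh::Mesh<double> mesh = ...;  // any distributed mesh
//   dolfinx::mesh::CUDAMesh<double> cuda_mesh(mesh);
//   CUdeviceptr coords = cuda_mesh.vertex_coordinates();
//
// The class is move-only (copy operations are deleted), which keeps each
// device allocation with exactly one owner.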
--------------------------------------------------------------------------------
/cpp/CMakeLists.txt:
--------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# Top level CMakeLists.txt file for CUDOLFINx
cmake_minimum_required(VERSION 3.19)

# ------------------------------------------------------------------------------
# Set project name and version number
project(CUDOLFINX VERSION "0.9.0")

set(DOXYGEN_CUDOLFINX_VERSION
    ${CUDOLFINX_VERSION}
    CACHE STRING "Version for Doxygen" FORCE
)

# ------------------------------------------------------------------------------
# Use C++20
set(CMAKE_CXX_STANDARD 20)

# Require C++20
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Do not enable compiler-specific extensions
set(CMAKE_CXX_EXTENSIONS OFF)

# ------------------------------------------------------------------------------
# Get GIT changeset, if available
find_program(GIT_FOUND git)

if(GIT_FOUND)
  # Get the commit hash of the working branch
  execute_process(
    COMMAND git rev-parse HEAD
    WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_COMMIT_HASH
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )
else()
  set(GIT_COMMIT_HASH "unknown")
endif()

# ------------------------------------------------------------------------------
# General configuration

# Set location of our FindFoo.cmake modules
set(CMAKE_MODULE_PATH "${CUDOLFINX_SOURCE_DIR}/cmake/modules")

# Make sure CMake uses the correct CUDOLFINXConfig.cmake for tests and demos
set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${CMAKE_CURRENT_BINARY_DIR}/cudolfinx)

# ------------------------------------------------------------------------------
# Configurable options for how we want to build
include(FeatureSummary)

option(BUILD_SHARED_LIBS "Build CUDOLFINx with shared libraries." ON)
add_feature_info(
  BUILD_SHARED_LIBS BUILD_SHARED_LIBS "Build CUDOLFINx with shared libraries."
)

option(CUDOLFINX_SKIP_BUILD_TESTS
       "Skip build tests for testing usability of dependency packages." OFF
)
add_feature_info(
  CUDOLFINX_SKIP_BUILD_TESTS CUDOLFINX_SKIP_BUILD_TESTS
  "Skip build tests for testing usability of dependency packages."
)

# Add shared library paths so shared libs in non-system paths are found
option(CMAKE_INSTALL_RPATH_USE_LINK_PATH
       "Add paths to linker search and installed rpath." ON
)
add_feature_info(
  CMAKE_INSTALL_RPATH_USE_LINK_PATH CMAKE_INSTALL_RPATH_USE_LINK_PATH
  "Add paths to linker search and installed rpath."
)

# Control UFCx discovery
option(
  CUDOLFINX_UFCX_PYTHON
  "Enable UFCx discovery using Python. Disable if UFCx should be found using CMake."
  ON
)
add_feature_info(
  CUDOLFINX_UFCX_PYTHON
  CUDOLFINX_UFCX_PYTHON
  "Enable UFCx discovery using Python. Disable if UFCx should be found using a CMake config file."
)

# ------------------------------------------------------------------------------
# Enable or disable optional packages

if(CUDOLFINX_ENABLE_PETSC)
  set(_REQUIRE_PETSC
      TRUE
      CACHE BOOL "Is PETSc REQUIRED?"
  )
else()
  set(_REQUIRE_PETSC
      FALSE
      CACHE BOOL "Is PETSc REQUIRED?"
  )
endif()

option(CUDOLFINX_ENABLE_PETSC "Compile with support for PETSc." ON)
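
# Configure-time usage (illustrative; paths are placeholders): PETSc support
# defaults to ON. Typical invocations from a build directory:
#
#   cmake -DCUDOLFINX_ENABLE_PETSC=OFF ..
#   PETSC_DIR=/opt/petsc PETSC_ARCH=arch-linux-c-opt cmake ..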
set_package_properties(
  PETSc PROPERTIES
  TYPE RECOMMENDED
  DESCRIPTION "Portable, Extensible Toolkit for Scientific Computation"
  URL "https://petsc.org/"
  PURPOSE "Linear and nonlinear solvers"
)

# ------------------------------------------------------------------------------
# Check for MPI
find_package(MPI 3 REQUIRED)

# ------------------------------------------------------------------------------
# Compiler flags

# Default build type (can be overridden by user)
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE
      "RelWithDebInfo"
      CACHE
        STRING
        "Choose the type of build, options are: Debug Developer MinSizeRel Release RelWithDebInfo."
        FORCE
  )
endif()

# Check for some compiler flags
include(CheckCXXCompilerFlag)
check_cxx_compiler_flag(-pipe HAVE_PIPE)

if(HAVE_PIPE)
  list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -pipe)
endif()

# Add some strict compiler checks
check_cxx_compiler_flag("-Wall -Werror -Wextra -pedantic" HAVE_PEDANTIC)

if(HAVE_PEDANTIC)
  list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -Wall;-Werror;-Wextra;-pedantic)
endif()

# Debug flags
check_cxx_compiler_flag(-g HAVE_DEBUG)

if(HAVE_DEBUG)
  list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -g)
endif()

# Optimisation
check_cxx_compiler_flag(-O2 HAVE_O2_OPTIMISATION)

if(HAVE_O2_OPTIMISATION)
  list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -O2)
endif()

# ------------------------------------------------------------------------------
# Find required packages

# pugixml
find_package(pugixml REQUIRED)

# Note: When updating the Boost version, also update CUDOLFINXConfig.cmake.in
if(DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT)
  set(Boost_NO_SYSTEM_PATHS on)
endif()

set(Boost_USE_MULTITHREADED $ENV{BOOST_USE_MULTITHREADED})
set(Boost_VERBOSE TRUE)
find_package(Boost 1.70 REQUIRED timer)
set_package_properties(
  Boost PROPERTIES
  TYPE REQUIRED
  DESCRIPTION "Boost C++ libraries"
  URL "http://www.boost.org"
)

# Use Python for detecting UFCx and Basix
find_package(
  Python3
  COMPONENTS Interpreter
  QUIET
)

# Check for Basix
# Note: Basix may be installed as a standalone C++ library, or in the Basix
# Python module tree
if(Python3_Interpreter_FOUND)
  message(STATUS "Checking for basix hints with ${Python3_EXECUTABLE}")
  execute_process(
    COMMAND
      ${Python3_EXECUTABLE} -c
      "import basix, os, sys; sys.stdout.write(os.path.dirname(basix.__file__))"
    OUTPUT_VARIABLE BASIX_PY_DIR
    RESULT_VARIABLE BASIX_PY_COMMAND_RESULT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE
  )

  if(BASIX_PY_DIR)
    message(STATUS "Adding ${BASIX_PY_DIR} to Basix search hints")

    # Basix installed from manylinux wheel
    if(IS_DIRECTORY ${BASIX_PY_DIR}/../fenics_basix.libs)
      set(CMAKE_INSTALL_RPATH ${BASIX_PY_DIR}/../fenics_basix.libs)
    endif()
  endif()
endif()

find_package(Basix 0.8 REQUIRED CONFIG HINTS ${BASIX_PY_DIR})
set_package_properties(
  basix PROPERTIES
  TYPE REQUIRED
  DESCRIPTION "FEniCS tabulation library"
  URL "https://github.com/fenics/basix"
)

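# If the Python probe above cannot locate basix, the config file can be
# pointed to manually (illustrative; Basix_DIR must be a directory containing
# BasixConfig.cmake):
#
#   cmake -DBasix_DIR=/path/to/basix/cmake ..
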
find_package(DOLFINX 0.8 REQUIRED CONFIG)
set_package_properties(
  DOLFINX PROPERTIES
  TYPE REQUIRED
  DESCRIPTION "Dynamic Object-oriented Library for FINite element computation"
  URL "https://github.com/fenics/dolfinx"
)

# Check for HDF5
set(HDF5_PREFER_PARALLEL TRUE)
set(HDF5_FIND_DEBUG TRUE)
find_package(HDF5 REQUIRED COMPONENTS C)

if(NOT HDF5_IS_PARALLEL)
  message(
    FATAL_ERROR
    "Found serial HDF5 build, MPI HDF5 build required, try setting HDF5_DIR or HDF5_ROOT"
  )
endif()

set_package_properties(
  HDF5 PROPERTIES
  TYPE REQUIRED
  DESCRIPTION "Hierarchical Data Format 5 (HDF5)"
  URL "https://www.hdfgroup.org/HDF5"
)

# Check for UFC
# Note: we use the case (ufcx vs UFCx) elsewhere to determine by which method
# UFCx was found
if(NOT CUDOLFINX_UFCX_PYTHON)
  # Check in CONFIG mode, i.e. look for installed ufcxConfig.cmake
  find_package(ufcx 0.8 REQUIRED CONFIG)
else()
  # Check in MODULE mode (using FindUFCX.cmake)
  find_package(
    Python3
    COMPONENTS Interpreter
    REQUIRED
  )
  find_package(UFCx 0.8 REQUIRED MODULE)
endif()

set_package_properties(
  UFCx PROPERTIES
  TYPE REQUIRED
  DESCRIPTION "Interface for form-compilers (part of FFCx)"
  URL "https://github.com/fenics/ffcx"
)

find_package(CUDAToolkit REQUIRED)

set_package_properties(CUDAToolkit PROPERTIES TYPE REQUIRED
  DESCRIPTION "Parallel computing platform for GPUs"
  URL "https://developer.nvidia.com/cuda-toolkit"
  PURPOSE "Enables GPU-accelerated computing"
)

# ------------------------------------------------------------------------------
# Find optional packages

if(CUDOLFINX_ENABLE_PETSC)
  find_package(PkgConfig REQUIRED)
  set(ENV{PKG_CONFIG_PATH}
      "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}"
  )
  if(_REQUIRE_PETSC)
    pkg_search_module(PETSC REQUIRED IMPORTED_TARGET PETSc>=3.15 petsc>=3.15)
  else()
    pkg_search_module(PETSC OPTIONAL IMPORTED_TARGET PETSc>=3.15 petsc>=3.15)
  endif()

  # Setting for FeatureSummary
  if(PETSC_FOUND)
    message(
      STATUS "Found PETSc version ${PETSC_VERSION}, prefix: ${PETSC_PREFIX}"
    )
    set_property(GLOBAL APPEND PROPERTY PACKAGES_FOUND PETSc)
  else()
    set_property(GLOBAL APPEND PROPERTY PACKAGES_NOT_FOUND PETSc)
  endif()
endif()

# ------------------------------------------------------------------------------
# Print summary of found and not found optional packages
feature_summary(WHAT ALL)

# ------------------------------------------------------------------------------
# Installation of the CUDOLFINx library
add_subdirectory(cudolfinx)

# ------------------------------------------------------------------------------
# Generate and install helper file cudolfinx.conf

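# cudolfinx.conf is a small shell snippet generated from
# cmake/templates/cudolfinx.conf.in; sourcing it is meant to put the installed
# library on the loader path (LD_LIBRARY_PATH or DYLD_LIBRARY_PATH). A sketch
# of the intended use (the exact variables depend on the template):
#
#   source <prefix>/lib/cudolfinx/cudolfinx.conf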
# FIXME: Can CMake provide the library path name variable?
if(APPLE)
  set(OS_LIBRARY_PATH_NAME "DYLD_LIBRARY_PATH")
else()
  set(OS_LIBRARY_PATH_NAME "LD_LIBRARY_PATH")
endif()

# FIXME: not cross-platform compatible
# Create and install cudolfinx.conf file
configure_file(
  ${CUDOLFINX_SOURCE_DIR}/cmake/templates/cudolfinx.conf.in
  ${CMAKE_BINARY_DIR}/cudolfinx.conf @ONLY
)
install(
  FILES ${CMAKE_BINARY_DIR}/cudolfinx.conf
  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cudolfinx
  COMPONENT Development
)

# ------------------------------------------------------------------------------
# Add "make uninstall" target
configure_file(
  "${CUDOLFINX_SOURCE_DIR}/cmake/templates/cmake_uninstall.cmake.in"
  "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY
)

add_custom_target(
  uninstall "${CMAKE_COMMAND}" -P
  "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake"
)

# ------------------------------------------------------------------------------
# Print post-install message
add_subdirectory(cmake/post-install)

# ------------------------------------------------------------------------------
--------------------------------------------------------------------------------