├── COPYING ├── COPYING.LESSER ├── README.md ├── cpp ├── CMakeLists.txt ├── cmake │ ├── modules │ │ └── FindUFCx.cmake │ ├── post-install │ │ └── CMakeLists.txt │ └── templates │ │ ├── CUDOLFINXConfig.cmake.in │ │ ├── cmake_uninstall.cmake.in │ │ ├── cudolfinx.conf.in │ │ └── cudolfinx.pc.in └── cudolfinx │ ├── CMakeLists.txt │ ├── common │ ├── CMakeLists.txt │ ├── CUDA.cpp │ ├── CUDA.h │ ├── CUDAStore.h │ └── version.h.in │ ├── cudolfinx.h │ ├── fem │ ├── CMakeLists.txt │ ├── CUDAAssembler.cpp │ ├── CUDAAssembler.h │ ├── CUDACoefficient.h │ ├── CUDADirichletBC.h │ ├── CUDADofMap.cpp │ ├── CUDADofMap.h │ ├── CUDAForm.h │ ├── CUDAFormCoefficients.h │ ├── CUDAFormConstants.h │ ├── CUDAFormIntegral.cpp │ ├── CUDAFormIntegral.h │ └── petsc.h │ ├── la │ ├── CMakeLists.txt │ ├── CUDAMatrix.cpp │ ├── CUDAMatrix.h │ ├── CUDASeqMatrix.cpp │ ├── CUDASeqMatrix.h │ ├── CUDAVector.cpp │ ├── CUDAVector.h │ ├── petsc.cpp │ └── petsc.h │ └── mesh │ ├── CMakeLists.txt │ ├── CUDAMesh.h │ ├── CUDAMeshEntities.h │ ├── util.cpp │ └── util.h ├── docker ├── Dockerfile.end-user └── Dockerfile.test-env ├── python ├── CMakeLists.txt ├── README.md ├── build-requirements.txt ├── cudolfinx │ ├── __init__.py │ ├── assemble.py │ ├── bcs.py │ ├── context.py │ ├── form.py │ ├── jit.py │ ├── la.py │ ├── mesh.py │ └── wrappers │ │ ├── caster_petsc.h │ │ ├── cudolfinx.cpp │ │ ├── fem.cpp │ │ └── petsc.cpp ├── examples │ ├── poisson.py │ └── poisson_sum_factorization.py ├── pyproject.toml └── test │ ├── test_cuda_assembly.py │ └── test_multigpu_assembly.py └── spack ├── packages ├── cuda-dolfinx │ └── package.py └── py-cuda-dolfinx │ └── package.py └── repo.yaml /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. 
A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This repository is an add-on extension to the DOLFINx library providing CUDA accelerated assembly routines. It complements the CUDA linear solvers in PETSc to enable fully GPU-accelerated DOLFINx codes. 
It is designed to enable GPU acceleration for existing codes with minimal changes. 4 | 5 | # Basic Usage 6 | 7 | ``` 8 | import cudolfinx as cufem 9 | 10 | # given UFL forms A and L representing a stiffness matrix and right-hand side 11 | cuda_A = cufem.form(A) 12 | cuda_L = cufem.form(L) 13 | asm = cufem.CUDAAssembler() 14 | # returns a custom type CUDAMatrix 15 | mat = asm.assemble_matrix(cuda_A) 16 | mat.assemble() 17 | # get PETSc matrix 18 | petsc_mat = mat.mat() 19 | # returns a custom type CUDAVector 20 | vec = asm.assemble_vector(cuda_L) 21 | # get PETSc vector 22 | petsc_vec = vec.vector() 23 | ```
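The assembled objects are ordinary PETSc handles, so they can be passed straight to PETSc's CUDA-capable solvers. A minimal sketch of the solve step, assuming `petsc4py` and the `petsc_mat`/`petsc_vec` handles from above (the CG/Jacobi solver choice is illustrative, not prescribed by the library):

```
from petsc4py import PETSc

# Solve A x = b with a Krylov solver; a vector created from the assembled
# matrix inherits its layout (and device-side type).
ksp = PETSc.KSP().create(petsc_mat.comm)
ksp.setOperators(petsc_mat)
ksp.setType("cg")
ksp.getPC().setType("jacobi")
x = petsc_mat.createVecRight()
ksp.solve(petsc_vec, x)
```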
24 | 25 | # Dependencies 26 | 27 | - dolfinx 0.9.0 28 | - PETSc with CUDA support 29 | - CUDA Toolkit 12.x 30 | 31 | # Installation 32 | 33 | There are three ways to install `cudolfinx`, in increasing order of difficulty. Currently, it is not possible to use `cudolfinx` with the existing Conda and Docker distributions of `dolfinx`, because these force installation of PETSc without CUDA support. Consequently, installing `cudolfinx` requires a custom build of the `dolfinx` dependency stack with CUDA-enabled PETSc. 34 | 35 | ## Docker 36 | 37 | Using Docker is by far the easiest approach. 38 | 39 | ``` 40 | docker run --gpus all -it benpachev/cudolfinx:v0.9.0-cuda12.6 41 | ``` 42 | You may experience errors with the prebuilt container due to a CUDA Toolkit or MPI version mismatch between the host and container. In this case, the Dockerfiles in `docker/` can be modified to use a different CUDA Toolkit or MPI version to build a container that will work with your system. Note that on HPC systems Docker is typically not available, but Docker containers can be converted to Apptainer/Singularity containers. 43 | 44 | ``` 45 | apptainer pull docker://benpachev/cudolfinx:v0.9.0-cuda12.6 46 | apptainer run --nv cudolfinx_v0.9.0-cuda12.6.sif 47 | ``` 48 | 49 | ## Spack 50 | 51 | Spack is a package management tool for HPC software that allows great flexibility in how code is compiled and dependencies are selected. It has somewhat of a learning curve, and typically doesn't work out of the box without some manual configuration. However, it can be a good choice for HPC systems without Apptainer installed, or if more control over the compilation process and dependencies is desired. To install with Spack: 52 | 53 | ``` 54 | git clone https://github.com/spack/spack.git 55 | . spack/share/spack/setup-env.sh 56 | spack env create cudolfinx-env 57 | spacktivate cudolfinx-env 58 | git clone https://github.com/bpachev/cuda-dolfinx.git 59 | spack repo add cuda-dolfinx/spack 60 | spack add cuda-dolfinx py-cuda-dolfinx 61 | spack install 62 | ``` 63 | 64 | If this leads to errors, it is likely because either (a) Spack is unable to find a suitable compiler or to properly configure your existing compiler, or (b) Spack is trying to build a poorly supported low-level package from source. To resolve (a), you can usually run `spack compiler add`. Especially on HPC systems, [additional configuration](https://spack-tutorial.readthedocs.io/en/latest/tutorial_configuration.html#compiler-configuration) is usually needed. To resolve (b), you will often need to [force Spack to use existing](https://spack-tutorial.readthedocs.io/en/latest/tutorial_configuration.html#external-packages) low-level software on your system instead of letting it build everything from source. 65 | 66 | ## From Source 67 | 68 | The difficult part of the install is the dependencies. The Dockerfiles under `docker/` provide a template for how to install the dependencies on Debian-based systems (and using Docker is by far the easiest way to get a development environment). Once that is taken care of, the installation of `cuda-dolfinx` itself is simple. 69 | 70 | ### C++ Core 71 | ``` 72 | cd cpp 73 | mkdir build && cd build 74 | cmake .. -DCUDOLFINX_SKIP_BUILD_TESTS=YES 75 | make install 76 | ``` 77 | 78 | ### Python Bindings 79 | ``` 80 | cd python 81 | pip install --check-build-dependencies --no-build-isolation . 82 | ``` 83 |
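To check that the bindings work, you can run the bundled Poisson example from `python/examples/` (this assumes a CUDA-capable GPU is visible to the process):

```
cd examples
python poisson.py
```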
84 | For help with installing or using the library, feel free to contact me at benjaminpachev@gmail.com. 85 | -------------------------------------------------------------------------------- /cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Top level CMakeLists.txt file for CUDOLFINx 3 | cmake_minimum_required(VERSION 3.19) 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Set project name and version number 7 | project(CUDOLFINX VERSION "0.9.0") 8 | 9 | set(DOXYGEN_CUDOLFINX_VERSION 10 | ${CUDOLFINX_VERSION} 11 | CACHE STRING "Version for Doxygen" FORCE 12 | ) 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Use C++20 16 | set(CMAKE_CXX_STANDARD 20) 17 | 18 | # Require C++20 19 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 20 | 21 | # Do not enable compiler-specific extensions 22 | set(CMAKE_CXX_EXTENSIONS OFF) 23 | 24 | # ------------------------------------------------------------------------------ 25 | # Get GIT changeset, if available 26 | find_program(GIT_FOUND git) 27 | 28 | if(GIT_FOUND) 29 | # Get the commit hash of the working branch 30 | execute_process( 31 | COMMAND git rev-parse HEAD 32 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 33 | OUTPUT_VARIABLE GIT_COMMIT_HASH 34 | OUTPUT_STRIP_TRAILING_WHITESPACE 35 | ) 36 | else() 37 | set(GIT_COMMIT_HASH "unknown") 38 | endif() 39 | 40 | # ------------------------------------------------------------------------------ 41 | # General configuration 42 | 43 | # Set location of our FindFoo.cmake modules 44 | set(CMAKE_MODULE_PATH "${CUDOLFINX_SOURCE_DIR}/cmake/modules") 45 | 46 | # Make sure CMake uses the correct DOLFINConfig.cmake for tests and demos 47 | set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${CMAKE_CURRENT_BINARY_DIR}/cudolfinx) 48 | 49 | # ------------------------------------------------------------------------------ 50 | # Configurable options for how we want to build 51 | include(FeatureSummary) 52 | 53 | option(BUILD_SHARED_LIBS "Build CUDOLFINx with shared libraries." ON) 54 | add_feature_info( 55 | BUILD_SHARED_LIBS BUILD_SHARED_LIBS "Build CUDOLFINx with shared libraries." 56 | ) 57 | 58 | option(CUDOLFINX_SKIP_BUILD_TESTS 59 | "Skip build tests for testing usability of dependency packages." OFF 60 | ) 61 | add_feature_info( 62 | CUDOLFINX_SKIP_BUILD_TESTS CUDOLFINX_SKIP_BUILD_TESTS 63 | "Skip build tests for testing usability of dependency packages." 64 | ) 65 | 66 | # Add shared library paths so shared libs in non-system paths are found 67 | option(CMAKE_INSTALL_RPATH_USE_LINK_PATH 68 | "Add paths to linker search and installed rpath." ON 69 | ) 70 | add_feature_info( 71 | CMAKE_INSTALL_RPATH_USE_LINK_PATH CMAKE_INSTALL_RPATH_USE_LINK_PATH 72 | "Add paths to linker search and installed rpath." 73 | ) 74 | 75 | # Control UFCx discovery 76 | option( 77 | CUDOLFINX_UFCX_PYTHON 78 | "Enable UFCx discovery using Python. Disable if UFCx should be found using CMake." 79 | ON 80 | ) 81 | add_feature_info( 82 | CUDOLFINX_UFCX_PYTHON 83 | CUDOLFINX_UFCX_PYTHON 84 | "Enable UFCx discovery using Python. Disable if UFCx should be found using a CMake config file." 85 | ) 86 | 87 | # ------------------------------------------------------------------------------ 88 | # Enable or disable optional packages 89 | 90 | 91 | if(CUDOLFINX_ENABLE_PETSC) 92 | set(_REQUIRE_PETSC 93 | TRUE 94 | CACHE BOOL "Is PETSc REQUIRED?" 95 | ) 96 | else() 97 | set(_REQUIRE_PETSC 98 | FALSE 99 | CACHE BOOL "Is PETSc REQUIRED?" 100 | ) 101 | endif() 102 | 103 | option(CUDOLFINX_ENABLE_PETSC "Compile with support for PETSc." ON) 104 | set_package_properties( 105 | PETSc PROPERTIES 106 | TYPE RECOMMENDED 107 | DESCRIPTION "Portable, Extensible Toolkit for Scientific Computation" 108 | URL "https://petsc.org/" 109 | PURPOSE "Linear and nonlinear solvers" 110 | ) 111 | 112 | 113 | # ------------------------------------------------------------------------------ 114 | # Check for MPI 115 | find_package(MPI 3 REQUIRED) 116 | 117 | # ------------------------------------------------------------------------------ 118 | # Compiler flags 119 | 120 | # Default build type (can be overridden by user) 121 | if(NOT CMAKE_BUILD_TYPE) 122 | set(CMAKE_BUILD_TYPE 123 | "RelWithDebInfo" 124 | CACHE 125 | STRING 126 | "Choose the type of build, options are: Debug Developer MinSizeRel Release RelWithDebInfo." 127 | FORCE 128 | ) 129 | endif() 130 | 131 | # Check for some compiler flags 132 | include(CheckCXXCompilerFlag) 133 | check_cxx_compiler_flag(-pipe HAVE_PIPE) 134 | 135 | if(HAVE_PIPE) 136 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -pipe) 137 | endif() 138 | 139 | # Add some strict compiler checks 140 | check_cxx_compiler_flag("-Wall -Werror -Wextra -pedantic" HAVE_PEDANTIC) 141 | 142 | if(HAVE_PEDANTIC) 143 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -Wall;-Werror;-Wextra;-pedantic) 144 | endif() 145 | 146 | # Debug flags 147 | check_cxx_compiler_flag(-g HAVE_DEBUG) 148 | 149 | if(HAVE_DEBUG) 150 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -g) 151 | endif() 152 | 153 | # Optimisation 154 | check_cxx_compiler_flag(-O2 HAVE_O2_OPTIMISATION) 155 | 156 | if(HAVE_O2_OPTIMISATION) 157 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -O2) 158 | endif() 159 | 160 | # ------------------------------------------------------------------------------ 161 | # Find required packages 162 | 163 | # pugixml 164 | find_package(pugixml REQUIRED) 165 | 166 | # Note: When updating Boost version, also update CUDOLFINXConfig.cmake.in 167 | if(DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) 168 | set(Boost_NO_SYSTEM_PATHS on) 169 | endif() 170 | 171 | set(Boost_USE_MULTITHREADED $ENV{BOOST_USE_MULTITHREADED}) 172 | set(Boost_VERBOSE TRUE) 173 | find_package(Boost 1.70 REQUIRED timer) 174 | set_package_properties( 175 | Boost PROPERTIES 176 | TYPE REQUIRED 177 | DESCRIPTION "Boost C++ libraries" 178 | URL "http://www.boost.org" 179 | ) 180 | 181 | # Use Python for detecting UFCx and Basix 182 | find_package( 183 | Python3 184 | COMPONENTS Interpreter 185 | QUIET 186 | ) 187 | 188 | # Check for Basix Note: Basix may be installed as a standalone C++ library, or 189 | # in the Basix Python module tree 190 | if(Python3_Interpreter_FOUND) 191 | message(STATUS "Checking for basix hints with ${Python3_EXECUTABLE}") 192 | execute_process( 193 | COMMAND 194 |
${Python3_EXECUTABLE} -c 195 | "import basix, os, sys; sys.stdout.write(os.path.dirname(basix.__file__))" 196 | OUTPUT_VARIABLE BASIX_PY_DIR 197 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 198 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 199 | ) 200 | 201 | if(BASIX_PY_DIR) 202 | message(STATUS "Adding ${BASIX_PY_DIR} to Basix search hints") 203 | 204 | # Basix installed from manylinux wheel 205 | if(IS_DIRECTORY ${BASIX_PY_DIR}/../fenics_basix.libs) 206 | set(CMAKE_INSTALL_RPATH ${BASIX_PY_DIR}/../fenics_basix.libs) 207 | endif() 208 | endif() 209 | endif() 210 | 211 | find_package(Basix 0.8 REQUIRED CONFIG HINTS ${BASIX_PY_DIR}) 212 | set_package_properties( 213 | basix PROPERTIES 214 | TYPE REQUIRED 215 | DESCRIPTION "FEniCS tabulation library" 216 | URL "https://github.com/fenics/basix" 217 | ) 218 | 219 | find_package(DOLFINX 0.8 REQUIRED CONFIG) 220 | set_package_properties( 221 | DOLFINX PROPERTIES 222 | TYPE REQUIRED 223 | DESCRIPTION "Dynamic Object-oriented Library for FINite element computation" 224 | URL "https://github.com/fenics/dolfinx" 225 | ) 226 | 227 | # Check for HDF5 228 | set(HDF5_PREFER_PARALLEL TRUE) 229 | set(HDF5_FIND_DEBUG TRUE) 230 | find_package(HDF5 REQUIRED COMPONENTS C) 231 | 232 | if(NOT HDF5_IS_PARALLEL) 233 | message( 234 | FATAL_ERROR 235 | "Found serial HDF5 build, MPI HDF5 build required, try setting HDF5_DIR or HDF5_ROOT" 236 | ) 237 | endif() 238 | 239 | set_package_properties( 240 | HDF5 PROPERTIES 241 | TYPE REQUIRED 242 | DESCRIPTION "Hierarchical Data Format 5 (HDF5)" 243 | URL "https://www.hdfgroup.org/HDF5" 244 | ) 245 | 246 | # Check for UFC Note: we use the case (ufcx vs UFCx) elsewhere to determine by 247 | # which method UFCx was found 248 | if(NOT CUDOLFINX_UFCX_PYTHON) 249 | # Check in CONFIG mode, i.e.
look for installed ufcxConfig.cmake 250 | find_package(ufcx 0.8 REQUIRED CONFIG) 251 | else() 252 | # Check in MODULE mode (using FindUFCX.cmake) 253 | find_package( 254 | Python3 255 | COMPONENTS Interpreter 256 | REQUIRED 257 | ) 258 | find_package(UFCx 0.8 REQUIRED MODULE) 259 | endif() 260 | 261 | set_package_properties( 262 | UFCx PROPERTIES 263 | TYPE REQUIRED 264 | DESCRIPTION "Interface for form-compilers (part of FFCx)" 265 | URL "https://github.com/fenics/ffcx" 266 | ) 267 | 268 | find_package(CUDAToolkit REQUIRED) 269 | 270 | set_package_properties(CUDAToolkit PROPERTIES TYPE REQUIRED 271 | DESCRIPTION "Parallel computing platform for GPUs" 272 | URL "https://developer.nvidia.com/cuda-toolkit" 273 | PURPOSE "Enables GPU-accelerated computing" 274 | ) 275 | 276 | 277 | # ------------------------------------------------------------------------------ 278 | # Find optional packages 279 | 280 | if(CUDOLFINX_ENABLE_PETSC) 281 | find_package(PkgConfig REQUIRED) 282 | set(ENV{PKG_CONFIG_PATH} 283 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 284 | ) 285 | if(_REQUIRE_PETSC) 286 | pkg_search_module(PETSC REQUIRED IMPORTED_TARGET PETSc>=3.15 petsc>=3.15) 287 | else() 288 | pkg_search_module(PETSC IMPORTED_TARGET PETSc>=3.15 petsc>=3.15) 289 | endif() 290 | 291 | # Setting for FeatureSummary 292 | if(PETSC_FOUND) 293 | message( 294 | STATUS "Found PETSc version ${PETSC_VERSION}, prefix: ${PETSC_PREFIX}" 295 | ) 296 | set_property(GLOBAL APPEND PROPERTY PACKAGES_FOUND PETSc) 297 | else() 298 | set_property(GLOBAL APPEND PROPERTY PACKAGES_NOT_FOUND PETSc) 299 | endif() 300 | endif() 301 | 302 | # ------------------------------------------------------------------------------ 303 | # Print summary of found and not found optional packages 304 | feature_summary(WHAT ALL) 305 | 306 | 307 | 308 | # ------------------------------------------------------------------------------ 309 | # Installation of CUDOLFINx library 310 | add_subdirectory(cudolfinx) 311 | 312 | # ------------------------------------------------------------------------------ 313 | # Generate and install helper file cudolfinx.conf 314 | 315 | # FIXME: Can CMake provide the library path name variable?
316 | if(APPLE) 317 | set(OS_LIBRARY_PATH_NAME "DYLD_LIBRARY_PATH") 318 | else() 319 | set(OS_LIBRARY_PATH_NAME "LD_LIBRARY_PATH") 320 | endif() 321 | 322 | # FIXME: not cross-platform compatible Create and install cudolfinx.conf file 323 | configure_file( 324 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/cudolfinx.conf.in 325 | ${CMAKE_BINARY_DIR}/cudolfinx.conf @ONLY 326 | ) 327 | install( 328 | FILES ${CMAKE_BINARY_DIR}/cudolfinx.conf 329 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cudolfinx 330 | COMPONENT Development 331 | ) 332 | 333 | # ------------------------------------------------------------------------------ 334 | # Add "make uninstall" target 335 | configure_file( 336 | "${CUDOLFINX_SOURCE_DIR}/cmake/templates/cmake_uninstall.cmake.in" 337 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY 338 | ) 339 | 340 | add_custom_target( 341 | uninstall "${CMAKE_COMMAND}" -P 342 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" 343 | ) 344 | 345 | # ------------------------------------------------------------------------------ 346 | # Print post-install message 347 | add_subdirectory(cmake/post-install) 348 | 349 | # ------------------------------------------------------------------------------ 350 | -------------------------------------------------------------------------------- /cpp/cmake/modules/FindUFCx.cmake: -------------------------------------------------------------------------------- 1 | #============================================================================= 2 | # - Try to find UFCx by interrogating the Python module FFCx 3 | # Once done this will define 4 | # 5 | # UFCX_FOUND - system has UFCx 6 | # UFCX_INCLUDE_DIRS - include directories for UFCx 7 | # UFCX_SIGNATURE - signature for UFCx 8 | # UFCX_VERSION - version for UFCx 9 | # 10 | #============================================================================= 11 | # Copyright (C) 2010-2021 Johannes Ring and Garth N. Wells 12 | # All rights reserved. 13 | # 14 | # Redistribution and use in source and binary forms, with or without 15 | # modification, are permitted provided that the following conditions 16 | # are met: 17 | # 18 | # 1. Redistributions of source code must retain the above copyright 19 | # notice, this list of conditions and the following disclaimer. 20 | # 2. Redistributions in binary form must reproduce the above copyright 21 | # notice, this list of conditions and the following disclaimer in 22 | # the documentation and/or other materials provided with the 23 | # distribution. 24 | # 25 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 28 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 29 | # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 30 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 31 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 32 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 33 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 35 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 | # POSSIBILITY OF SUCH DAMAGE. 37 | #============================================================================= 38 | 39 | message( 40 | STATUS 41 | "Asking Python module FFCx for location of UFC... 
(Python executable: ${Python3_EXECUTABLE})" 42 | ) 43 | 44 | # Get include path 45 | execute_process( 46 | COMMAND 47 | ${Python3_EXECUTABLE} -c 48 | "import ffcx.codegeneration, sys; sys.stdout.write(ffcx.codegeneration.get_include_path())" 49 | OUTPUT_VARIABLE UFCX_INCLUDE_DIR 50 | ) 51 | 52 | # Get ufcx.h version 53 | if(UFCX_INCLUDE_DIR) 54 | set(UFCX_INCLUDE_DIRS 55 | ${UFCX_INCLUDE_DIR} 56 | CACHE STRING "Where to find ufcx.h" 57 | ) 58 | execute_process( 59 | COMMAND ${Python3_EXECUTABLE} -c 60 | "import ffcx, sys; sys.stdout.write(ffcx.__version__)" 61 | OUTPUT_VARIABLE UFCX_VERSION 62 | ) 63 | endif() 64 | 65 | # Compute hash of ufcx.h 66 | find_file(_UFCX_HEADER "ufcx.h" ${UFCX_INCLUDE_DIR}) 67 | if(_UFCX_HEADER) 68 | file(SHA1 ${_UFCX_HEADER} UFCX_SIGNATURE) 69 | endif() 70 | 71 | mark_as_advanced(UFCX_VERSION UFCX_INCLUDE_DIRS UFCX_SIGNATURE) 72 | find_package_handle_standard_args( 73 | UFCx 74 | REQUIRED_VARS UFCX_INCLUDE_DIRS UFCX_SIGNATURE UFCX_VERSION 75 | VERSION_VAR UFCX_VERSION HANDLE_VERSION_RANGE REASON_FAILURE_MESSAGE 76 | "UFCx could not be found." 77 | ) 78 | -------------------------------------------------------------------------------- /cpp/cmake/post-install/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | install( 2 | CODE "MESSAGE( 3 | \"---------------------------------------------------------------------------- 4 | CUDOLFINx has now been installed in 5 | 6 | ${CMAKE_INSTALL_PREFIX} 7 | 8 | 9 | Don't forget to update your environment variables. This can be done 10 | easily using the helper file 'cudolfinx.conf' which sets the appropriate 11 | variables (for users of the Bash shell). 12 | 13 | To update your environment variables, run the following command: 14 | 15 | source ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/cudolfinx/cudolfinx.conf 16 | 17 | ----------------------------------------------------------------------------\")" 18 | ) 19 | -------------------------------------------------------------------------------- /cpp/cmake/templates/CUDOLFINXConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # * Build details for CUDOLFINx: CUDA extension for DOLFINX 2 | # 3 | # This file has been automatically generated. 
4 | 5 | # FIXME: Check that naming conforms to CMake standards 6 | 7 | @PACKAGE_INIT@ 8 | include(CMakeFindDependencyMacro) 9 | 10 | find_dependency(MPI REQUIRED) 11 | find_dependency(pugixml) 12 | 13 | # Check for Boost 14 | if(DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) 15 | set(Boost_NO_SYSTEM_PATHS on) 16 | endif() 17 | set(Boost_USE_MULTITHREADED $ENV{BOOST_USE_MULTITHREADED}) 18 | set(Boost_VERBOSE TRUE) 19 | find_dependency(Boost 1.70 REQUIRED COMPONENTS timer filesystem) 20 | 21 | if(@ufcx_FOUND@) 22 | find_dependency(ufcx) 23 | endif() 24 | 25 | # Basix 26 | find_package(Python3 COMPONENTS Interpreter) 27 | if(Python3_Interpreter_FOUND) 28 | execute_process( 29 | COMMAND 30 | ${Python3_EXECUTABLE} -c 31 | "import basix, os, sys; sys.stdout.write(os.path.dirname(basix.__file__))" 32 | OUTPUT_VARIABLE BASIX_PY_DIR 33 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 34 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 35 | ) 36 | endif() 37 | if(BASIX_PY_DIR) 38 | message(STATUS "Adding ${BASIX_PY_DIR} to Basix search hints") 39 | endif() 40 | find_dependency(Basix CONFIG HINTS ${BASIX_PY_DIR}) 41 | 42 | # HDF5 43 | if(NOT TARGET hdf5::hdf5) 44 | set(HDF5_PREFER_PARALLEL TRUE) 45 | set(HDF5_FIND_DEBUG TRUE) 46 | find_dependency(HDF5 COMPONENTS C) 47 | if(HDF5_FOUND AND NOT HDF5_IS_PARALLEL) 48 | message(FATAL_ERROR "Found serial HDF5 build, MPI HDF5 build required") 49 | endif() 50 | endif() 51 | 52 | if(@PETSC_FOUND@) 53 | if(NOT TARGET PkgConfig::PETSC) 54 | find_package(PkgConfig REQUIRED) 55 | set(ENV{PKG_CONFIG_PATH} 56 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 57 | ) 58 | pkg_search_module(PETSC REQUIRED IMPORTED_TARGET PETSc petsc) 59 | endif() 60 | endif() 61 | 62 | if(@SLEPC_FOUND@) 63 | if(NOT TARGET PkgConfig::SLEPC) 64 | find_package(PkgConfig REQUIRED) 65 | set(ENV{PKG_CONFIG_PATH} 66 | "$ENV{SLEPC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{SLEPC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 67 | ) 68 | set(ENV{PKG_CONFIG_PATH} 69 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 70 | ) 71 | set(ENV{PKG_CONFIG_PATH} 72 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}:$ENV{PETSC_DIR}:$ENV{PKG_CONFIG_PATH}" 73 | ) 74 | pkg_search_module(SLEPC REQUIRED IMPORTED_TARGET SLEPc slepc) 75 | endif() 76 | endif() 77 | 78 | if(NOT TARGET cudolfinx) 79 | include("${CMAKE_CURRENT_LIST_DIR}/CUDOLFINXTargets.cmake") 80 | endif() 81 | 82 | check_required_components(CUDOLFINX) 83 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 2 | message( 3 | FATAL_ERROR 4 | "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"" 5 | ) 6 | endif() 7 | 8 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) 9 | string(REGEX REPLACE "\n" ";" files "${files}") 10 | foreach(file ${files}) 11 | message(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"") 12 | if(EXISTS "$ENV{DESTDIR}${file}") 13 | exec_program( 14 | "@CMAKE_COMMAND@" ARGS 15 | "-E remove \"$ENV{DESTDIR}${file}\"" 16 | OUTPUT_VARIABLE rm_out 17 | RETURN_VALUE rm_retval 18 | ) 19 | if(NOT "${rm_retval}" STREQUAL 0) 20 | message(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"") 21 | endif() 22 | else() 23 | message(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.") 24 | 
endif() 25 | endforeach() 26 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cudolfinx.conf.in: -------------------------------------------------------------------------------- 1 | # Helper file for setting non-default CUDOLFINx environment variables 2 | 3 | # Common Unix variables 4 | export @OS_LIBRARY_PATH_NAME@=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@:$@OS_LIBRARY_PATH_NAME@ 5 | export PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@:$PATH 6 | export PKG_CONFIG_PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/pkgconfig:$PKG_CONFIG_PATH 7 | export CMAKE_PREFIX_PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/cmake:$CMAKE_PREFIX_PATH 8 | 9 | # Special macOS variables 10 | export DYLD_FRAMEWORK_PATH=/opt/local/Library/Frameworks:$DYLD_FRAMEWORK_PATH 11 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cudolfinx.pc.in: -------------------------------------------------------------------------------- 1 | # pkg-config configuration for CUDOLFINx 2 | prefix=@CMAKE_INSTALL_PREFIX@ 3 | exec_prefix=@CMAKE_INSTALL_PREFIX@ 4 | libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ 5 | includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ 6 | compiler=@CMAKE_CXX_COMPILER@ 7 | definitions=@PKG_DEFINITIONS@ 8 | extlibs=@CUDOLFINX_EXT_LIBS@ 9 | 10 | Name: CUDOLFINx 11 | Description: CUDA extension for DOLFINX 12 | Version: @CUDOLFINX_VERSION@ 13 | Requires: @PKG_REQUIRES@ 14 | Conflicts: 15 | Libs: @PKG_LINKFLAGS@ -L${libdir} -lcudolfinx 16 | Cflags: @PKG_CXXFLAGS@ -DCUDOLFINX_VERSION=\"@CUDOLFINX_VERSION@\" ${definitions} -I${includedir} @PKG_INCLUDES@ 17 | -------------------------------------------------------------------------------- /cpp/cudolfinx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | include(GNUInstallDirs) 3 | 4 | # ------------------------------------------------------------------------------ 5 | # Declare the library (target) 6 | add_library(cudolfinx) 7 | 8 | # ------------------------------------------------------------------------------ 9 | # Add source files to the target 10 | set(CUDOLFINX_DIRS 11 | common 12 | fem 13 | la 14 | mesh 15 | ) 16 | 17 | # Add source to the cudolfinx target, and get sets of header files 18 | foreach(DIR ${CUDOLFINX_DIRS}) 19 | add_subdirectory(${DIR}) 20 | endforeach() 21 | 22 | # Set target include location (for build and installed) 23 | target_include_directories( 24 | cudolfinx 25 | PUBLIC 26 | $<INSTALL_INTERFACE:include> 27 | "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR};${CMAKE_CURRENT_SOURCE_DIR}/..>" 28 | ) 29 | 30 | # ------------------------------------------------------------------------------ 31 | # Configure the common/version.h file 32 | configure_file( 33 | ${CMAKE_CURRENT_SOURCE_DIR}/common/version.h.in common/version.h @ONLY 34 | ) 35 | 36 | # ------------------------------------------------------------------------------ 37 | # Set target properties 38 | set_target_properties( 39 | cudolfinx 40 | PROPERTIES VERSION ${CUDOLFINX_VERSION} 41 | SOVERSION ${CUDOLFINX_VERSION_MAJOR}.${CUDOLFINX_VERSION_MINOR} 42 | ) 43 | 44 | # Add git revision flag to the one affected file 45 | #set_source_files_properties( 46 | # common/defines.cpp 47 | # PROPERTIES 48 | # COMPILE_DEFINITIONS 49 | # "UFCX_SIGNATURE=\"${UFCX_SIGNATURE}\";CUDOLFINX_GIT_COMMIT_HASH=\"${GIT_COMMIT_HASH}\"" 50 | #) 51 | 52 | # ------------------------------------------------------------------------------ 53 | # Set compiler options and
definitions 54 | 55 | # Set 'Developer' build type flags 56 | target_compile_options( 57 | cudolfinx PRIVATE $<$<CONFIG:Developer>:${CUDOLFINX_CXX_DEVELOPER_FLAGS}> 58 | ) 59 | 60 | # Add version to definitions (public) 61 | target_compile_definitions(cudolfinx PUBLIC CUDOLFINX_VERSION="${CUDOLFINX_VERSION}") 62 | 63 | # ------------------------------------------------------------------------------ 64 | # Add include directories and libraries of required packages 65 | 66 | # UFCx 67 | if(TARGET ufcx::ufcx) 68 | target_link_libraries(cudolfinx PUBLIC ufcx::ufcx) 69 | else() 70 | target_include_directories(cudolfinx SYSTEM PUBLIC ${UFCX_INCLUDE_DIRS}) 71 | endif() 72 | 73 | # Basix 74 | target_link_libraries(cudolfinx PUBLIC Basix::basix) 75 | 76 | # Boost 77 | target_link_libraries(cudolfinx PUBLIC Boost::headers) 78 | target_link_libraries(cudolfinx PUBLIC Boost::timer) 79 | 80 | # MPI 81 | target_link_libraries(cudolfinx PUBLIC MPI::MPI_CXX) 82 | 83 | # HDF5 84 | target_link_libraries(cudolfinx PUBLIC hdf5::hdf5) 85 | 86 | # CUDA Toolkit 87 | target_link_libraries(cudolfinx PRIVATE CUDA::cuda_driver CUDA::nvrtc CUDA::cupti) 88 | target_include_directories(cudolfinx SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 89 | 90 | # Dolfinx 91 | target_link_libraries(cudolfinx PUBLIC dolfinx) 92 | 93 | # ------------------------------------------------------------------------------ 94 | # Optional packages 95 | 96 | # PETSc 97 | if(CUDOLFINX_ENABLE_PETSC AND PETSC_FOUND) 98 | target_link_libraries(cudolfinx PUBLIC PkgConfig::PETSC) 99 | target_compile_definitions(cudolfinx PUBLIC HAS_PETSC) 100 | endif() 101 | 102 | 103 | # ------------------------------------------------------------------------------ 104 | # Install cudolfinx library and header files 105 | install( 106 | TARGETS cudolfinx 107 | EXPORT CUDOLFINXTargets 108 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT RuntimeExecutables 109 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT RuntimeLibraries 110 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT Development 111 | ) 112 | 113 | # Generate CUDOLFINXTargets.cmake 114 | install(EXPORT CUDOLFINXTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx) 115 | 116 | # Install the header files 117 | install( 118 | FILES cudolfinx.h 119 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 120 | COMPONENT Development 121 | ) 122 | 123 | foreach(DIR ${CUDOLFINX_DIRS}) 124 | install( 125 | FILES ${HEADERS_${DIR}} 126 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudolfinx/${DIR} 127 | COMPONENT Development 128 | ) 129 | endforeach() 130 | 131 | install( 132 | FILES ${CMAKE_CURRENT_BINARY_DIR}/common/version.h 133 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudolfinx/common 134 | COMPONENT Development 135 | ) 136 | 137 | # ------------------------------------------------------------------------------ 138 | # Generate CMake config files (CUDOLFINXConfig{,Version}.cmake) 139 | include(CMakePackageConfigHelpers) 140 | write_basic_package_version_file( 141 | ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfigVersion.cmake 142 | VERSION ${CUDOLFINX_VERSION} 143 | COMPATIBILITY AnyNewerVersion 144 | ) 145 | 146 | configure_package_config_file( 147 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/CUDOLFINXConfig.cmake.in 148 | ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfig.cmake 149 | INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx 150 | ) 151 | 152 | # Install CMake helper files 153 | install( 154 | FILES ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfig.cmake 155 |
${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfigVersion.cmake 156 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx 157 | COMPONENT Development 158 | ) 159 | 160 | # ------------------------------------------------------------------------------ 161 | # Generate pkg-config file and install it 162 | 163 | # Define packages that should be required by pkg-config file 164 | set(PKG_REQUIRES "") 165 | 166 | # Get link libraries and includes 167 | get_target_property( 168 | PKGCONFIG_CUDOLFINX_TARGET_LINK_LIBRARIES cudolfinx INTERFACE_LINK_LIBRARIES 169 | ) 170 | get_target_property( 171 | PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES cudolfinx 172 | INTERFACE_SYSTEM_INCLUDE_DIRECTORIES 173 | ) 174 | 175 | # Add imported targets to lists for creating pkg-config file 176 | set(PKGCONFIG_CUDOLFINX_LIBS) 177 | 178 | foreach(_target ${PKGCONFIG_CUDOLFINX_TARGET_LINK_LIBRARIES}) 179 | if("${_target}" MATCHES "^[^<>]+$") # Skip "$<LINK_ONLY:...>", which we get with 180 | # static libs 181 | if("${_target}" MATCHES "^.*::.*$") 182 | # Get include paths 183 | get_target_property(_inc_dirs ${_target} INTERFACE_INCLUDE_DIRECTORIES) 184 | 185 | if(_inc_dirs) 186 | list(APPEND PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES ${_inc_dirs}) 187 | endif() 188 | 189 | # Get libraries 190 | get_target_property(_libs ${_target} INTERFACE_LINK_LIBRARIES) 191 | 192 | if(_libs) 193 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_libs}) 194 | endif() 195 | 196 | else() 197 | # 'regular' libs, i.e. not imported targets 198 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_target}) 199 | endif() 200 | 201 | # Special handling for compiled Boost imported targets 202 | if(("${_target}" MATCHES "^.*Boost::.*$") AND NOT "${_target}" STREQUAL 203 | "Boost::headers" 204 | ) 205 | get_target_property(_libs ${_target} IMPORTED_LOCATION_RELEASE) 206 | 207 | if(_libs) 208 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_libs}) 209 | endif() 210 | endif() 211 | endif() 212 | endforeach() 213 | 214 | # Join include lists and remove duplicates 215 | list(REMOVE_DUPLICATES PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES) 216 | list(REMOVE_DUPLICATES PKGCONFIG_CUDOLFINX_LIBS) 217 | 218 | # Convert include dirs to -I form 219 | foreach(_inc_dir ${PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES}) 220 | set(PKG_INCLUDES "-I${_inc_dir} ${PKG_INCLUDES}") 221 | endforeach() 222 | 223 | # Get cudolfinx definitions 224 | get_target_property( 225 | PKG_CUDOLFINX_DEFINITIONS cudolfinx INTERFACE_COMPILE_DEFINITIONS 226 | ) 227 | set(PKG_DEFINITIONS) 228 | 229 | foreach(_def ${PKG_CUDOLFINX_DEFINITIONS}) 230 | set(PKG_DEFINITIONS "${PKG_DEFINITIONS} -D${_def}") 231 | endforeach() 232 | 233 | # Get basix definitions (this is required to propagate Basix definition to the 234 | # pkg-config file, in the future Basix should create its own basix.pc file, see 235 | # https://github.com/FEniCS/basix/issues/204) 236 | get_target_property( 237 | PKG_BASIX_DEFINITIONS Basix::basix INTERFACE_COMPILE_DEFINITIONS 238 | ) 239 | 240 | foreach(_def ${PKG_BASIX_DEFINITIONS}) 241 | set(PKG_DEFINITIONS "${PKG_DEFINITIONS} -D${_def}") 242 | endforeach() 243 | 244 | # Convert compiler flags and definitions into space separated strings 245 | string(REPLACE ";" " " PKG_CXXFLAGS "${CMAKE_CXX_FLAGS}") 246 | string(REPLACE ";" " " PKG_LINKFLAGS "${CMAKE_EXE_LINKER_FLAGS}") 247 | 248 | # Convert libraries to -L -l form 249 | foreach(_lib ${PKGCONFIG_CUDOLFINX_LIBS}) 250 | # Add -Wl,option directives 251 | if("${_lib}" MATCHES "-Wl,[^ ]*") 252 | set(PKG_LINKFLAGS "${_lib} ${PKG_LINKFLAGS}") 253 | else() 254 | get_filename_component(_path
${_lib} DIRECTORY) 255 | get_filename_component(_name ${_lib} NAME_WE) 256 | string(REPLACE "lib" "" _name "${_name}") 257 | 258 | # Add libraries that match the form -L<libdir> -l<libname> 259 | if(NOT "${_path}" STREQUAL "") 260 | set(PKG_LINKFLAGS "-L${_path} -l${_name} ${PKG_LINKFLAGS}") 261 | endif() 262 | endif() 263 | endforeach() 264 | 265 | # Remove duplicated link flags 266 | separate_arguments(PKG_LINKFLAGS) 267 | list(REMOVE_DUPLICATES PKG_LINKFLAGS) 268 | string(REPLACE ";" " " PKG_LINKFLAGS "${PKG_LINKFLAGS}") 269 | 270 | # Add additional link flags 271 | foreach(_linkflag ${CUDOLFINX_LINK_FLAGS}) 272 | set(PKG_LINKFLAGS "${PKG_LINKFLAGS} ${_linkflag}") 273 | endforeach() 274 | 275 | # Boost include dir (used as pkg-config variable) 276 | get_target_property( 277 | BOOST_INCLUDE_DIR Boost::headers INTERFACE_INCLUDE_DIRECTORIES 278 | ) 279 | 280 | # Configure and install pkg-config file 281 | configure_file( 282 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/cudolfinx.pc.in 283 | ${CMAKE_BINARY_DIR}/cudolfinx.pc @ONLY 284 | ) 285 | install( 286 | FILES ${CMAKE_BINARY_DIR}/cudolfinx.pc 287 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig 288 | COMPONENT Development 289 | ) 290 | 291 | # ------------------------------------------------------------------------------ 292 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_common 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDA.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAStore.h 4 | PARENT_SCOPE 5 | ) 6 | 7 | target_sources( 8 | cudolfinx 9 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cpp 10 | ) 11 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CUDA.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cuda.h> 10 | #include <nvrtc.h> 11 | #include <string> 12 | #include <vector> 13 | 14 | namespace dolfinx 15 | { 16 | 17 | namespace CUDA 18 | { 19 | class Module; 20 | class Kernel; 21 | 22 | /// This class is a wrapper around a CUDA device context 23 | class Context 24 | { 25 | public: 26 | /// Create a CUDA device context 27 | Context(); 28 | 29 | /// Destructor 30 | ~Context(); 31 | 32 | /// Copy constructor 33 | /// @param[in] context The object to be copied 34 | Context(const Context& context) = delete; 35 | 36 | /// Move constructor 37 | /// @param[in] context The object to be moved 38 | Context(Context&& context) = delete; 39 | 40 | /// Assignment operator 41 | /// @param[in] context The object to assign from 42 | Context& operator=(const Context& context) = delete; 43 | 44 | /// Move assignment operator 45 | /// @param[in] context The object to assign from 46 | Context& operator=(Context&& context) = delete; 47 | 48 | /// Return underlying CUDA device 49 | const CUdevice& device() const; 50 | 51 | /// Return underlying CUDA context 52 | CUcontext& context(); 53 | 54 | private: 55 | CUdevice _device; 56 | CUcontext _context; 57 | }; 58 | 59 | /// This class is a wrapper around a module, which is obtained by 60 | /// compiling PTX assembly to CUDA device code.
61 | class Module 62 | { 63 | public: 64 | /// Create an empty module 65 | Module(); 66 | 67 | /// Create a module 68 | Module( 69 | const CUDA::Context& cuda_context, 70 | const std::string& ptx, 71 | CUjit_target target, 72 | int num_module_load_options, 73 | CUjit_option* module_load_options, 74 | void** module_load_option_values, 75 | bool verbose, 76 | bool debug); 77 | 78 | /// Destructor 79 | ~Module(); 80 | 81 | /// Copy constructor 82 | /// @param[in] module The object to be copied 83 | Module(const Module& module) = delete; 84 | 85 | /// Move constructor 86 | /// @param[in] module The object to be moved 87 | Module(Module&& module); 88 | 89 | /// Assignment operator 90 | /// @param[in] module The object to assign from 91 | Module& operator=(const Module& module) = delete; 92 | 93 | /// Move assignment operator 94 | /// @param[in] module The object to assign from 95 | Module& operator=(Module&& module); 96 | 97 | /// Get a device-side function from a loaded module 98 | CUfunction get_device_function( 99 | const std::string& device_function_name) const; 100 | 101 | /// Get info log for a loaded module 102 | const char* info_log() const { 103 | return _info_log; } 104 | 105 | /// Get error log for a loaded module 106 | const char* error_log() const { 107 | return _error_log; } 108 | 109 | private: 110 | /// Handle to the CUDA module 111 | CUmodule _module; 112 | 113 | /// Size of the buffer for informational log messages 114 | size_t _info_log_size; 115 | 116 | /// Informational log messages related to loading the module 117 | char* _info_log; 118 | 119 | /// Size of the buffer for error log messages 120 | size_t _error_log_size; 121 | 122 | /// Error log messages related to loading the module 123 | char* _error_log; 124 | }; 125 | 126 | /// Use the NVIDIA CUDA Runtime Compilation (nvrtc) library to compile 127 | /// device-side code for a given CUDA program. 128 | std::string compile_cuda_cpp_to_ptx( 129 | const char* program_name, 130 | int num_program_headers, 131 | const char** program_headers, 132 | const char** program_include_names, 133 | int num_compile_options, 134 | const char** compile_options, 135 | const char* program_src, 136 | const char* cudasrcdir, 137 | bool verbose); 138 | 139 | void safeMemAlloc(CUdeviceptr* dptr, size_t bytesize); 140 | void safeMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); 141 | void safeMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); 142 | void safeDeviceGetAttribute(int * res, CUdevice_attribute attr, CUdevice dev); 143 | void safeCtxSynchronize(); 144 | void safeStreamCreate(CUstream* streamptr, unsigned int flags); 145 | 146 | template void safeVectorCreate(CUdeviceptr* dptr, std::vector arr) { 147 | size_t bytesize = sizeof(T) * arr.size(); 148 | safeMemAlloc(dptr, bytesize); 149 | safeMemcpyHtoD(*dptr, (void *)arr.data(), bytesize); 150 | } 151 | 152 | CUjit_target get_cujit_target(const Context& cuda_context); 153 | 154 | } // namespace CUDA 155 | 156 | 157 | } // namespace dolfinx 158 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CUDAStore.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
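For orientation, here is an illustrative sketch (not part of the library) of how these wrappers compose; the kernel name and source are hypothetical, and the calls follow the signatures declared above:

```
// Illustrative sketch only: compile a toy kernel to PTX with NVRTC and load
// it through the wrappers declared in CUDA.h.
#include <cudolfinx/common/CUDA.h>

void load_example_kernel()
{
  dolfinx::CUDA::Context ctx; // acquires a CUDA device and context

  // Hypothetical kernel source
  const char* src =
    "extern \"C\" __global__ void scale(double* x, double a, int n)\n"
    "{ int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) x[i] *= a; }\n";

  // Compile CUDA C++ to PTX (no extra headers or compile options)
  std::string ptx = dolfinx::CUDA::compile_cuda_cpp_to_ptx(
      "scale.cu", 0, nullptr, nullptr, 0, nullptr, src, nullptr, false);

  // JIT the PTX for the current device and fetch the kernel handle
  CUjit_target target = dolfinx::CUDA::get_cujit_target(ctx);
  dolfinx::CUDA::Module module(ctx, ptx, target, 0, nullptr, nullptr,
                               false, false);
  CUfunction kernel = module.get_device_function("scale");
  (void)kernel; // launch via cuLaunchKernel in real code
}
```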
-------------------------------------------------------------------------------- /cpp/cudolfinx/common/CUDAStore.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | #include <map> 9 | #include <memory> 10 | 11 | namespace dolfinx::common 12 | { 13 | /// @brief This class represents an abstract mapping between host-side 14 | /// and device-side objects. Its purpose is to prevent creation of duplicate 15 | /// copies of host-side objects on the device. 16 | 17 | template <class H, class D> 18 | class CUDAStore 19 | { 20 | public: 21 | 22 | /// @brief Empty constructor 23 | CUDAStore() 24 | { 25 | } 26 | 27 | /// @brief Return the stored device object, creating and caching it if absent 28 | /// @param[in] host_object Raw pointer to the host-side object 29 | std::shared_ptr<D> get_device_object(const H* host_object) { 30 | auto it = _map.find(host_object); 31 | if (it != _map.end()) return it->second; 32 | auto device_object = std::make_shared<D>(host_object); 33 | _map[host_object] = device_object; 34 | return device_object; 35 | } 36 | 37 | private: 38 | std::map<const H*, std::shared_ptr<D>> _map; 39 | }; 40 | } 41 | 42 |
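A hypothetical usage sketch: any host/device type pair works as long as `D` is constructible from `const H*` (as `get_device_object()` requires); the `CUDADofMap(const DofMap*)` constructor is assumed here for illustration.

```
// Illustrative sketch only: de-duplicating device mirrors of host objects.
#include <cassert>
#include <cudolfinx/common/CUDAStore.h>
#include <cudolfinx/fem/CUDADofMap.h>
#include <dolfinx/fem/DofMap.h>

void cache_example(const dolfinx::fem::DofMap* dofmap)
{
  dolfinx::common::CUDAStore<dolfinx::fem::DofMap, dolfinx::fem::CUDADofMap> store;
  auto d1 = store.get_device_object(dofmap); // created on first request
  auto d2 = store.get_device_object(dofmap); // served from the cache
  assert(d1 == d2); // same shared_ptr: no duplicate device copy
}
```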
-------------------------------------------------------------------------------- /cpp/cudolfinx/common/version.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define CUDOLFINX_VERSION_RELEASE @CUDOLFINX_VERSION_RELEASE@ 4 | #define CUDOLFINX_VERSION_MAJOR @CUDOLFINX_VERSION_MAJOR@ 5 | #define CUDOLFINX_VERSION_MINOR @CUDOLFINX_VERSION_MINOR@ 6 | #define CUDOLFINX_VERSION_MICRO @CUDOLFINX_VERSION_MICRO_STRIPPED@ 7 | #define CUDOLFINX_VERSION_STRING "@CUDOLFINX_VERSION@" 8 | #define CUDOLFINX_VERSION_GIT "@GIT_COMMIT_HASH@" 9 | #define UFCX_SIGNATURE "@UFCX_SIGNATURE@" 10 | -------------------------------------------------------------------------------- /cpp/cudolfinx/cudolfinx.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | namespace cudolfinx 10 | { 11 | } 12 | 13 | // TODO: actually develop a C++ interface, currently the target is Python 14 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_fem 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAAssembler.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDACoefficient.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADirichletBC.h 5 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADofMap.h 6 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAForm.h 7 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormConstants.h 8 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormCoefficients.h 9 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormIntegral.h 10 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.h 11 | PARENT_SCOPE 12 | ) 13 | 14 | target_sources( 15 | cudolfinx 16 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDAAssembler.cpp 17 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADofMap.cpp 18 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormIntegral.cpp 19 | ) 20 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDACoefficient.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <dolfinx/fem/Function.h> 11 | 12 | namespace dolfinx::fem 13 | { 14 | /// @brief A wrapper around a Function 15 | template <typename T, 16 | typename U = dolfinx::scalar_value_type_t<T>> 17 | class CUDACoefficient 18 | { 19 | public: 20 | 21 | /// @brief Construct a new CUDACoefficient 22 | CUDACoefficient(std::shared_ptr<const Function<T, U>> f) { 23 | _f = f; 24 | _x = f->x(); 25 | _dvalues_size = _x->bs() * (_x->index_map()->size_local()+_x->index_map()->num_ghosts()) * sizeof(T); 26 | CUDA::safeMemAlloc(&_dvalues, _dvalues_size); 27 | copy_host_values_to_device(); 28 | } 29 | 30 | /// Copy the host-side values to device memory allocated by the constructor 31 | void copy_host_values_to_device() 32 | { 33 | CUDA::safeMemcpyHtoD(_dvalues, (void*)(_x->array().data()), _dvalues_size); 34 | } 35 | 36 | /// Get pointer to vector data on device 37 | CUdeviceptr device_values() const 38 | { 39 | return _dvalues; 40 | } 41 | 42 | ~CUDACoefficient() 43 | { 44 | if (_dvalues) 45 | cuMemFree(_dvalues); 46 | } 47 | 48 | private: 49 | 50 | // Device-side coefficient array 51 | CUdeviceptr _dvalues; 52 | // Size of coefficient array 53 | size_t _dvalues_size; 54 | // Pointer to host-side Function 55 | std::shared_ptr<const Function<T, U>> _f; 56 | // Pointer to host-side coefficient vector 57 | std::shared_ptr<const la::Vector<T>> _x; 58 | }; 59 | 60 | } 61 |
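A minimal usage sketch under the reconstructed signatures above (illustrative, not library code):

```
// Illustrative sketch only: mirror a coefficient Function on the device and
// re-sync it after the host-side values change.
#include <cudolfinx/fem/CUDACoefficient.h>
#include <dolfinx/fem/Function.h>
#include <memory>

void sync_example(std::shared_ptr<const dolfinx::fem::Function<double>> f)
{
  // Allocates device memory and copies the current values
  dolfinx::fem::CUDACoefficient<double> coeff(f);

  // ... update f's underlying vector on the host ...

  coeff.copy_host_values_to_device();         // push the new values
  CUdeviceptr values = coeff.device_values(); // raw pointer for kernels
  (void)values;
}
```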
55 | /// @param[in] bcs The boundary conditions to copy to device memory 56 | CUDADirichletBC( 57 | const CUDA::Context& cuda_context, 58 | const dolfinx::fem::FunctionSpace<U>& V, 59 | const std::vector<std::shared_ptr<const DirichletBC<T, U>>>& bcs) 60 | : _num_dofs() 61 | , _num_owned_boundary_dofs() 62 | , _num_boundary_dofs() 63 | , _ddof_markers(0) 64 | , _ddof_indices(0) 65 | , _ddof_values(0) 66 | { 67 | CUresult cuda_err; 68 | const char * cuda_err_description; 69 | 70 | // Count the number of degrees of freedom 71 | const dolfinx::fem::DofMap& dofmap = *(V.dofmap()); 72 | const common::IndexMap& index_map = *dofmap.index_map; 73 | // Looks like index_map no longer has block_size 74 | const int block_size = dofmap.index_map_bs(); 75 | _num_dofs = block_size * ( 76 | index_map.size_local() + index_map.num_ghosts()); 77 | 78 | // Count the number of degrees of freedom affected by boundary 79 | // conditions 80 | _num_owned_boundary_dofs = 0; 81 | _num_boundary_dofs = 0; 82 | 83 | // Build dof markers, indices and values 84 | signed char* dof_markers = nullptr; 85 | std::vector<std::int32_t> dof_indices; 86 | std::vector<std::int32_t> ghost_dof_indices; 87 | for (auto const& bc : bcs) { 88 | if (V.contains(*bc->function_space())) { 89 | if (!dof_markers) { 90 | dof_markers = new signed char[_num_dofs]; 91 | for (int i = 0; i < _num_dofs; i++) { 92 | dof_markers[i] = 0; 93 | } 94 | _dof_values.assign(_num_dofs, 0.0); 95 | } 96 | 97 | bc->mark_dofs(std::span(dof_markers, _num_dofs)); 98 | auto const [dofs, range] = bc->dof_indices(); 99 | for (std::int32_t i = 0; i < dofs.size(); i++) { 100 | if (i < range) dof_indices.push_back(dofs[i]); 101 | else ghost_dof_indices.push_back(dofs[i]); 102 | } 103 | bc->set(std::span(_dof_values), {}, 1); 104 | } 105 | } 106 | _num_owned_boundary_dofs = dof_indices.size(); 107 | _num_boundary_dofs = _num_owned_boundary_dofs + ghost_dof_indices.size(); 108 | dof_indices.insert(dof_indices.end(), ghost_dof_indices.begin(), ghost_dof_indices.end()); 109 | // Allocate device-side storage for dof markers 110 | if (dof_markers && _num_dofs > 0) { 111 | size_t ddof_markers_size = _num_dofs * sizeof(char); 112 | cuda_err = cuMemAlloc(&_ddof_markers, ddof_markers_size); 113 | if (cuda_err != CUDA_SUCCESS) { 114 | delete[] dof_markers; 115 | cuGetErrorString(cuda_err, &cuda_err_description); 116 | throw std::runtime_error( 117 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 118 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 119 | } 120 | 121 | // Copy dof markers to device 122 | cuda_err = cuMemcpyHtoD( 123 | _ddof_markers, dof_markers, ddof_markers_size); 124 | if (cuda_err != CUDA_SUCCESS) { 125 | cuMemFree(_ddof_markers); 126 | delete[] dof_markers; 127 | cuGetErrorString(cuda_err, &cuda_err_description); 128 | throw std::runtime_error( 129 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 130 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 131 | } 132 | } 133 | if (dof_markers) 134 | delete[] dof_markers; 135 | 136 | // Allocate device-side storage for dof indices 137 | if (_num_boundary_dofs > 0) { 138 | size_t ddof_indices_size = dof_indices.size() * sizeof(std::int32_t); 139 | cuda_err = cuMemAlloc(&_ddof_indices, ddof_indices_size); 140 | if (cuda_err != CUDA_SUCCESS) { 141 | if (_ddof_markers) 142 | cuMemFree(_ddof_markers); 143 | cuGetErrorString(cuda_err, &cuda_err_description); 144 | throw std::runtime_error( 145 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 146 | " at " + std::string(__FILE__) + ":" +
std::to_string(__LINE__)); 147 | } 148 | 149 | // Copy dof indices to device 150 | cuda_err = cuMemcpyHtoD( 151 | _ddof_indices, dof_indices.data(), ddof_indices_size); 152 | if (cuda_err != CUDA_SUCCESS) { 153 | cuMemFree(_ddof_indices); 154 | if (_ddof_markers) 155 | cuMemFree(_ddof_markers); 156 | cuGetErrorString(cuda_err, &cuda_err_description); 157 | throw std::runtime_error( 158 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 159 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 160 | } 161 | } 162 | 163 | 164 | // Allocate device-side storage for dof values 165 | if (dof_markers && _num_dofs > 0) { 166 | size_t ddof_values_size = _num_dofs * sizeof(T); 167 | cuda_err = cuMemAlloc(&_ddof_values, ddof_values_size); 168 | if (cuda_err != CUDA_SUCCESS) { 169 | if (_ddof_indices) 170 | cuMemFree(_ddof_indices); 171 | if (_ddof_markers) 172 | cuMemFree(_ddof_markers); 173 | cuGetErrorString(cuda_err, &cuda_err_description); 174 | throw std::runtime_error( 175 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 176 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 177 | } 178 | 179 | // Copy dof values to device 180 | cuda_err = cuMemcpyHtoD( 181 | _ddof_values, _dof_values.data(), ddof_values_size); 182 | if (cuda_err != CUDA_SUCCESS) { 183 | cuMemFree(_ddof_values); 184 | if (_ddof_indices) 185 | cuMemFree(_ddof_indices); 186 | if (_ddof_markers) 187 | cuMemFree(_ddof_markers); 188 | cuGetErrorString(cuda_err, &cuda_err_description); 189 | throw std::runtime_error( 190 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 191 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 192 | } 193 | } 194 | } 195 | //----------------------------------------------------------------------------- 196 | /// Destructor 197 | ~CUDADirichletBC() 198 | { 199 | if (_ddof_values) 200 | cuMemFree(_ddof_values); 201 | if (_ddof_indices) 202 | cuMemFree(_ddof_indices); 203 | if (_ddof_markers) 204 | cuMemFree(_ddof_markers); 205 | } 206 | //----------------------------------------------------------------------------- 207 | /// Copy constructor 208 | /// @param[in] bc The object to be copied 209 | CUDADirichletBC(const CUDADirichletBC& bc) = delete; 210 | 211 | /// Move constructor 212 | /// @param[in] bc The object to be moved 213 | CUDADirichletBC(CUDADirichletBC&& bc) 214 | : _num_dofs(bc._num_dofs) 215 | , _num_owned_boundary_dofs(bc._num_owned_boundary_dofs) 216 | , _num_boundary_dofs(bc._num_boundary_dofs), _dof_values(std::move(bc._dof_values)) 217 | , _ddof_markers(bc._ddof_markers) 218 | , _ddof_indices(bc._ddof_indices) 219 | , _ddof_values(bc._ddof_values) 220 | { 221 | bc._num_dofs = 0; 222 | bc._num_owned_boundary_dofs = 0; 223 | bc._num_boundary_dofs = 0; 224 | bc._ddof_markers = 0; 225 | bc._ddof_indices = 0; 226 | bc._ddof_values = 0; 227 | } 228 | //----------------------------------------------------------------------------- 229 | /// Assignment operator 230 | /// @param[in] bc Another CUDADirichletBC object 231 | CUDADirichletBC& operator=(const CUDADirichletBC& bc) = delete; 232 | 233 | /// Move assignment operator 234 | /// @param[in] bc Another CUDADirichletBC object 235 | CUDADirichletBC& operator=(CUDADirichletBC&& bc) 236 | { 237 | _num_dofs = bc._num_dofs; 238 | _num_owned_boundary_dofs = bc._num_owned_boundary_dofs; 239 | _num_boundary_dofs = bc._num_boundary_dofs; _dof_values = std::move(bc._dof_values); 240 | _ddof_markers = bc._ddof_markers; 241 | _ddof_indices = bc._ddof_indices; 242 | _ddof_values = bc._ddof_values; 243 | bc._num_dofs = 0; 244 |
bc._num_owned_boundary_dofs = 0; 245 | bc._num_boundary_dofs = 0; 246 | bc._ddof_markers = 0; 247 | bc._ddof_indices = 0; 248 | bc._ddof_values = 0; 249 | return *this; 250 | } 251 | //----------------------------------------------------------------------------- 252 | 253 | /// Update device-side values for all provided boundary conditions 254 | /// The user is responsible for ensuring the provided conditions are in the original list 255 | void update(const std::vector<std::shared_ptr<const DirichletBC<T, U>>>& bcs) { 256 | for (auto const& bc: bcs) { 257 | bc->set(std::span(_dof_values), {}); 258 | } 259 | 260 | CUDA::safeMemcpyHtoD(_ddof_values, _dof_values.data(), _num_dofs * sizeof(T)); 261 | } 262 | 263 | /// Get the number of degrees of freedom 264 | int32_t num_dofs() const { return _num_dofs; } 265 | 266 | /// Get a handle to the device-side dof markers 267 | CUdeviceptr dof_markers() const { return _ddof_markers; } 268 | 269 | /// Get the number of owned degrees of freedom subject to boundary 270 | /// conditions 271 | int32_t num_owned_boundary_dofs() const { return _num_owned_boundary_dofs; } 272 | 273 | /// Get the number of degrees of freedom subject to boundary 274 | /// conditions 275 | int32_t num_boundary_dofs() const { return _num_boundary_dofs; } 276 | 277 | /// Get a handle to the device-side dof indices 278 | CUdeviceptr dof_indices() const { return _ddof_indices; } 279 | 280 | /// Get a handle to the device-side dofs for the values 281 | CUdeviceptr dof_value_indices() const { return _ddof_indices; } 282 | 283 | /// Get a handle to the device-side dof values 284 | CUdeviceptr dof_values() const { return _ddof_values; } 285 | 286 | private: 287 | /// The number of degrees of freedom 288 | int32_t _num_dofs; 289 | 290 | /// The number of degrees of freedom owned by the current process 291 | /// that are subject to the essential boundary conditions. 292 | int32_t _num_owned_boundary_dofs; 293 | 294 | /// The number of degrees of freedom that are subject to the 295 | /// essential boundary conditions, including ghost nodes. 296 | int32_t _num_boundary_dofs; 297 | 298 | /// A host-side vector with the values for the boundary conditions 299 | /// Used for cases when the boundary condition values change 300 | std::vector<T> _dof_values; 301 | 302 | /// Markers for each degree of freedom, indicating whether or not 303 | /// they are subject to essential boundary conditions 304 | CUdeviceptr _ddof_markers; 305 | 306 | /// Indices of the degrees of freedom that are subject to essential 307 | /// boundary conditions 308 | CUdeviceptr _ddof_indices; 309 | 310 | /// Device-side boundary condition values for each degree of freedom; 311 | /// only entries subject to essential boundary conditions are meaningful 312 | CUdeviceptr _ddof_values; 313 | }; 314 | 315 | } // namespace fem 316 | } // namespace dolfinx 317 | 318 |
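The constructor and update() above split the work so that the expensive part (building and copying markers and indices) happens once, while value refreshes are cheap. A hedged usage sketch, assuming a CUDA::Context `ctx`, a function space `V`, and host-side conditions `bcs` of type std::vector<std::shared_ptr<const dolfinx::fem::DirichletBC<PetscScalar>>>:

// Sketch only, not part of the library sources.
dolfinx::fem::CUDADirichletBC<PetscScalar> device_bc(ctx, *V, bcs); // markers, indices, values
// ... time-dependent boundary data changes on the host ...
device_bc.update(bcs); // re-packs the values and copies only _dof_values to the device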
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDADofMap.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "CUDADofMap.h" 8 | #include <cudolfinx/common/CUDA.h> 9 | #include <dolfinx/common/IndexMap.h> 10 | #include <dolfinx/fem/DofMap.h> 11 | #include <cuda.h> 12 | 13 | using namespace dolfinx; 14 | using namespace dolfinx::fem; 15 | 16 | //----------------------------------------------------------------------------- 17 | CUDADofMap::CUDADofMap() 18 | : _dofmap(nullptr) 19 | , _num_dofs() 20 | , _num_cells() 21 | , _num_dofs_per_cell(), _block_size() 22 | , _ddofs_per_cell(0) 23 | , _dcells_per_dof_ptr(0) 24 | , _dcells_per_dof(0) 25 | { 26 | } 27 | 28 | CUDADofMap::CUDADofMap( 29 | const dolfinx::fem::DofMap* dofmap) 30 | : CUDADofMap::CUDADofMap(*dofmap, nullptr) 31 | { 32 | } 33 | 34 | CUDADofMap::CUDADofMap( 35 | const dolfinx::fem::DofMap* dofmap, std::map<std::int32_t, std::int32_t>* restriction) 36 | : CUDADofMap::CUDADofMap(*dofmap, restriction) 37 | { 38 | } 39 | 40 | CUDADofMap::CUDADofMap( 41 | const dolfinx::fem::DofMap& dofmap) 42 | : CUDADofMap::CUDADofMap(dofmap, nullptr) 43 | { 44 | } 45 | 46 | //----------------------------------------------------------------------------- 47 | CUDADofMap::CUDADofMap( 48 | const dolfinx::fem::DofMap& dofmap, std::map<std::int32_t, std::int32_t>* restriction) 49 | : _dofmap(&dofmap) 50 | , _num_dofs() 51 | , _num_cells() 52 | , _num_dofs_per_cell() 53 | , _ddofs_per_cell(0) 54 | , _dcells_per_dof_ptr(0) 55 | , _dcells_per_dof(0) 56 | { 57 | CUresult cuda_err; 58 | const char * cuda_err_description; 59 | 60 | auto dofs = dofmap.map(); 61 | auto element_dof_layout = dofmap.element_dof_layout(); 62 | // get block sizes and ensure positivity (sometimes the default is -1) 63 | std::int32_t element_block_size = element_dof_layout.block_size(); 64 | _block_size = dofmap.bs(); 65 | element_block_size = (element_block_size < 0) ? 1 : element_block_size; 66 | _block_size = (_block_size < 0) ?
1 : _block_size; 67 | _num_cells = dofs.extent(0); 68 | _num_dofs_per_cell = element_dof_layout.num_dofs() * element_block_size; 69 | _num_dofs = dofs.size() * _block_size; 70 | if (_num_dofs != _num_cells * _num_dofs_per_cell) { 71 | throw std::runtime_error( 72 | "Num dofs " + std::to_string(_num_dofs) + " != " + std::to_string(_num_cells) + 73 | "*" + std::to_string(_num_dofs_per_cell) + " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 74 | } 75 | // Allocate device-side storage for degrees of freedom 76 | if (_num_cells > 0 && _num_dofs_per_cell > 0) { 77 | size_t ddofs_per_cell_size = _num_dofs * sizeof(int32_t); 78 | cuda_err = cuMemAlloc( 79 | &_ddofs_per_cell, 80 | ddofs_per_cell_size); 81 | if (cuda_err != CUDA_SUCCESS) { 82 | cuGetErrorString(cuda_err, &cuda_err_description); 83 | throw std::runtime_error( 84 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 85 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 86 | } 87 | } 88 | update(restriction); 89 | 90 | // cells_per_dof_ptr and cells_per_dof are only used for 91 | // lookup table computations, which currently aren't in use 92 | /* 93 | // Compute mapping from degrees of freedom to cells 94 | std::vector<std::int32_t> cells_per_dof_ptr(_num_dofs+1); 95 | 96 | // Count the number of cells containing each degree of freedom 97 | for (int32_t i = 0; i < _num_cells; i++) { 98 | auto cell_dofs = dofmap.cell_dofs(i); 99 | for (int32_t l = 0; l < cell_dofs.size(); l++) { 100 | int32_t j = cell_dofs[l]; 101 | cells_per_dof_ptr[j+1]++; 102 | } 103 | } 104 | 105 | // Compute offset to the first cell for each degree of freedom 106 | for (int32_t i = 0; i < _num_dofs; i++) 107 | cells_per_dof_ptr[i+1] += cells_per_dof_ptr[i]; 108 | int32_t num_dof_cells = cells_per_dof_ptr[_num_dofs]; 109 | if (num_dof_cells != _num_cells * _num_dofs_per_cell) { 110 | cuMemFree(_ddofs_per_cell); 111 | throw std::logic_error( 112 | "Expected " + std::to_string(_num_cells) + " cells, " + 113 | std::to_string(_num_dofs_per_cell) + " degrees of freedom per cell, " 114 | "but the mapping from degrees of freedom to cells contains " + 115 | std::to_string(num_dof_cells) + " values" ); 116 | } 117 | 118 | // Allocate storage for and compute the cells containing each degree 119 | // of freedom 120 | std::vector<std::int32_t> cells_per_dof(num_dof_cells); 121 | for (int32_t i = 0; i < _num_cells; i++) { 122 | auto cell_dofs = dofmap.cell_dofs(i); 123 | for (int32_t l = 0; l < cell_dofs.size(); l++) { 124 | int32_t j = cell_dofs[l]; 125 | int32_t p = cells_per_dof_ptr[j]; 126 | cells_per_dof[p] = i; 127 | cells_per_dof_ptr[j]++; 128 | } 129 | } 130 | 131 | // Adjust offsets to first cell 132 | for (int32_t i = _num_dofs; i > 0; i--) 133 | cells_per_dof_ptr[i] = cells_per_dof_ptr[i-1]; 134 | cells_per_dof_ptr[0] = 0; 135 | 136 | // Allocate device-side storage for offsets to the first cell 137 | // containing each degree of freedom 138 | if (_num_dofs > 0) { 139 | size_t dcells_per_dof_ptr_size = (_num_dofs+1) * sizeof(int32_t); 140 | cuda_err = cuMemAlloc( 141 | &_dcells_per_dof_ptr, dcells_per_dof_ptr_size); 142 | if (cuda_err != CUDA_SUCCESS) { 143 | cuGetErrorString(cuda_err, &cuda_err_description); 144 | cuMemFree(_ddofs_per_cell); 145 | throw std::runtime_error( 146 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 147 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 148 | } 149 | 150 | // Copy cell degrees of freedom to device 151 | cuda_err = cuMemcpyHtoD( 152 | _dcells_per_dof_ptr,
cells_per_dof_ptr.data(), dcells_per_dof_ptr_size); 153 | if (cuda_err != CUDA_SUCCESS) { 154 | cuGetErrorString(cuda_err, &cuda_err_description); 155 | cuMemFree(_dcells_per_dof_ptr); 156 | cuMemFree(_ddofs_per_cell); 157 | throw std::runtime_error( 158 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 159 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 160 | } 161 | } 162 | 163 | // Allocate device-side storage for cells containing each degree of freedom 164 | if (_num_cells > 0 && _num_dofs_per_cell > 0) { 165 | size_t dcells_per_dof_size = num_dof_cells * sizeof(int32_t); 166 | cuda_err = cuMemAlloc( 167 | &_dcells_per_dof, 168 | dcells_per_dof_size); 169 | if (cuda_err != CUDA_SUCCESS) { 170 | cuGetErrorString(cuda_err, &cuda_err_description); 171 | cuMemFree(_dcells_per_dof_ptr); 172 | cuMemFree(_ddofs_per_cell); 173 | throw std::runtime_error( 174 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 175 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 176 | } 177 | 178 | // Copy cell degrees of freedom to device 179 | cuda_err = cuMemcpyHtoD( 180 | _dcells_per_dof, cells_per_dof.data(), dcells_per_dof_size); 181 | if (cuda_err != CUDA_SUCCESS) { 182 | cuGetErrorString(cuda_err, &cuda_err_description); 183 | cuMemFree(_dcells_per_dof); 184 | cuMemFree(_dcells_per_dof_ptr); 185 | cuMemFree(_ddofs_per_cell); 186 | throw std::runtime_error( 187 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 188 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 189 | } 190 | }*/ 191 | } 192 | //----------------------------------------------------------------------------- 193 | void CUDADofMap::update(std::map<std::int32_t, std::int32_t>* restriction) 194 | { 195 | std::vector<std::int32_t> unrolled_dofs; 196 | const std::int32_t* dofs_per_cell, *dofs_orig; 197 | auto dofs = _dofmap->map(); 198 | dofs_orig = dofs.data_handle(); 199 | 200 | if (restriction) { 201 | unrolled_dofs.resize(_num_dofs); 202 | for (std::size_t i = 0; i < dofs.size(); i++) { 203 | const std::int32_t dof = dofs_orig[i]; 204 | if (restriction->find(dof) != restriction->end()) { 205 | std::int32_t mapped_dof = (*restriction)[dof]; 206 | for (int j = 0; j < _block_size; j++) 207 | unrolled_dofs[i*_block_size + j] = mapped_dof*_block_size + j; 208 | } 209 | else { 210 | for (int j = 0; j < _block_size; j++) 211 | unrolled_dofs[i*_block_size + j] = -1; // we should not be using this degree of freedom 212 | } 213 | } 214 | dofs_per_cell = unrolled_dofs.data(); 215 | } 216 | else if (_block_size == 1) { 217 | dofs_per_cell = dofs_orig; 218 | } 219 | else { 220 | unrolled_dofs.resize(_num_dofs); 221 | for (std::size_t i = 0; i < _num_dofs; i++) 222 | unrolled_dofs[i] = _block_size*dofs_orig[i/_block_size] + i%_block_size; 223 | 224 | dofs_per_cell = unrolled_dofs.data(); 225 | } 226 | 227 | // Copy cell degrees of freedom to device 228 | if (_num_cells > 0 && _num_dofs_per_cell > 0) { 229 | size_t ddofs_per_cell_size = _num_dofs * sizeof(int32_t); 230 | CUDA::safeMemcpyHtoD(_ddofs_per_cell, dofs_per_cell, ddofs_per_cell_size); 231 | } 232 | 233 | } 234 | //----------------------------------------------------------------------------- 235 | CUDADofMap::~CUDADofMap() 236 | { 237 | if (_dcells_per_dof) 238 | cuMemFree(_dcells_per_dof); 239 | if (_dcells_per_dof_ptr) 240 | cuMemFree(_dcells_per_dof_ptr); 241 | if (_ddofs_per_cell) 242 | cuMemFree(_ddofs_per_cell); 243 | } 244 | //----------------------------------------------------------------------------- 245 |
CUDADofMap::CUDADofMap(CUDADofMap&& dofmap) 246 | : _dofmap(dofmap._dofmap) 247 | , _num_dofs(dofmap._num_dofs) 248 | , _num_cells(dofmap._num_cells) 249 | , _num_dofs_per_cell(dofmap._num_dofs_per_cell), _block_size(dofmap._block_size) 250 | , _ddofs_per_cell(dofmap._ddofs_per_cell) 251 | , _dcells_per_dof_ptr(dofmap._dcells_per_dof_ptr) 252 | , _dcells_per_dof(dofmap._dcells_per_dof) 253 | { 254 | dofmap._dofmap = nullptr; 255 | dofmap._num_dofs = 0; 256 | dofmap._num_cells = 0; 257 | dofmap._num_dofs_per_cell = 0; dofmap._block_size = 0; 258 | dofmap._ddofs_per_cell = 0; 259 | dofmap._dcells_per_dof_ptr = 0; 260 | dofmap._dcells_per_dof = 0; 261 | } 262 | //----------------------------------------------------------------------------- 263 | CUDADofMap& CUDADofMap::operator=(CUDADofMap&& dofmap) 264 | { 265 | _dofmap = dofmap._dofmap; 266 | _num_dofs = dofmap._num_dofs; 267 | _num_cells = dofmap._num_cells; 268 | _num_dofs_per_cell = dofmap._num_dofs_per_cell; _block_size = dofmap._block_size; 269 | _ddofs_per_cell = dofmap._ddofs_per_cell; 270 | _dcells_per_dof_ptr = dofmap._dcells_per_dof_ptr; 271 | _dcells_per_dof = dofmap._dcells_per_dof; 272 | dofmap._dofmap = nullptr; 273 | dofmap._num_dofs = 0; 274 | dofmap._num_cells = 0; 275 | dofmap._num_dofs_per_cell = 0; dofmap._block_size = 0; 276 | dofmap._ddofs_per_cell = 0; 277 | dofmap._dcells_per_dof_ptr = 0; 278 | dofmap._dcells_per_dof = 0; 279 | return *this; 280 | } 281 | //----------------------------------------------------------------------------- 282 |
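The unrolling branch of update() above turns blocked dof indices into scalar indices: entry i of the unrolled array reads blocked dof i/B and adds the component i%B, for block size B. A small self-contained instance with block size 3 (for example, a 3D vector element):

// Sketch only: a worked example of the unrolling arithmetic.
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  // Blocked cell dofs {7, 2} with block size 3 unroll to {21, 22, 23, 6, 7, 8}.
  std::vector<std::int32_t> dofs_orig = {7, 2};
  const std::int32_t bs = 3;
  std::vector<std::int32_t> unrolled(dofs_orig.size() * bs);
  for (std::size_t i = 0; i < unrolled.size(); i++)
    unrolled[i] = bs * dofs_orig[i / bs] + i % bs;
  assert((unrolled == std::vector<std::int32_t>{21, 22, 23, 6, 7, 8}));
  return 0;
}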
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDADofMap.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cuda.h> 10 | #include <cstdint> 11 | #include <map> 12 | 13 | namespace dolfinx { 14 | namespace fem { 15 | class DofMap; 16 | 17 | /// A wrapper for a cellwise-to-global mapping of degrees of freedom 18 | /// that is stored in the device memory of a CUDA device. 19 | class CUDADofMap 20 | { 21 | public: 22 | /// Create an empty dofmap 23 | CUDADofMap(); 24 | 25 | /// Create a dofmap 26 | /// 27 | /// @param[in] dofmap The dofmap to copy to device memory 28 | CUDADofMap(const dolfinx::fem::DofMap& dofmap, std::map<std::int32_t, std::int32_t>* restriction); 29 | 30 | // constructors without restriction 31 | CUDADofMap(const dolfinx::fem::DofMap* dofmap); 32 | 33 | CUDADofMap(const dolfinx::fem::DofMap& dofmap); 34 | 35 | /// Alternate constructor 36 | CUDADofMap(const dolfinx::fem::DofMap* dofmap, std::map<std::int32_t, std::int32_t>* restriction); 37 | 38 | /// Destructor 39 | ~CUDADofMap(); 40 | 41 | /// Copy constructor 42 | /// @param[in] dofmap The object to be copied 43 | CUDADofMap(const CUDADofMap& dofmap) = delete; 44 | 45 | /// Move constructor 46 | /// @param[in] dofmap The object to be moved 47 | CUDADofMap(CUDADofMap&& dofmap); 48 | 49 | /// Assignment operator 50 | /// @param[in] dofmap Another CUDADofMap object 51 | CUDADofMap& operator=(const CUDADofMap& dofmap) = delete; 52 | 53 | /// Move assignment operator 54 | /// @param[in] dofmap Another CUDADofMap object 55 | CUDADofMap& operator=(CUDADofMap&& dofmap); 56 | 57 | /// Update the dofmap on the device, possibly with a new restriction 58 | void update(std::map<std::int32_t, std::int32_t>* restriction); 59 | 60 | /// Get the underlying dofmap on the host 61 | const dolfinx::fem::DofMap* dofmap() const { return _dofmap; } 62 | 63 | /// Get the number of degrees of freedom 64 | int32_t num_dofs() const { return _num_dofs; } 65 | 66 | /// Get the number of cells 67 | int32_t num_cells() const { return _num_cells; } 68 | 69 | /// Get the number of dofs per cell 70 | int32_t num_dofs_per_cell() const { 71 | return _num_dofs_per_cell; } 72 | 73 | /// Get a handle to the device-side dofs of each cell 74 | CUdeviceptr dofs_per_cell() const { 75 | return _ddofs_per_cell; } 76 | 77 | /// Get the offsets to the first cell containing each degree of freedom 78 | CUdeviceptr cells_per_dof_ptr() const { 79 | return _dcells_per_dof_ptr; } 80 | 81 | /// Get the cells containing each degree of freedom 82 | CUdeviceptr cells_per_dof() const { 83 | return _dcells_per_dof; } 84 | 85 | private: 86 | /// The underlying dofmap on the host 87 | const dolfinx::fem::DofMap* _dofmap; 88 | 89 | /// The number of degrees of freedom 90 | int32_t _num_dofs; 91 | 92 | /// The number of cells in the mesh 93 | int32_t _num_cells; 94 | 95 | /// The number of degrees of freedom in each cell 96 | int32_t _num_dofs_per_cell; 97 | 98 | /// The block size 99 | int32_t _block_size; 100 | 101 | /// The degrees of freedom of each cell 102 | CUdeviceptr _ddofs_per_cell; 103 | 104 | /// Offsets to the first cell containing each degree of freedom 105 | CUdeviceptr _dcells_per_dof_ptr; 106 | 107 | /// The cells containing each degree of freedom 108 | CUdeviceptr _dcells_per_dof; 109 | }; 110 | 111 | } // namespace fem 112 | } // namespace dolfinx 113 | 114 |
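A sketch of the restriction mechanism declared above, assuming `dm` is a const dolfinx::fem::DofMap* for the host dofmap: dofs present in the map are renumbered, and all others become -1 and are skipped by the assembly kernels.

// Sketch only: keep only host dofs 0 and 2, renumbering them to 0 and 1.
std::map<std::int32_t, std::int32_t> restriction = {{0, 0}, {2, 1}};
dolfinx::fem::CUDADofMap cuda_dofmap(dm, &restriction);
// The restriction can later be changed without reallocating device storage:
cuda_dofmap.update(&restriction);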
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDAForm.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cudolfinx/common/CUDAStore.h> 11 | #include <cudolfinx/fem/CUDADirichletBC.h> 12 | #include <cudolfinx/fem/CUDADofMap.h> 13 | #include <cudolfinx/fem/CUDAFormCoefficients.h> 14 | #include <cudolfinx/fem/CUDAFormConstants.h> 15 | #include <cudolfinx/fem/CUDAFormIntegral.h> 16 | #include <dolfinx/fem/Form.h> 17 | #include <ufcx.h> 18 | #include <map> 19 | #include <memory> 20 | #include <string> 21 | #include <vector> 22 | 23 | namespace dolfinx { 24 | 25 | namespace fem { 26 | 27 | /// Consolidates all form classes into one 28 | template <typename T, 29 | typename U = dolfinx::scalar_value_type_t<T>> 30 | class CUDAForm 31 | { 32 | 33 | public: 34 | /// Create GPU copies of data needed for assembly 35 | /// 36 | /// @param[in] cuda_context A context for a CUDA device 37 | /// @param[in] form Pointer to the variational form 38 | CUDAForm( 39 | const CUDA::Context& cuda_context, 40 | Form<T, U>* form, 41 | ufcx_form* ufcx_form, 42 | std::vector<std::string>& tabulate_tensor_names, 43 | std::vector<std::string>& tabulate_tensor_sources 44 | ) 45 | : _coefficients(cuda_context, form, _dofmap_store) 46 | , _constants(cuda_context, form) 47 | , _form(form) 48 | , _ufcx_form(ufcx_form) 49 | , _compiled(false) 50 | { 51 | _coefficients = CUDAFormCoefficients<T, U>(cuda_context, form, _dofmap_store); 52 | const int* integral_offsets = ufcx_form->form_integral_offsets; 53 | if (integral_offsets[3] != tabulate_tensor_names.size()) { 54 | throw std::runtime_error("UFCx form has " + std::to_string(integral_offsets[3]) 55 | + " integrals, but only " + std::to_string(tabulate_tensor_names.size()) 56 | + " tabulate tensor sources provided to CUDAForm!" 57 | ); 58 | } 59 | for (int i = 0; i < 3; i++) { 60 | for (int offset = integral_offsets[i]; offset < integral_offsets[i+1]; offset++) { 61 | int id = ufcx_form->form_integral_ids[offset]; 62 | _cuda_integrals[i].insert({id, {tabulate_tensor_names[offset], tabulate_tensor_sources[offset]}}); 63 | } 64 | } 65 | } 66 | 67 | /// Compile form on GPU 68 | /// Under the hood, this creates the integrals 69 | void compile( 70 | const CUDA::Context& cuda_context, 71 | int32_t max_threads_per_block, 72 | int32_t min_blocks_per_multiprocessor, 73 | enum assembly_kernel_type assembly_kernel_type) 74 | { 75 | auto cujit_target = CUDA::get_cujit_target(cuda_context); 76 | _integrals = cuda_form_integrals( 77 | cuda_context, cujit_target, *_form, _cuda_integrals, assembly_kernel_type, 78 | max_threads_per_block, min_blocks_per_multiprocessor, false, NULL, false); 79 | _compiled = true; 80 | } 81 | 82 | /// Copy constructor 83 | CUDAForm(const CUDAForm& form) = delete; 84 | 85 | /// Move constructor 86 | CUDAForm(CUDAForm&& form) = default; 87 | 88 | /// Destructor 89 | virtual ~CUDAForm() = default; 90 | 91 | bool compiled() { return _compiled; } 92 | 93 | bool restricted() { return _restricted_dofmaps.size() > 0; } 94 | 95 | std::map<IntegralType, std::vector<CUDAFormIntegral<T, U>>>& integrals() { 96 | if (!_compiled) { 97 | throw std::runtime_error("Cannot access integrals for uncompiled cuda form!"); 98 | } 99 | return _integrals; 100 | } 101 | 102 | CUDAFormCoefficients<T, U>& coefficients() { return _coefficients; } 103 | 104 | const CUDAFormConstants<T>& constants() { return _constants; } 105 | 106 | std::shared_ptr<CUDADofMap> unrestricted_dofmap(size_t i) { 107 | if (i >= _form->function_spaces().size()) throw std::runtime_error("Dofmap index out of bounds!"); 108 | return _dofmap_store.get_device_object(_form->function_spaces()[i]->dofmap().get()); 109 | } 110 | 111 | std::shared_ptr<CUDADofMap> dofmap(size_t i) { 112 | if (!restricted()) return unrestricted_dofmap(i); 113 | if (i >= _restricted_dofmaps.size()) throw std::runtime_error("Dofmap index out of bounds!"); 114 | return _restricted_dofmaps[i]; 115 | } 116 | 117 | Form<T, U>* form() {
return _form; } 118 | 119 | CUDADirichletBC<T, U> bc( 120 | const CUDA::Context& cuda_context, size_t i, 121 | std::vector<std::shared_ptr<const DirichletBC<T, U>>> bcs) 122 | { 123 | return CUDADirichletBC<T, U>(cuda_context, *_form->function_spaces()[i], bcs); 124 | } 125 | 126 | /// Copy the coefficient and constant data to the device 127 | /// This can be necessary if either changes on the host 128 | void to_device(const CUDA::Context& cuda_context) 129 | { 130 | _coefficients.copy_coefficients_to_device(cuda_context); 131 | _constants.update_constant_values(); 132 | } 133 | 134 | void set_restriction(std::vector<std::shared_ptr<std::map<std::int32_t, std::int32_t>>> restriction) 135 | { 136 | if (restriction.size() != _form->function_spaces().size()) { 137 | throw std::runtime_error("Number of restrictions must equal arity of form (1 for vector, 2 for matrix)!"); 138 | } 139 | 140 | if (_restricted_dofmaps.size()) { 141 | // need to update the restriction 142 | for (int i = 0; i < _restricted_dofmaps.size(); i++) { 143 | _restricted_dofmaps[i]->update(restriction[i].get()); 144 | } 145 | } 146 | else { 147 | for (int i = 0; i < restriction.size(); i++) { 148 | _restricted_dofmaps.push_back( 149 | std::make_shared<CUDADofMap>( 150 | _form->function_spaces()[i]->dofmap().get(), 151 | restriction[i].get() 152 | ) 153 | ); 154 | } 155 | } 156 | } 157 | 158 | private: 159 | // Cache of CUDADofMaps 160 | common::CUDAStore<DofMap, CUDADofMap> _dofmap_store; 161 | // Restricted dofmaps 162 | std::vector<std::shared_ptr<CUDADofMap>> _restricted_dofmaps; 163 | // Form coefficients 164 | CUDAFormCoefficients<T, U> _coefficients; 165 | // Form Constants 166 | CUDAFormConstants<T> _constants; 167 | // Compiled CUDA kernels 168 | std::map<IntegralType, std::vector<CUDAFormIntegral<T, U>>> _integrals; 169 | // CUDA tabulate tensors 170 | std::array<std::map<int, std::pair<std::string, std::string>>, 4> _cuda_integrals; 171 | bool _compiled; 172 | Form<T, U>* _form; 173 | ufcx_form* _ufcx_form; 174 | }; 175 | 176 | } // end namespace fem 177 | 178 | } // end namespace dolfinx 179 |
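A typical lifecycle for the class above, sketched under the assumption that the caller has already JIT-compiled the UFCx form and collected the tabulate_tensor kernel names and CUDA sources (the Python layer in python/cudolfinx/form.py plays this role); `ctx`, `form`, `ufcx_form`, `names`, `sources` and `kernel_type` are all assumed inputs:

// Sketch only, not part of the library sources.
dolfinx::fem::CUDAForm<PetscScalar> cuda_form(ctx, form, ufcx_form, names, sources);
cuda_form.compile(ctx, /*max_threads_per_block=*/1024,
                  /*min_blocks_per_multiprocessor=*/1, kernel_type);
// ... host-side coefficient or constant data changes ...
cuda_form.to_device(ctx); // push updated coefficients and constants back to the GPU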
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDAFormConstants.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <dolfinx/fem/Form.h> 11 | #include <dolfinx/fem/utils.h> 12 | #include <cuda.h> 13 | 14 | namespace dolfinx { 15 | namespace fem { 16 | 17 | /// A wrapper for a form constant with data that is stored in the 18 | /// device memory of a CUDA device. 19 | template <typename T> 20 | class CUDAFormConstants 21 | { 22 | public: 23 | 24 | /// Create an empty collection of constant values 25 | CUDAFormConstants() 26 | : _form(nullptr) 27 | , _num_constant_values() 28 | , _dconstant_values(0) 29 | { 30 | } 31 | //----------------------------------------------------------------------------- 32 | /// Create a collection of constant values from a given form 33 | /// 34 | /// @param[in] cuda_context A context for a CUDA device 35 | /// @param[in] form The variational form whose constants are used 36 | CUDAFormConstants( 37 | const CUDA::Context& cuda_context, 38 | const Form<T>* form) 39 | : _form(form) 40 | , _num_constant_values() 41 | , _dconstant_values(0) 42 | { 43 | CUresult cuda_err; 44 | const char * cuda_err_description; 45 | 46 | const std::vector<T> 47 | constant_values = pack_constants(*_form); 48 | 49 | // Allocate device-side storage for constant values 50 | _num_constant_values = constant_values.size(); 51 | if (_num_constant_values > 0) { 52 | size_t dconstant_values_size = 53 | _num_constant_values * sizeof(T); 54 | cuda_err = cuMemAlloc( 55 | &_dconstant_values, dconstant_values_size); 56 | if (cuda_err != CUDA_SUCCESS) { 57 | cuGetErrorString(cuda_err, &cuda_err_description); 58 | throw std::runtime_error( 59 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 60 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 61 | } 62 | 63 | // Copy constant values to device 64 | cuda_err = cuMemcpyHtoD( 65 | _dconstant_values, constant_values.data(), dconstant_values_size); 66 | if (cuda_err != CUDA_SUCCESS) { 67 | cuMemFree(_dconstant_values); 68 | cuGetErrorString(cuda_err, &cuda_err_description); 69 | throw std::runtime_error( 70 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 71 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 72 | } 73 | } 74 | } 75 | //----------------------------------------------------------------------------- 76 | /// Destructor 77 | ~CUDAFormConstants() 78 | { 79 | if (_dconstant_values) 80 | cuMemFree(_dconstant_values); 81 | } 82 | //----------------------------------------------------------------------------- 83 | /// Copy constructor 84 | /// @param[in] form_constant The object to be copied 85 | CUDAFormConstants(const CUDAFormConstants& form_constant) = delete; 86 | 87 | /// Move constructor 88 | /// @param[in] form_constant The object to be moved 89 | CUDAFormConstants(CUDAFormConstants&& constants) 90 | : _form(constants._form) 91 | , _num_constant_values(constants._num_constant_values) 92 | , _dconstant_values(constants._dconstant_values) 93 | { 94 | constants._form = nullptr; 95 | constants._num_constant_values = 0; 96 | constants._dconstant_values = 0; 97 | } 98 | //----------------------------------------------------------------------------- 99 | /// Assignment operator 100 | /// @param[in] form_constant Another CUDAFormConstants object 101 | CUDAFormConstants& operator=(const CUDAFormConstants& form_constant) = delete; 102 | 103 | /// Move assignment operator 104 | /// @param[in] form_constant Another CUDAFormConstants object 105 | CUDAFormConstants& operator=(CUDAFormConstants&& constants) 106 | { 107 | _form = constants._form; 108 | _num_constant_values = constants._num_constant_values; 109 | _dconstant_values = constants._dconstant_values; 110 | constants._form = nullptr; 111 | constants._num_constant_values = 0; 112 | constants._dconstant_values = 0; 113 | return *this; 114 | } 115 |
//----------------------------------------------------------------------------- 116 | /// Get the number of constant values 117 | int32_t num_constant_values() const { return _num_constant_values; } 118 | 119 | /// Get a handle to the device-side constant values 120 | CUdeviceptr constant_values() const { return _dconstant_values; } 121 | 122 | /// Update the constant values by copying values from host to device 123 | void update_constant_values() const 124 | { 125 | CUresult cuda_err; 126 | const char * cuda_err_description; 127 | 128 | // Pack constants into an array 129 | const std::vector<T> 130 | constant_values = pack_constants(*_form); 131 | assert(_num_constant_values == constant_values.size()); 132 | 133 | // Copy constant values to device 134 | if (_num_constant_values > 0) { 135 | size_t dconstant_values_size = 136 | _num_constant_values * sizeof(T); 137 | cuda_err = cuMemcpyHtoD( 138 | _dconstant_values, constant_values.data(), dconstant_values_size); 139 | if (cuda_err != CUDA_SUCCESS) { 140 | cuMemFree(_dconstant_values); 141 | cuGetErrorString(cuda_err, &cuda_err_description); 142 | throw std::runtime_error( 143 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 144 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 145 | } 146 | } 147 | } 148 | //----------------------------------------------------------------------------- 149 | 150 | 151 | private: 152 | // The form that the constants apply to 153 | const Form<T>* _form; 154 | 155 | /// The number of constant values 156 | int32_t _num_constant_values; 157 | 158 | /// The constant values 159 | CUdeviceptr _dconstant_values; 160 | }; 161 | 162 | } // namespace fem 163 | } // namespace dolfinx 164 | 165 |
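Because update_constant_values() re-packs from the host form each time, mutating a dolfinx::fem::Constant on the host is enough; the device copy is then refreshed explicitly. A minimal sketch, assuming `c` is a Constant referenced by the form wrapped in `cuda_form`:

// Sketch only: Constant stores its flattened host-side data in `value`.
c->value[0] = 2.5;
cuda_form.constants().update_constant_values(); // one cuMemcpyHtoD of the packed array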
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/la/CUDAMatrix.h> 10 | #include <cudolfinx/la/petsc.h> 11 | #include <dolfinx/common/IndexMap.h> 12 | #include <dolfinx/fem/Form.h> 13 | #include <dolfinx/fem/utils.h> 14 | #include <dolfinx/la/SparsityPattern.h> 15 | #include <dolfinx/la/petsc.h> 16 | #include <dolfinx/mesh/Mesh.h> 17 | #include <petscmat.h> 18 | #include <map> 19 | #include <memory> 20 | #include <vector> 21 | 22 | namespace dolfinx::fem 23 | { 24 | 25 | namespace petsc 26 | { 27 | 28 | template <typename T> 29 | Mat create_cuda_matrix(const Form<T>& a) 30 | { 31 | la::SparsityPattern pattern = fem::create_sparsity_pattern(a); 32 | pattern.finalize(); 33 | return la::petsc::create_cuda_matrix(a.mesh()->comm(), pattern); 34 | } 35 | 36 | } // namespace petsc 37 | } // namespace dolfinx::fem 38 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_la 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMatrix.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDASeqMatrix.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAVector.h 5 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.h 6 | PARENT_SCOPE 7 | ) 8 | 9 | target_sources( 10 | cudolfinx 11 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMatrix.cpp 12 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDASeqMatrix.cpp 13 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAVector.cpp 14 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.cpp 15 | ) 16 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAMatrix.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "CUDAMatrix.h" 8 | #include <cudolfinx/common/CUDA.h> 9 | #include <cudolfinx/la/CUDASeqMatrix.h> 10 | #include <cudolfinx/la/petsc.h> 11 | #include <dolfinx/la/petsc.h> 12 | #include <petscis.h> 13 | #include <petscmat.h> 14 | #include <algorithm> 15 | #include <iostream> 16 | 17 | using namespace dolfinx; 18 | using namespace dolfinx::la; 19 | 20 | //----------------------------------------------------------------------------- 21 | CUDAMatrix::CUDAMatrix() 22 | : _A(nullptr) 23 | , _diag() 24 | , _offdiag() 25 | , _dcolmap(0), _dcolmap_sorted(0), _dcolmap_sorted_indices(0) 26 | , _num_rows() 27 | , _num_columns() 28 | , _local_row_start() 29 | , _local_row_end() 30 | , _num_local_rows() 31 | , _num_local_columns() 32 | , _num_local_offdiag_columns() 33 | { 34 | } 35 | //----------------------------------------------------------------------------- 36 | CUDAMatrix::CUDAMatrix( 37 | const CUDA::Context& cuda_context, 38 | Mat A, 39 | bool page_lock_values, 40 | bool use_seqaijcusparsegetarray) 41 | : _A(A) 42 | , _diag() 43 | , _offdiag() 44 | , _dcolmap(0), _dcolmap_sorted(0), _dcolmap_sorted_indices(0) 45 | , _num_rows() 46 | , _num_columns() 47 | , _local_row_start() 48 | , _local_row_end() 49 | , _num_local_rows() 50 | , _num_local_columns() 51 | , _num_local_offdiag_columns() 52 | { 53 | PetscErrorCode ierr; 54 | CUresult cuda_err; 55 | const char * cuda_err_description; 56 | 57 | // Check the type of matrix 58 | MatType matrix_type; 59 | ierr = MatGetType(A, &matrix_type); 60 | if (ierr != 0) 61 | la::petsc::error(ierr, __FILE__, "MatGetType"); 62 | 63 | // Get the number of matrix rows and columns 64 | ierr = MatGetSize(A, &_num_rows, &_num_columns); 65 | if (ierr != 0) 66 | la::petsc::error(ierr, __FILE__, "MatGetSize"); 67 | 68 | // Get the number of rows and columns owned by the current MPI process 69 | ierr = MatGetLocalSize(A, &_num_local_rows, &_num_local_columns); 70 | if (ierr != 0) 71 | la::petsc::error(ierr, __FILE__, "MatGetLocalSize"); 72 | 73 | // TODO: We might need to do some additional work to handle non-zero 74 | //
local_row_start. 75 | ierr = MatGetOwnershipRange(A, &_local_row_start, &_local_row_end); 76 | if (ierr != 0) 77 | la::petsc::error(ierr, __FILE__, "MatGetOwnershipRange"); 78 | 79 | if (strcmp(matrix_type, MATSEQAIJ) == 0 || 80 | strcmp(matrix_type, MATSEQAIJCUSPARSE) == 0) 81 | { 82 | // A non-distributed matrix only has a diagonal part 83 | _diag = std::make_unique<CUDASeqMatrix>( 84 | cuda_context, A, page_lock_values, use_seqaijcusparsegetarray); 85 | } else if (strcmp(matrix_type, MATMPIAIJ) == 0 || 86 | strcmp(matrix_type, MATMPIAIJCUSPARSE) == 0) 87 | { 88 | // For a distributed matrix, we obtain local diagonal and 89 | // off-diagonal blocks using MatMPIAIJGetSeqAIJ(). 90 | Mat Ad, Ao; 91 | const int * colmap; 92 | ierr = MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap); 93 | if (ierr != 0) 94 | la::petsc::error(ierr, __FILE__, "MatMPIAIJGetSeqAIJ"); 95 | _diag = std::make_unique<CUDASeqMatrix>( 96 | cuda_context, Ad, page_lock_values, use_seqaijcusparsegetarray); 97 | _offdiag = std::make_unique<CUDASeqMatrix>( 98 | cuda_context, Ao, page_lock_values, use_seqaijcusparsegetarray); 99 | 100 | // Get the number of columns in the off-diagonal part of the local 101 | // matrix. 102 | ierr = MatGetLocalSize(Ao, NULL, &_num_local_offdiag_columns); 103 | if (ierr != 0) 104 | la::petsc::error(ierr, __FILE__, "MatGetLocalSize"); 105 | 106 | // Convert the column map from global numbering to the 107 | // process-local numbering 108 | ISLocalToGlobalMapping cmapping; 109 | ierr = MatGetLocalToGlobalMapping(A, NULL, &cmapping); 110 | if (ierr != 0) 111 | la::petsc::error(ierr, __FILE__, "MatGetLocalToGlobalMapping"); 112 | 113 | std::vector<int> colmap_local(_num_local_offdiag_columns); 114 | ierr = ISGlobalToLocalMappingApply( 115 | cmapping, IS_GTOLM_MASK, _num_local_offdiag_columns, colmap, 116 | NULL, colmap_local.data()); 117 | if (ierr != 0) 118 | la::petsc::error(ierr, __FILE__, "ISGlobalToLocalMappingApply"); 119 | 120 | // Allocate device-side storage for off-diagonal column map 121 | if (_num_local_offdiag_columns > 0) { 122 | std::vector<std::pair<int, int>> combined; 123 | for (int i = 0; i < colmap_local.size(); i++) { 124 | combined.emplace_back(colmap_local[i], i); 125 | } 126 | std::sort(combined.begin(), combined.end(), 127 | [](const std::pair<int, int>& a, const std::pair<int, int>& b) { 128 | return a.first < b.first; 129 | }); 130 | std::vector<int> colmap_sorted(combined.size()); 131 | std::vector<int> colmap_sorted_indices(combined.size()); 132 | 133 | for (int i = 0; i < combined.size(); i++) { 134 | colmap_sorted[i] = combined[i].first; 135 | colmap_sorted_indices[i] = combined[i].second; 136 | } 137 | 138 | dolfinx::CUDA::safeVectorCreate(&_dcolmap, colmap_local); 139 | dolfinx::CUDA::safeVectorCreate(&_dcolmap_sorted, colmap_sorted); 140 | dolfinx::CUDA::safeVectorCreate(&_dcolmap_sorted_indices, colmap_sorted_indices); 141 | } 142 | 143 | } else { 144 | throw std::runtime_error( 145 | "Unsupported matrix type '" + std::string(matrix_type) + "' " 146 | "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 147 | } 148 | } 149 | //----------------------------------------------------------------------------- 150 | CUDAMatrix::~CUDAMatrix() 151 | { if (_dcolmap_sorted_indices) cuMemFree(_dcolmap_sorted_indices); if (_dcolmap_sorted) cuMemFree(_dcolmap_sorted); if (_dcolmap) cuMemFree(_dcolmap); 152 | } 153 | //----------------------------------------------------------------------------- 154 | CUDAMatrix::CUDAMatrix(CUDAMatrix&& matrix) 155 | : _A(matrix._A) 156 | , _diag(std::move(matrix._diag)) 157 | , _offdiag(std::move(matrix._offdiag)) 158 | , _dcolmap(matrix._dcolmap), _dcolmap_sorted(matrix._dcolmap_sorted), _dcolmap_sorted_indices(matrix._dcolmap_sorted_indices) 159 | , _num_rows(matrix._num_rows) 160 | , _num_columns(matrix._num_columns) 161 | , _local_row_start(matrix._local_row_start) 162 | ,
_local_row_end(matrix._local_row_end) 163 | , _num_local_rows(matrix._num_local_rows) 164 | , _num_local_columns(matrix._num_local_columns) 165 | , _num_local_offdiag_columns(matrix._num_local_offdiag_columns) 166 | { 167 | matrix._A = nullptr; 168 | matrix._diag = nullptr; 169 | matrix._offdiag = nullptr; 170 | matrix._dcolmap = 0; matrix._dcolmap_sorted = 0; matrix._dcolmap_sorted_indices = 0; 171 | matrix._num_rows = 0; 172 | matrix._num_columns = 0; 173 | matrix._local_row_start = 0; 174 | matrix._local_row_end = 0; 175 | matrix._num_local_rows = 0; 176 | matrix._num_local_columns = 0; 177 | matrix._num_local_offdiag_columns = 0; 178 | } 179 | //----------------------------------------------------------------------------- 180 | CUDAMatrix& CUDAMatrix::operator=(CUDAMatrix&& matrix) 181 | { 182 | _A = matrix._A; 183 | _diag = std::move(matrix._diag); 184 | _offdiag = std::move(matrix._offdiag); 185 | _dcolmap = matrix._dcolmap; _dcolmap_sorted = matrix._dcolmap_sorted; _dcolmap_sorted_indices = matrix._dcolmap_sorted_indices; 186 | _num_rows = matrix._num_rows; 187 | _num_columns = matrix._num_columns; 188 | _local_row_start = matrix._local_row_start; 189 | _local_row_end = matrix._local_row_end; 190 | _num_local_rows = matrix._num_local_rows; 191 | _num_local_columns = matrix._num_local_columns; 192 | _num_local_offdiag_columns = matrix._num_local_offdiag_columns; 193 | matrix._A = nullptr; 194 | matrix._diag = nullptr; 195 | matrix._offdiag = nullptr; 196 | matrix._dcolmap = 0; matrix._dcolmap_sorted = 0; matrix._dcolmap_sorted_indices = 0; 197 | matrix._num_rows = 0; 198 | matrix._num_columns = 0; 199 | matrix._local_row_start = 0; 200 | matrix._local_row_end = 0; 201 | matrix._num_local_rows = 0; 202 | matrix._num_local_columns = 0; 203 | matrix._num_local_offdiag_columns = 0; 204 | return *this; 205 | } 206 | //----------------------------------------------------------------------------- 207 | void CUDAMatrix::copy_matrix_values_to_host( 208 | const CUDA::Context& cuda_context) 209 | { 210 | if (_diag) 211 | _diag->copy_matrix_values_to_host(cuda_context); 212 | if (_offdiag) 213 | _offdiag->copy_matrix_values_to_host(cuda_context); 214 | } 215 | //----------------------------------------------------------------------------- 216 | void CUDAMatrix::apply(MatAssemblyType type) 217 | { 218 | PetscErrorCode ierr; 219 | ierr = MatAssemblyBegin(_A, type); 220 | if (ierr != 0) 221 | petsc::error(ierr, __FILE__, "MatAssemblyBegin"); 222 | ierr = MatAssemblyEnd(_A, type); 223 | if (ierr != 0) 224 | petsc::error(ierr, __FILE__, "MatAssemblyEnd"); 225 | } 226 | //----------------------------------------------------------------------------- 227 | void CUDAMatrix::debug_dump() 228 | { 229 | if (_diag) { 230 | std::cout << "Dumping diag matrix." << std::endl; 231 | _diag->debug_dump(); 232 | } 233 | if (_offdiag) { 234 | std::cout << "Dumping offdiag matrix." << std::endl; 235 | _offdiag->debug_dump(); 236 | } 237 | } 238 | 239 |
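The constructor above mirrors PETSc's MPIAIJ storage layout: each rank holds a diagonal block (owned rows coupled to owned columns) and an off-diagonal block (owned rows coupled to ghost columns), with the colmap translating off-diagonal column indices back to global numbering. A hedged usage sketch, assuming `ctx` is a CUDA::Context and `A` a PETSc Mat from fem::petsc::create_cuda_matrix() above:

// Sketch only, not part of the library sources.
dolfinx::la::CUDAMatrix cuda_A(ctx, A, /*page_lock_values=*/false,
                               /*use_seqaijcusparsegetarray=*/false);
// ... device-side assembly kernels write into the diag()/offdiag() values ...
cuda_A.copy_matrix_values_to_host(ctx); // only needed for host-resident Mat types
cuda_A.apply(MAT_FINAL_ASSEMBLY);       // MatAssemblyBegin/End on the wrapped Mat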
-------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAMatrix.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cuda.h> 11 | #include <petscmat.h> 12 | #include <memory> 13 | 14 | namespace dolfinx::la 15 | { 16 | 17 | class CUDASeqMatrix; 18 | 19 | /// A wrapper for a matrix in the compressed sparse row (CSR) format 20 | /// that is stored in the device memory of a CUDA device. 21 | 22 | class CUDAMatrix 23 | { 24 | public: 25 | /// Create an empty CUDA matrix 26 | CUDAMatrix(); 27 | 28 | /// Create a matrix from a PETSc Mat object 29 | /// 30 | /// @param[in] cuda_context A context for a CUDA device 31 | /// @param[in] A PETSc matrix to copy to the device 32 | /// @param[in] page_lock_values Whether or not to use page-locked 33 | /// memory for the host-side array of 34 | /// non-zero values. 35 | /// @param[in] use_seqaijcusparsegetarray Whether or not to use the 36 | /// function MatSeqAIJCUSPARSEGetArray(), which is only 37 | /// available in a custom-built version of PETSc. If it 38 | /// is set, this will avoid unnecessary copying of data 39 | /// between host and device for matrices of type 40 | /// MATSEQAIJCUSPARSE whenever a CUDA-based assembler 41 | /// is used. 42 | CUDAMatrix( 43 | const CUDA::Context& cuda_context, 44 | Mat A, 45 | bool page_lock_values, 46 | bool use_seqaijcusparsegetarray); 47 | 48 | /// Destructor 49 | ~CUDAMatrix(); 50 | 51 | /// Copy constructor 52 | /// @param[in] matrix The object to be copied 53 | CUDAMatrix(const CUDAMatrix& matrix) = delete; 54 | 55 | /// Move constructor 56 | /// @param[in] matrix The object to be moved 57 | CUDAMatrix(CUDAMatrix&& matrix); 58 | 59 | /// Assignment operator 60 | /// @param[in] matrix Another CUDAMatrix object 61 | CUDAMatrix& operator=(const CUDAMatrix& matrix) = delete; 62 | 63 | /// Move assignment operator 64 | /// @param[in] matrix Another CUDAMatrix object 65 | CUDAMatrix& operator=(CUDAMatrix&& matrix); 66 | 67 | /// Get the underlying PETSc matrix object 68 | Mat mat() { return _A; } 69 | 70 | /// Get the diagonal block of the local part of the matrix 71 | const CUDASeqMatrix * diag() const { return _diag.get(); } 72 | CUDASeqMatrix * diag() { return _diag.get(); } 73 | 74 | /// Get the off-diagonal block of the local part of the matrix 75 | const CUDASeqMatrix * offdiag() const { return _offdiag.get(); } 76 | CUDASeqMatrix * offdiag() { return _offdiag.get(); } 77 | 78 | /// Methods to get off diagonal column mapping 79 | CUdeviceptr colmap() const { return _dcolmap; } 80 | CUdeviceptr colmap_sorted() const { return _dcolmap_sorted; } 81 | CUdeviceptr colmap_sorted_indices() const { return _dcolmap_sorted_indices; } 82 | 83 | /// Get the number of matrix rows 84 | int32_t num_rows() const { return _num_rows; } 85 | 86 | /// Get the number of matrix columns 87 | int32_t num_columns() const { return _num_columns; } 88 | 89 | /// Get the global index of the first row 90 | int32_t local_row_start() const { return _local_row_start; } 91 | 92 | /// Get the global index of the last row 93 | int32_t local_row_end() const { return _local_row_end; } 94 | 95 | /// Get the number of local matrix rows 96 | int32_t num_local_rows() const { return _num_local_rows; } 97 | 98 | /// Get the number of local matrix columns 99 | int32_t num_local_columns() const { return _num_local_columns; } 100 | 101 | /// Get the number of local matrix columns in the off-diagonal part 102 | int32_t num_local_offdiag_columns() const { return _num_local_offdiag_columns; } 103 | 104 | /// Update the values of the underlying PETSc matrix by copying 105 | /// values from device memory to host memory. 106 | /// 107 | /// @param[in] cuda_context A context for a CUDA device 108 | void copy_matrix_values_to_host( 109 | const CUDA::Context& cuda_context); 110 | 111 | /// Finalize matrix assembly by calling PETSc's MatAssemblyBegin() 112 | /// and MatAssemblyEnd().
113 | /// 114 | /// @param[in] type MAT_FLUSH_ASSEMBLY or MAT_FINAL_ASSEMBLY 115 | void apply(MatAssemblyType type); 116 | 117 | void debug_dump(); 118 | 119 | private: 120 | /// Handle to the corresponding PETSc matrix object 121 | Mat _A; 122 | 123 | /// The diagonal block of the local part of the matrix 124 | std::unique_ptr<CUDASeqMatrix> _diag; 125 | 126 | /// The off-diagonal block of the local part of the matrix. 127 | /// This is only used if the matrix is distributed. 128 | std::unique_ptr<CUDASeqMatrix> _offdiag; 129 | 130 | /// Device-side mapping from columns of the local, off-diagonal 131 | /// block of the matrix to columns of the global matrix. 132 | CUdeviceptr _dcolmap; 133 | CUdeviceptr _dcolmap_sorted; 134 | CUdeviceptr _dcolmap_sorted_indices; 135 | 136 | /// The number of rows in the global matrix 137 | int32_t _num_rows; 138 | 139 | /// The number of columns in the global matrix 140 | int32_t _num_columns; 141 | 142 | /// The first row owned by the current MPI process 143 | int32_t _local_row_start; 144 | 145 | /// The last row owned by the current MPI process 146 | int32_t _local_row_end; 147 | 148 | /// The number of rows owned by the current MPI process 149 | int32_t _num_local_rows; 150 | 151 | /// The number of columns owned by the current MPI process 152 | int32_t _num_local_columns; 153 | 154 | /// The number of columns in the off-diagonal part of the local 155 | /// matrix owned by the current MPI process 156 | int32_t _num_local_offdiag_columns; 157 | }; 158 | 159 | } // namespace dolfinx::la 160 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDASeqMatrix.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cuda.h> 11 | #include <petscmat.h> 12 | 13 | namespace dolfinx::la 14 | { 15 | 16 | /// A wrapper for a matrix in the compressed sparse row (CSR) format 17 | /// that is stored in the device memory of a CUDA device. 18 | 19 | class CUDASeqMatrix 20 | { 21 | public: 22 | /// Create an empty CUDA matrix 23 | CUDASeqMatrix(); 24 | 25 | /// Create a matrix from a PETSc Mat object. Note that the Mat must 26 | /// be of type MATSEQAIJ. 27 | /// 28 | /// @param[in] cuda_context A context for a CUDA device 29 | /// @param[in] A PETSc matrix to copy to the device 30 | /// @param[in] page_lock_values Whether or not to use page-locked 31 | /// memory for the host-side array of 32 | /// non-zero values. 33 | /// @param[in] use_seqaijcusparsegetarray Whether or not to use the 34 | /// function MatSeqAIJCUSPARSEGetArray(), which is only 35 | /// available in a custom-built version of PETSc. If it 36 | /// is set, this will avoid unnecessary copying of data 37 | /// between host and device for matrices of type 38 | /// MATSEQAIJCUSPARSE whenever a CUDA-based assembler 39 | /// is used.
40 | CUDASeqMatrix( 41 | const CUDA::Context& cuda_context, 42 | Mat A, 43 | bool page_lock_values, 44 | bool use_seqaijcusparsegetarray); 45 | 46 | /// Destructor 47 | ~CUDASeqMatrix(); 48 | 49 | /// Copy constructor 50 | /// @param[in] matrix The object to be copied 51 | CUDASeqMatrix(const CUDASeqMatrix& matrix) = delete; 52 | 53 | /// Move constructor 54 | /// @param[in] matrix The object to be moved 55 | CUDASeqMatrix(CUDASeqMatrix&& matrix); 56 | 57 | /// Assignment operator 58 | /// @param[in] matrix Another CUDASeqMatrix object 59 | CUDASeqMatrix& operator=(const CUDASeqMatrix& matrix) = delete; 60 | 61 | /// Move assignment operator 62 | /// @param[in] matrix Another CUDASeqMatrix object 63 | CUDASeqMatrix& operator=(CUDASeqMatrix&& matrix); 64 | 65 | /// Get the underlying PETSc matrix object 66 | Mat mat() { return _A; } 67 | 68 | /// Get the number of matrix rows 69 | int32_t num_rows() const { return _num_rows; } 70 | 71 | /// Get the number of matrix columns 72 | int32_t num_columns() const { return _num_columns; } 73 | 74 | /// Get the global index of the first row 75 | int32_t local_row_start() const { return _local_row_start; } 76 | 77 | /// Get the global index of the last row 78 | int32_t local_row_end() const { return _local_row_end; } 79 | 80 | /// Get the number of local matrix rows 81 | int32_t num_local_rows() const { return _num_local_rows; } 82 | 83 | /// Get the number of local matrix columns 84 | int32_t num_local_columns() const { return _num_local_columns; } 85 | 86 | /// Get a handle to the device-side row pointers 87 | CUdeviceptr row_ptr() const { return _drow_ptr; } 88 | 89 | /// Get the number of local non-zeros 90 | int32_t num_local_nonzeros() const { return _num_local_nonzeros; } 91 | 92 | /// Get a handle to the device-side column indices 93 | CUdeviceptr column_indices() const { return _dcolumn_indices; } 94 | 95 | /// Get a handle to the device-side non-zero values 96 | CUdeviceptr values() const; 97 | 98 | /// Update the values of the underlying PETSc matrix by copying 99 | /// values from device memory to host memory. 100 | /// 101 | /// @param[in] cuda_context A context for a CUDA device 102 | void copy_matrix_values_to_host( 103 | const CUDA::Context& cuda_context); 104 | 105 | /// Finalize matrix assembly by calling PETSc's MatAssemblyBegin() 106 | /// and MatAssemblyEnd(). 
107 | /// 108 | /// @param[in] type MAT_FLUSH_ASSEMBLY or MAT_FINAL_ASSEMBLY 109 | void apply(MatAssemblyType type); 110 | 111 | void debug_dump(); 112 | 113 | private: 114 | /// Handle to the corresponding PETSc matrix object 115 | Mat _A; 116 | 117 | /// Whether or not the host-side array of non-zero values uses 118 | /// page-locked or pinned memory 119 | bool _values_page_locked; 120 | 121 | /// The number of rows in the global matrix 122 | int32_t _num_rows; 123 | 124 | /// The number of columns in the global matrix 125 | int32_t _num_columns; 126 | 127 | /// The first row owned by the current MPI process 128 | int32_t _local_row_start; 129 | 130 | /// The last row owned by the current MPI process 131 | int32_t _local_row_end; 132 | 133 | /// The number of rows owned by the current MPI process 134 | int32_t _num_local_rows; 135 | 136 | /// The number of columns owned by the current MPI process 137 | int32_t _num_local_columns; 138 | 139 | /// Device-side storage for row pointers 140 | CUdeviceptr _drow_ptr; 141 | 142 | /// The number of non-zeros in the global matrix 143 | int32_t _num_local_nonzeros; 144 | 145 | /// Device-side storage for column indices 146 | CUdeviceptr _dcolumn_indices; 147 | 148 | /// Device-side storage for non-zero values 149 | CUdeviceptr _dvalues; 150 | 151 | /// Whether or not the device-side pointer is owned by PETSc and 152 | /// needs to be returned when we are done, or if it was allocated 153 | /// with cuMemAlloc() and needs to be freed with cuMemFree(). 154 | /// 155 | /// For now, PETSc does not provide access to device-side non-zero 156 | /// values, even for matrices that are stored on a CUDA 157 | /// device. Consequently, `_dvalues_petsc_owned` is always false, 158 | /// and there is potentially some unnecessary copying between the 159 | /// host and device. 160 | bool _dvalues_petsc_owned; 161 | }; 162 | 163 | } // namespace dolfinx::la 164 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAVector.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cuda.h> 11 | #include <petscvec.h> 12 | 13 | namespace dolfinx::la 14 | { 15 | 16 | /// A wrapper for a dense vector that is stored in the device memory 17 | /// of a CUDA device. 18 | 19 | class CUDAVector 20 | { 21 | public: 22 | /// Create an empty CUDA vector 23 | CUDAVector(); 24 | 25 | /// Create a vector from a PETSc Vec object 26 | /// 27 | /// @param[in] cuda_context A context for a CUDA device 28 | /// @param[in] x PETSc vector to copy to the device 29 | /// @param[in] page_lock_values Whether or not to use page-locked 30 | /// memory for the host-side array of 31 | /// values.
32 | CUDAVector( 33 | const CUDA::Context& cuda_context, 34 | Vec x, 35 | bool page_lock_values = true, 36 | bool include_ghosts = true); 37 | 38 | 39 | /*template 40 | CUDAVector(const CUDA::Context& cuda_context, std::shared_ptr> x) 41 | : CUDAVector(cuda_context, la::petsc::create_vector_wrap(*x)) 42 | { 43 | }*/ 44 | 45 | /// Destructor 46 | ~CUDAVector(); 47 | 48 | /// Copy constructor 49 | /// @param[in] vector The object to be copied 50 | CUDAVector(const CUDAVector& vector) = delete; 51 | 52 | /// Move constructor 53 | /// @param[in] vector The object to be moved 54 | CUDAVector(CUDAVector&& vector); 55 | 56 | /// Assignment operator 57 | /// @param[in] vector Another CUDAVector object 58 | CUDAVector& operator=(const CUDAVector& vector) = delete; 59 | 60 | /// Move assignment operator 61 | /// @param[in] vector Another CUDAVector object 62 | CUDAVector& operator=(CUDAVector&& vector); 63 | 64 | /// Get a handle to the underlying PETSc vector object 65 | const Vec vector() const { return _x; } 66 | 67 | /// Get the number of vector values 68 | int32_t num_values() const { return _num_values; } 69 | 70 | /// Get the number of local vector values 71 | int32_t num_local_values() const { return _num_local_values; } 72 | 73 | /// Get the number of local vector values 74 | int32_t num_local_ghosted_values() const { return _num_local_ghosted_values; } 75 | 76 | bool ghosted() const; 77 | 78 | /// Get a handle to the device-side non-zero values 79 | CUdeviceptr values() const; 80 | 81 | /// Return a handle to the device-side non-zero values 82 | void restore_values() const; 83 | 84 | /// Get a handle to the device-side non-zero values 85 | CUdeviceptr values_write() const; 86 | 87 | /// Return a handle to the device-side non-zero values 88 | void restore_values_write() const; 89 | 90 | /// Update the device-side vector values from the underlying PETSc 91 | /// vector. If the PETSc vector resides in host memory, then the 92 | /// values are copied from host memory to device memory. This does 93 | /// nothing if the PETSc vector is already held in device memory. 94 | /// 95 | /// @param[in] cuda_context A context for a CUDA device 96 | void copy_vector_values_to_device( 97 | const CUDA::Context& cuda_context); 98 | 99 | /// Update the values of the underlying PETSc vector. If the PETSc 100 | /// vector resides in host memory, then the values are copied from 101 | /// device memory to host memory. This does nothing if the PETSc 102 | /// vector is already held in device memory. 103 | /// 104 | /// @param[in] cuda_context A context for a CUDA device 105 | void copy_vector_values_to_host( 106 | const CUDA::Context& cuda_context); 107 | 108 | /// Update the device-side values of ghost nodes from the underlying 109 | /// PETSc vector. If the PETSc vector resides in host memory, then 110 | /// values are copied from host memory to device memory. This does 111 | /// nothing if the PETSc vector is already held in device memory. 112 | /// 113 | /// @param[in] cuda_context A context for a CUDA device 114 | void copy_ghost_values_to_device( 115 | const CUDA::Context& cuda_context); 116 | 117 | /// Update the values of ghost nodes of the underlying PETSc vector. 118 | /// If the PETSc vector resides in host memory, then ghost values 119 | /// are copied from device memory to host memory. This does nothing 120 | /// if the PETSc vector is already held in device memory. 
121 | /// 122 | /// @param[in] cuda_context A context for a CUDA device 123 | void copy_ghost_values_to_host( 124 | const CUDA::Context& cuda_context); 125 | 126 | /// Update vector entries that are owned by this process, but are 127 | /// represented as ghost values on other processes. 128 | void apply_ghosts( 129 | const CUDA::Context& cuda_context); 130 | 131 | /// Update vector entries corresponding to ghost values, meaning 132 | /// that ghost values are gathered from other processes that own 133 | /// them. 134 | bool update_ghosts( 135 | const CUDA::Context& cuda_context); 136 | 137 | private: 138 | /// Handle to the corresponding PETSc vector object 139 | Vec _x; 140 | 141 | /// Handle to the corresponding local PETSc vector object, if the 142 | /// vector is distributed. 143 | Vec _x_local; 144 | 145 | /// Whether or not the host-side array of values uses page-locked 146 | /// (pinned) memory 147 | bool _values_page_locked; 148 | 149 | bool _include_ghosts; ///< Whether ghost values are included in the device-side storage 150 | 151 | /// The number of values in the global vector 152 | int32_t _num_values; 153 | 154 | /// The number of values owned by the current MPI rank 155 | int32_t _num_local_values; 156 | 157 | /// The number of values on the current MPI rank, including ghosts 158 | int32_t _num_local_ghosted_values; 159 | 160 | /// The first value owned by the current MPI rank 161 | int32_t _local_values_start; 162 | 163 | /// The last value owned by the current MPI rank 164 | int32_t _local_values_end; 165 | 166 | /// Device-side storage for non-zero values 167 | mutable CUdeviceptr _dvalues; 168 | 169 | /// Whether or not the device-side pointer is owned by PETSc and 170 | /// needs to be returned when we are done, or if it was allocated 171 | /// with cuMemAlloc() and needs to be freed with cuMemFree(). 172 | bool _dvalues_petsc_owned; 173 | 174 | public: 175 | bool debug; 176 | }; 177 | 178 | } // namespace dolfinx::la 179 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/petsc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | Mat la::petsc::create_cuda_matrix(MPI_Comm comm, const SparsityPattern& sp) 15 | { 16 | PetscErrorCode ierr; 17 | Mat A; 18 | 19 | // Get IndexMaps from sparsity pattern, and block size 20 | std::array maps = {sp.index_map(0), sp.index_map(1)}; 21 | const std::array bs = {sp.block_size(0), sp.block_size(1)}; 22 | dolfinx::common::IndexMap col_map = sp.column_index_map(); 23 | 24 | // Get global and local dimensions 25 | const std::int64_t M = bs[0] * maps[0]->size_global(); 26 | const std::int64_t N = bs[1] * maps[1]->size_global(); 27 | const std::int32_t m = bs[0] * maps[0]->size_local(); 28 | const std::int32_t n = bs[1] * maps[1]->size_local(); 29 | 30 | // Build data to initialise sparsity pattern (modify for block size) 31 | std::vector<PetscInt> _row_ptr; 32 | // Need to ensure correct int type (PetscInt)
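// Worked example of the unrolling performed below (an illustrative sketch
// with made-up numbers, not data from an actual mesh): for block size
// bs = {2, 2} and a single blocked row whose column blocks are {0, 2}, the
// blocked graph data offsets = {0, 2}, edges = {0, 2} expands to the scalar
// CSR arrays
//   _row_ptr = {0, 4, 8}
//   _column_indices = {0, 1, 4, 5, 0, 1, 4, 5},
// i.e. each blocked column j becomes columns bs[1]*j .. bs[1]*j + bs[1] - 1,
// and each blocked row is repeated bs[0] times.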
33 | std::vector<PetscInt> _column_indices; 34 | auto [_edges, _offsets] = sp.graph(); 35 | 36 | // The CUDA assembly kernels aren't currently robust to matrices with variable block size, 37 | // so for now we always unroll to block size 1 38 | _row_ptr.resize(m+1); 39 | _row_ptr[0] = 0; 40 | _column_indices.resize(_edges.size()*bs[0]*bs[1]); 41 | // index indicating where we are in _edges 42 | std::size_t edge_index = 0; 43 | std::size_t unrolled_edge_index = 0; 44 | // Iterate over (blocked) rows 45 | for (std::size_t row = 0; row < maps[0]->size_local(); row++) { 46 | // TODO test with differing block sizes to ensure this is still valid 47 | PetscInt row_nnz = _offsets[row+1] - _offsets[row]; 48 | PetscInt unrolled_row_nnz = row_nnz * bs[1]; 49 | 50 | // row ptr 51 | for (std::size_t unrolled_row = bs[0]*row; unrolled_row < bs[0]*(row+1); unrolled_row++) 52 | _row_ptr[unrolled_row+1] = _row_ptr[unrolled_row] + unrolled_row_nnz; 53 | 54 | for (std::size_t j = 0; j < row_nnz; j++) { 55 | for (std::size_t k = 0; k < bs[1]; k++) 56 | _column_indices[unrolled_edge_index + j*bs[1] + k] = bs[1]*_edges[edge_index+j] + k; 57 | } 58 | // Unroll row block 59 | for (std::size_t l = 1; l < bs[0]; l++) 60 | std::copy_n(std::next(_column_indices.begin(), unrolled_edge_index), unrolled_row_nnz, 61 | std::next(_column_indices.begin(), unrolled_edge_index + l*unrolled_row_nnz)); 62 | 63 | edge_index += row_nnz; 64 | unrolled_edge_index += bs[0] * unrolled_row_nnz; 65 | } 66 | 67 | // convert local column indices to global ones (unrolling blocked indices) 68 | std::vector<PetscInt> global_column_indices(_column_indices.size()); 69 | auto col_local_size = bs[1]*col_map.size_local(); 70 | auto col_ghosts = col_map.ghosts(); 71 | auto col_local_range = bs[1]*col_map.local_range()[0]; 72 | for (std::size_t i = 0; i < _column_indices.size(); i++) { 73 | 74 | if (_column_indices[i] < col_local_size) 75 | global_column_indices[i] = _column_indices[i] + col_local_range; 76 | else { 77 | int diff = _column_indices[i] - col_local_size; 78 | global_column_indices[i] = bs[1] * col_ghosts[diff / bs[1]] + diff % bs[1]; 79 | } 80 | } 81 | MatCreateMPIAIJWithArrays(comm, m, n, M, N, _row_ptr.data(), global_column_indices.data(), nullptr, &A); 82 | // Change matrix type to CUDA 83 | ierr = MatSetType(A, MATMPIAIJCUSPARSE); 84 | if (ierr != 0) 85 | petsc::error(ierr, __FILE__, "MatSetType"); 86 | 87 | // Set block sizes 88 | ierr = MatSetBlockSizes(A, 1, 1); 89 | if (ierr != 0) 90 | petsc::error(ierr, __FILE__, "MatSetBlockSizes"); 91 | 92 | // Create PETSc local-to-global map/index sets 93 | ISLocalToGlobalMapping local_to_global0; 94 | // create unrolled global indices 95 | const std::vector map0 = maps[0]->global_indices(); 96 | std::vector<PetscInt> _map0; 97 | _map0.resize(map0.size() * bs[0]); 98 | for (size_t i = 0; i < map0.size(); i++) 99 | for (size_t j = 0; j < bs[0]; j++) 100 | _map0[i*bs[0] + j] = map0[i]*bs[0] + j; 101 | //const std::vector _map0(map0.begin(), map0.end()); 102 | ierr = ISLocalToGlobalMappingCreate(MPI_COMM_SELF, 1, _map0.size(), 103 | _map0.data(), PETSC_COPY_VALUES, 104 | &local_to_global0); 105 | 106 | if (ierr != 0) 107 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingCreate"); 108 | 109 | // Check for common index maps 110 | if (maps[0] == maps[1] and bs[0] == bs[1]) 111 | { 112 | ierr = MatSetLocalToGlobalMapping(A, local_to_global0, local_to_global0); 113 | if (ierr != 0) 114 | petsc::error(ierr, __FILE__, "MatSetLocalToGlobalMapping"); 115 | } 116 | else 117 | { 118 | ISLocalToGlobalMapping local_to_global1;
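// As with _map0 above, blocked indices are unrolled here; e.g. (illustrative
// numbers only) bs[1] = 2 and blocked global indices map1 = {5, 7} yield the
// unrolled map _map1 = {10, 11, 14, 15}.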
119 | const std::vector map1 = maps[1]->global_indices(); 120 | std::vector<PetscInt> _map1; 121 | _map1.resize(map1.size() * bs[1]); 122 | for (size_t i = 0; i < map1.size(); i++) 123 | for (size_t j = 0; j < bs[1]; j++) 124 | _map1[i*bs[1] + j] = map1[i]*bs[1] + j; 125 | //const std::vector _map1(map1.begin(), map1.end()); 126 | ierr = ISLocalToGlobalMappingCreate(MPI_COMM_SELF, 1, _map1.size(), 127 | _map1.data(), PETSC_COPY_VALUES, 128 | &local_to_global1); 129 | if (ierr != 0) 130 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingCreate"); 131 | ierr = MatSetLocalToGlobalMapping(A, local_to_global0, local_to_global1); 132 | if (ierr != 0) 133 | petsc::error(ierr, __FILE__, "MatSetLocalToGlobalMapping"); 134 | ierr = ISLocalToGlobalMappingDestroy(&local_to_global1); 135 | if (ierr != 0) 136 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingDestroy"); 137 | } 138 | 139 | // Clean up local-to-global 0 140 | ierr = ISLocalToGlobalMappingDestroy(&local_to_global0); 141 | if (ierr != 0) 142 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingDestroy"); 143 | 144 | // Set some options on Mat object 145 | ierr = MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE); 146 | if (ierr != 0) 147 | petsc::error(ierr, __FILE__, "MatSetOption"); 148 | ierr = MatSetOption(A, MAT_KEEP_NONZERO_PATTERN, PETSC_TRUE); 149 | if (ierr != 0) 150 | petsc::error(ierr, __FILE__, "MatSetOption"); 151 | 152 | return A; 153 | } 154 | 155 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace dolfinx::la 16 | { 17 | 18 | namespace petsc 19 | { 20 | 21 | Mat create_cuda_matrix(MPI_Comm comm, const SparsityPattern& sp); ///< Create a CUDA (MATMPIAIJCUSPARSE) PETSc matrix from a sparsity pattern 22 | 23 | } // namespace petsc 24 | } // namespace dolfinx::la 25 | -------------------------------------------------------------------------------- /cpp/cudolfinx/mesh/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_mesh 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMesh.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMeshEntities.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/util.h 5 | PARENT_SCOPE 6 | ) 7 | 8 | target_sources( 9 | cudolfinx 10 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/util.cpp 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /cpp/cudolfinx/mesh/CUDAMesh.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace dolfinx { 16 | namespace mesh { 17 | 18 | /// A wrapper for mesh data that is stored in the device memory of a 19 | /// CUDA device.
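/// As an illustrative aside (sizes inferred from the constructor below,
/// assuming an affine P1 triangle geometry): a mesh with 4 vertices and
/// 2 triangular cells stores 4*3 doubles of vertex coordinates, 2*3 int32
/// cell vertex indices and 2 uint32 cell permutations in device memory.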
20 | template <typename T> 21 | class CUDAMesh 22 | { 23 | public: 24 | /// Create an empty mesh 25 | CUDAMesh() 26 | : _tdim() 27 | , _num_vertices() 28 | , _num_coordinates_per_vertex() 29 | , _dvertex_coordinates(0) 30 | , _num_cells() 31 | , _num_vertices_per_cell() 32 | , _dvertex_indices_per_cell(0) 33 | , _dcell_permutations(0) 34 | , _mesh_entities() 35 | { 36 | } 37 | //----------------------------------------------------------------------------- 38 | /// Create a mesh 39 | /// 40 | /// @param[in] cuda_context A context for a CUDA device 41 | /// @param[in] mesh Data structures for mesh topology and geometry 42 | CUDAMesh(const CUDA::Context& cuda_context, const dolfinx::mesh::Mesh<T>& mesh) 43 | { 44 | CUresult cuda_err; 45 | const char * cuda_err_description; 46 | 47 | _tdim = mesh.topology()->dim(); 48 | 49 | // Allocate device-side storage for vertex coordinates 50 | auto vertex_coordinates = mesh.geometry().x(); 51 | _num_vertices = vertex_coordinates.size() / 3; 52 | // TODO figure out how to handle this properly 53 | // FEniCSx has a dimension of 3 during assembly, but returns a 54 | // different value for the dim of mesh.geometry 55 | _num_coordinates_per_vertex = 3; 56 | //_num_coordinates_per_vertex = mesh.geometry().dim(); 57 | if (_num_vertices > 0 && _num_coordinates_per_vertex > 0) { 58 | if (_num_coordinates_per_vertex > 3) { 59 | throw std::runtime_error( 60 | "Expected at most 3 coordinates per vertex " 61 | "instead of " + std::to_string(_num_coordinates_per_vertex) + " " 62 | "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 63 | } 64 | 65 | size_t dvertex_coordinates_size = 66 | _num_vertices * 3 * sizeof(double); 67 | cuda_err = cuMemAlloc( 68 | &_dvertex_coordinates, 69 | dvertex_coordinates_size); 70 | if (cuda_err != CUDA_SUCCESS) { 71 | cuGetErrorString(cuda_err, &cuda_err_description); 72 | throw std::runtime_error( 73 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 74 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 75 | } 76 | 77 | // Copy vertex coordinates to device 78 | cuda_err = cuMemcpyHtoD( 79 | _dvertex_coordinates, 80 | vertex_coordinates.data(), 81 | dvertex_coordinates_size); 82 | if (cuda_err != CUDA_SUCCESS) { 83 | cuMemFree(_dvertex_coordinates); 84 | cuGetErrorString(cuda_err, &cuda_err_description); 85 | throw std::runtime_error( 86 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 87 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 88 | } 89 | } 90 | 91 | // Obtain mesh geometry 92 | auto x_dofmap = 93 | mesh.geometry().dofmap(); 94 | 95 | // Allocate device-side storage for cell vertex indices 96 | _num_cells = x_dofmap.extent(0); 97 | _num_vertices_per_cell = x_dofmap.extent(1); 98 | if (_num_cells > 0 && _num_vertices_per_cell > 0) { 99 | size_t dvertex_indices_per_cell_size = 100 | _num_cells * _num_vertices_per_cell * sizeof(int32_t); 101 | cuda_err = cuMemAlloc( 102 | &_dvertex_indices_per_cell, 103 | dvertex_indices_per_cell_size); 104 | if (cuda_err != CUDA_SUCCESS) { 105 | cuMemFree(_dvertex_coordinates); 106 | cuGetErrorString(cuda_err, &cuda_err_description); 107 | throw std::runtime_error( 108 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 109 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 110 | } 111 | 112 | // Copy cell vertex indices to device 113 | cuda_err = cuMemcpyHtoD( 114 | _dvertex_indices_per_cell, 115 | x_dofmap.data_handle(), 116 | dvertex_indices_per_cell_size); 117 | if (cuda_err !=
CUDA_SUCCESS) { 118 | cuMemFree(_dvertex_indices_per_cell); 119 | cuMemFree(_dvertex_coordinates); 120 | cuGetErrorString(cuda_err, &cuda_err_description); 121 | throw std::runtime_error( 122 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 123 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 124 | } 125 | } 126 | 127 | // Obtain cell permutations 128 | mesh.topology_mutable()->create_entity_permutations(); 129 | auto cell_permutations = mesh.topology()->get_cell_permutation_info(); 130 | 131 | // Allocate device-side storage for cell permutations 132 | if (_num_cells > 0) { 133 | size_t dcell_permutations_size = 134 | _num_cells * sizeof(uint32_t); 135 | cuda_err = cuMemAlloc( 136 | &_dcell_permutations, 137 | dcell_permutations_size); 138 | if (cuda_err != CUDA_SUCCESS) { 139 | cuMemFree(_dvertex_indices_per_cell); 140 | cuMemFree(_dvertex_coordinates); 141 | cuGetErrorString(cuda_err, &cuda_err_description); 142 | throw std::runtime_error( 143 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 144 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 145 | } 146 | 147 | // Copy cell permutations to device 148 | cuda_err = cuMemcpyHtoD( 149 | _dcell_permutations, 150 | cell_permutations.data(), 151 | dcell_permutations_size); 152 | if (cuda_err != CUDA_SUCCESS) { 153 | cuMemFree(_dcell_permutations); 154 | cuGetErrorString(cuda_err, &cuda_err_description); 155 | throw std::runtime_error( 156 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 157 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 158 | } 159 | } 160 | 161 | for (int dim = 0; dim < _tdim; dim++) { 162 | _mesh_entities.emplace_back( 163 | cuda_context, mesh, dim); 164 | } 165 | } 166 | //----------------------------------------------------------------------------- 167 | /// Destructor 168 | ~CUDAMesh() 169 | { 170 | if (_dcell_permutations) 171 | cuMemFree(_dcell_permutations); 172 | if (_dvertex_indices_per_cell) 173 | cuMemFree(_dvertex_indices_per_cell); 174 | if (_dvertex_coordinates) 175 | cuMemFree(_dvertex_coordinates); 176 | } 177 | //----------------------------------------------------------------------------- 178 | /// Copy constructor 179 | /// @param[in] mesh The object to be copied 180 | CUDAMesh(const CUDAMesh& mesh) = delete; 181 | 182 | /// Move constructor 183 | /// @param[in] mesh The object to be moved 184 | CUDAMesh(CUDAMesh&& mesh) 185 | : _tdim(mesh._tdim) 186 | , _num_vertices(mesh._num_vertices) 187 | , _num_coordinates_per_vertex(mesh._num_coordinates_per_vertex) 188 | , _dvertex_coordinates(mesh._dvertex_coordinates) 189 | , _num_cells(mesh._num_cells) 190 | , _num_vertices_per_cell(mesh._num_vertices_per_cell) 191 | , _dvertex_indices_per_cell(mesh._dvertex_indices_per_cell) 192 | , _dcell_permutations(mesh._dcell_permutations) 193 | , _mesh_entities(std::move(mesh._mesh_entities)) 194 | { 195 | mesh._tdim = 0; 196 | mesh._num_vertices = 0; 197 | mesh._num_coordinates_per_vertex = 0; 198 | mesh._dvertex_coordinates = 0; 199 | mesh._num_cells = 0; 200 | mesh._num_vertices_per_cell = 0; 201 | mesh._dvertex_indices_per_cell = 0; 202 | mesh._dcell_permutations = 0; 203 | } 204 | //----------------------------------------------------------------------------- 205 | /// Assignment operator 206 | /// @param[in] mesh Another CUDAMesh object 207 | CUDAMesh& operator=(const CUDAMesh& mesh) = delete; 208 | 209 | /// Move assignment operator 210 | /// @param[in] mesh Another CUDAMesh object 211 | 
CUDAMesh& operator=(CUDAMesh&& mesh) 212 | { 213 | _tdim = mesh._tdim; 214 | _num_vertices = mesh._num_vertices; 215 | _num_coordinates_per_vertex = mesh._num_coordinates_per_vertex; 216 | _dvertex_coordinates = mesh._dvertex_coordinates; 217 | _num_cells = mesh._num_cells; 218 | _num_vertices_per_cell = mesh._num_vertices_per_cell; 219 | _dvertex_indices_per_cell = mesh._dvertex_indices_per_cell; 220 | _dcell_permutations = mesh._dcell_permutations; 221 | _mesh_entities = std::move(mesh._mesh_entities); 222 | mesh._tdim = 0; 223 | mesh._num_vertices = 0; 224 | mesh._num_coordinates_per_vertex = 0; 225 | mesh._dvertex_coordinates = 0; 226 | mesh._num_cells = 0; 227 | mesh._num_vertices_per_cell = 0; 228 | mesh._dvertex_indices_per_cell = 0; 229 | mesh._dcell_permutations = 0; 230 | return *this; 231 | } 232 | //----------------------------------------------------------------------------- 233 | 234 | 235 | /// Get the topological dimension of the mesh 236 | int32_t tdim() const { return _tdim; } 237 | 238 | /// Get the number of vertices 239 | int32_t num_vertices() const { return _num_vertices; } 240 | 241 | /// Get the number of coordinates per vertex 242 | int32_t num_coordinates_per_vertex() const { 243 | return _num_coordinates_per_vertex; } 244 | 245 | /// Get a handle to the device-side vertex coordinates 246 | CUdeviceptr vertex_coordinates() const { 247 | return _dvertex_coordinates; } 248 | 249 | /// Get the number of cells 250 | int32_t num_cells() const { return _num_cells; } 251 | 252 | /// Get the number of vertices per cell 253 | int32_t num_vertices_per_cell() const { 254 | return _num_vertices_per_cell; } 255 | 256 | /// Get a handle to the device-side cell vertex indices 257 | CUdeviceptr vertex_indices_per_cell() const { 258 | return _dvertex_indices_per_cell; } 259 | 260 | /// Get a handle to the device-side cell permutations 261 | CUdeviceptr cell_permutations() const { 262 | return _dcell_permutations; } 263 | 264 | /// Get the mesh entities of each dimension 265 | const std::vector<CUDAMeshEntities<T>>& mesh_entities() const { 266 | return _mesh_entities; } 267 | 268 | private: 269 | /// The topological dimension of the mesh, or the largest dimension 270 | /// of any of the mesh entities 271 | int32_t _tdim; 272 | 273 | /// The number of vertices in the mesh 274 | int32_t _num_vertices; 275 | 276 | /// The number of coordinates for each vertex 277 | int32_t _num_coordinates_per_vertex; 278 | 279 | /// The coordinates of the mesh vertices 280 | CUdeviceptr _dvertex_coordinates; 281 | 282 | /// The number of cells in the mesh 283 | int32_t _num_cells; 284 | 285 | /// The number of vertices in each cell 286 | int32_t _num_vertices_per_cell; 287 | 288 | /// The vertex indices of each cell 289 | CUdeviceptr _dvertex_indices_per_cell; 290 | 291 | /// Cell permutations 292 | CUdeviceptr _dcell_permutations; 293 | 294 | /// The mesh entities of each dimension 295 | std::vector<CUDAMeshEntities<T>> _mesh_entities; 296 | }; 297 | 298 | } // namespace mesh 299 | } // namespace dolfinx 300 | 301 | -------------------------------------------------------------------------------- /cpp/cudolfinx/mesh/util.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace dolfinx; 7 | 8 | std::vector<std::int32_t> dolfinx::mesh::ghost_exterior_facet_indices(std::shared_ptr<mesh::Topology> topology) 9 | { 10 | const int tdim = topology->dim(); 11 | auto f_to_c = topology->connectivity(tdim - 1, tdim); 12 | auto f_to_v = topology->connectivity(tdim-1, 0);
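// Sketch of the selection rule implemented below (the rationale is an
// inference, not a comment from the original source): a ghost facet with
// exactly one attached cell is kept only if at least one of its vertices
// is locally owned; a ghost facet whose vertices are all ghosts is skipped.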
13 | if (!f_to_c) { 14 | topology->create_connectivity(tdim-1, tdim); 15 | f_to_c = topology->connectivity(tdim-1, tdim); 16 | } 17 | if (!f_to_v) { 18 | topology->create_connectivity(tdim-1, 0); 19 | f_to_v = topology->connectivity(tdim-1, 0); 20 | } 21 | // Find all ghost facets with only one attached cell 22 | auto facet_map = topology->index_map(tdim - 1); 23 | const int num_local_facets = facet_map->size_local(); 24 | const int num_ghost_facets = facet_map->num_ghosts(); 25 | const int num_local_vertices = topology->index_map(0)->size_local(); 26 | std::vector<std::int32_t> facets; 27 | for (std::int32_t f = num_local_facets; f < num_local_facets+num_ghost_facets; ++f) 28 | { 29 | if (f_to_c->num_links(f) == 1) { 30 | // check to make sure at least one facet vertex is owned 31 | // otherwise this is not needed 32 | auto vertices = f_to_v->links(f); 33 | bool has_owned_vertex = false; 34 | for (int i = 0; i < vertices.size(); i++) { 35 | if (vertices[i] < num_local_vertices) has_owned_vertex = true; 36 | } 37 | if (has_owned_vertex) facets.push_back(f); 38 | } 39 | } 40 | // Remove facets on internal inter-process boundary 41 | std::vector<std::int32_t> ext_facets; 42 | std::ranges::set_difference(facets, topology->interprocess_facets(), 43 | std::back_inserter(ext_facets)); 44 | return ext_facets; 45 | } 46 | 47 | std::vector<std::int32_t> dolfinx::mesh::ghost_entities( 48 | dolfinx::fem::IntegralType integral_type, 49 | std::shared_ptr<mesh::Topology> topology) 50 | { 51 | std::vector<std::int32_t> ghost_entities; 52 | int tdim = topology->dim(); 53 | switch (integral_type) { 54 | case fem::IntegralType::cell: 55 | { 56 | auto cell_index_map = topology->index_map(tdim); 57 | int num_ghost_cells = cell_index_map->num_ghosts(); 58 | int num_owned_cells = cell_index_map->size_local(); 59 | ghost_entities.resize(num_ghost_cells); 60 | std::iota(ghost_entities.begin(), ghost_entities.end(), num_owned_cells); 61 | } 62 | break; 63 | case fem::IntegralType::exterior_facet: 64 | { 65 | auto ghost_exterior_facets = dolfinx::mesh::ghost_exterior_facet_indices(topology); 66 | ghost_entities.reserve(2*ghost_exterior_facets.size()); 67 | auto c_to_f = topology->connectivity(tdim, tdim-1); 68 | auto f_to_c = topology->connectivity(tdim-1, tdim); 69 | for (std::int32_t f : ghost_exterior_facets) { 70 | auto pair = 71 | dolfinx::fem::impl::get_cell_facet_pairs<1>(f, f_to_c->links(f), *c_to_f); 72 | ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end()); 73 | } 74 | } 75 | break; 76 | case fem::IntegralType::interior_facet: 77 | { 78 | auto c_to_f = topology->connectivity(tdim, tdim-1); 79 | auto f_to_c = topology->connectivity(tdim-1, tdim); 80 | auto facet_map = topology->index_map(tdim-1); 81 | int num_local_facets = facet_map->size_local(); 82 | int total_facets = num_local_facets + facet_map->num_ghosts(); 83 | for (int f = num_local_facets; f < total_facets; f++) { 84 | if (f_to_c->num_links(f) == 2) { 85 | auto pairs = 86 | fem::impl::get_cell_facet_pairs<2>(f, f_to_c->links(f), *c_to_f); 87 | ghost_entities.insert(ghost_entities.end(), pairs.begin(), pairs.end()); 88 | } 89 | } 90 | } 91 | default: 92 | break; 93 | } 94 | return ghost_entities; 95 | } 96 | 97 | std::vector<std::int32_t> dolfinx::mesh::active_ghost_entities( 98 | std::span<const std::int32_t> active_local_entities, 99 | fem::IntegralType integral_type, 100 | std::shared_ptr<mesh::Topology> topology) 101 | { 102 | std::vector<std::int32_t> ghost_entities; 103 | MPI_Comm comm = topology->comm(); 104 | int rank = dolfinx::MPI::rank(comm); 105 | int tdim = topology->dim(); 106 | int ent_dim = (integral_type == fem::IntegralType::cell) ?
tdim : tdim-1; 107 | // Step 1: determine the active entities which are ghosted on other processes 108 | std::map<int, std::vector<std::int32_t>> dest_entities; 109 | auto imap = topology->index_map(ent_dim); 110 | int num_local_entities = imap->size_local(); 111 | auto entity_ranks = imap->index_to_dest_ranks(); 112 | int facet_increment = (integral_type == fem::IntegralType::interior_facet) ? 4 : 2; 113 | switch (integral_type) { 114 | case fem::IntegralType::cell: 115 | if (rank == 0) std::cout << "cell" << std::endl; 116 | for (auto cell : active_local_entities) { 117 | if (cell >= entity_ranks.num_nodes()) continue; 118 | for (auto& r : entity_ranks.links(cell)) { 119 | if (dest_entities.find(r) == dest_entities.end()) { 120 | dest_entities[r] = {cell}; 121 | } 122 | else dest_entities[r].push_back(cell); 123 | } 124 | } 125 | break; 126 | case fem::IntegralType::interior_facet: 127 | case fem::IntegralType::exterior_facet: { 128 | auto c_to_f = topology->connectivity(tdim, tdim-1); 129 | if (!c_to_f) { 130 | topology->create_connectivity(tdim, tdim-1); 131 | c_to_f = topology->connectivity(tdim, tdim-1); 132 | } 133 | for (int i = 0; i < active_local_entities.size(); i += facet_increment) { 134 | auto cell = active_local_entities[i]; 135 | auto facet_index = active_local_entities[i+1]; 136 | auto facet = c_to_f->links(cell)[facet_index]; 137 | if (facet >= entity_ranks.num_nodes()) continue; 138 | for (auto& r : entity_ranks.links(facet)) { 139 | if (dest_entities.find(r) == dest_entities.end()) { 140 | dest_entities[r] = {facet}; 141 | } 142 | else dest_entities[r].push_back(facet); 143 | } 144 | } 145 | } 146 | default: 147 | break; 148 | } 149 | 150 | // Step 2: send those entities to the other processes 151 | std::vector<std::int64_t> indices_send_buffer; 152 | // construct list of destination MPI ranks 153 | std::vector<int> dest; 154 | std::vector<int> send_sizes; 155 | for (const auto& pair : dest_entities) { 156 | dest.push_back(pair.first); 157 | std::size_t num_inds = pair.second.size(); 158 | send_sizes.push_back(num_inds); 159 | std::vector<std::int64_t> global_inds(num_inds); 160 | imap->local_to_global(pair.second, global_inds); 161 | for (const auto& i : global_inds) 162 | indices_send_buffer.push_back(i); 163 | } 164 | // get source ranks 165 | std::vector src = dolfinx::MPI::compute_graph_edges_nbx(comm, dest); 166 | // Create neighbor communicator 167 | MPI_Comm neigh_comm; 168 | int ierr = MPI_Dist_graph_create_adjacent( 169 | comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), 170 | dest.data(), MPI_UNWEIGHTED, MPI_INFO_NULL, false, &neigh_comm); 171 | dolfinx::MPI::check_error(comm, ierr); 172 | // Share lengths of indices to be sent to each rank 173 | std::vector<int> recv_sizes(src.size(), 0); 174 | ierr = MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, 175 | recv_sizes.data(), 1, MPI_INT, neigh_comm); 176 | dolfinx::MPI::check_error(comm, ierr); 177 | // Prepare displacement arrays 178 | std::vector<int> send_disp(dest.size() + 1, 0); 179 | std::vector<int> recv_disp(src.size() + 1, 0); 180 | std::partial_sum(send_sizes.begin(), send_sizes.end(), 181 | std::next(send_disp.begin())); 182 | std::partial_sum(recv_sizes.begin(), recv_sizes.end(), 183 | std::next(recv_disp.begin())); 184 | // next steps - construct recv buffers and perform communication 185 | std::size_t recv_buf_size = recv_disp.back(); 186 | // make sure that the buffer pointers actually get allocated 187 | std::vector<std::int64_t> indices_recv_buffer(recv_buf_size); 188 | ierr = MPI_Neighbor_alltoallv(indices_send_buffer.data(), send_sizes.data(), 189 | send_disp.data(), MPI_INT64_T, 190 | indices_recv_buffer.data(), recv_sizes.data(), 191 |
recv_disp.data(), MPI_INT64_T, neigh_comm); 192 | dolfinx::MPI::check_error(comm, ierr); 193 | // Step 3: Convert from global to local indices and do entity computation 194 | std::vector<std::int32_t> local_recv_indices(indices_recv_buffer.size()); 195 | imap->global_to_local(indices_recv_buffer, local_recv_indices); 196 | 197 | switch (integral_type) { 198 | case fem::IntegralType::cell: 199 | return local_recv_indices; 200 | break; 201 | case fem::IntegralType::exterior_facet: { 202 | // Remove facets on internal inter-process boundary 203 | std::vector<std::int32_t> ext_facets; 204 | std::sort(local_recv_indices.begin(), local_recv_indices.end()); 205 | std::ranges::set_difference(local_recv_indices, topology->interprocess_facets(), 206 | std::back_inserter(ext_facets)); 207 | auto c_to_f = topology->connectivity(tdim, tdim-1); 208 | auto f_to_c = topology->connectivity(tdim-1, tdim); 209 | ghost_entities.reserve(2*ext_facets.size()); 210 | for (auto& facet : ext_facets) { 211 | if (f_to_c->num_links(facet) == 1) { 212 | auto pair = 213 | dolfinx::fem::impl::get_cell_facet_pairs<1>(facet, f_to_c->links(facet), *c_to_f); 214 | ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end()); 215 | } 216 | } 217 | break; 218 | } 219 | case fem::IntegralType::interior_facet: { 220 | auto c_to_f = topology->connectivity(tdim, tdim-1); 221 | auto f_to_c = topology->connectivity(tdim-1, tdim); 222 | for (auto& facet : local_recv_indices) { 223 | if (f_to_c->num_links(facet) == 2) { 224 | auto pair = 225 | dolfinx::fem::impl::get_cell_facet_pairs<2>(facet, f_to_c->links(facet), *c_to_f); 226 | ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end()); 227 | } 228 | } 229 | } 230 | default: 231 | break; 232 | } 233 | 234 | return ghost_entities; 235 | } 236 | -------------------------------------------------------------------------------- /docker/Dockerfile.end-user: -------------------------------------------------------------------------------- 1 | # Dockerfile describing end-user CUDA-accelerated FEniCS environments 2 | # Modified version of the DOLFINx end user Docker file 3 | # 4 | # Authors: 5 | # Benjamin Pachev 6 | # 7 | 8 | ARG PYVISTA_VERSION=0.44.2 9 | 10 | # Used to set the correct PYTHONPATH for the real and complex install of 11 | # DOLFINx 12 | ARG PYTHON_VERSION=3.12 13 | # Base image for end-user images 14 | ARG BASEIMAGE=benpachev/cudolfinx:dev-env-v0.9.0 15 | ARG CUDOLFINX_TAG=v0.9.0 16 | 17 | FROM ${BASEIMAGE} as cudolfinx 18 | LABEL description="cuDOLFINx (onbuild)" 19 | 20 | ARG PYTHON_VERSION 21 | 22 | WORKDIR /src 23 | 24 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/dolfinx.git 25 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/ffcx.git 26 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/basix.git 27 | RUN git clone --depth 1 --branch 2024.2.0 https://github.com/FEniCS/ufl.git 28 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/bpachev/cuda-dolfinx.git 29 | 30 | RUN cp dolfinx/docker/dolfinx-real-mode /usr/local/bin/dolfinx-real-mode 31 | RUN chmod +x /usr/local/bin/dolfinx-*-mode 32 | 33 | # CMake build type for DOLFINx C++ build. See CMake documentation. 34 | ARG DOLFINX_CMAKE_BUILD_TYPE="Release" 35 | 36 | # Using pip install `.[test]` with --no-dependencies and --no-build-isolation 37 | # does not install necessary packages, hence install build and optional 38 | # dependencies manually here.
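# (For reference, python/build-requirements.txt in this repository lists
# nanobind>=1.8.0, scikit-build-core[pyproject], petsc4py and mpi4py.)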
39 | RUN pip install --no-cache-dir -r dolfinx/python/build-requirements.txt && \ 40 | pip install --no-cache-dir pyamg pytest scipy matplotlib numba # test + optional set 41 | 42 | RUN cd basix && cmake -G Ninja -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} -B build-dir -S ./cpp && \ 43 | cmake --build build-dir && \ 44 | cmake --install build-dir && \ 45 | pip install ./python && \ 46 | cd ../ufl && pip install --no-cache-dir . && \ 47 | cd ../ffcx && pip install --no-cache-dir . && \ 48 | cd ../ && pip install --no-cache-dir ipython 49 | 50 | RUN apt-get -qq update && \ 51 | apt-get install -y libboost-timer-dev libboost-filesystem-dev 52 | 53 | # --no-dependencies is necessary because --target does not contain dependencies, 54 | # e.g. mpi4py - otherwise this leads to an unwanted rebuild. 55 | RUN cd dolfinx && \ 56 | mkdir -p build-real && \ 57 | cd build-real && \ 58 | PETSC_ARCH=linux-gnu-real64-32-cuda cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local/dolfinx-real -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} ../cpp && \ 59 | ninja install && \ 60 | cd ../python && \ 61 | PETSC_ARCH=linux-gnu-real64-32-cuda pip -v install \ 62 | --config-settings=cmake.build-type="${DOLFINX_CMAKE_BUILD_TYPE}" --config-settings=install.strip=false --no-build-isolation --check-build-dependencies \ 63 | --target /usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages --no-dependencies --no-cache-dir '.' 64 | 65 | # Currently cuDOLFINX only supports real mode, as the CUDA version of PETSc does not appear to compile with complex types. 66 | ENV PKG_CONFIG_PATH=/usr/local/dolfinx-real/lib/pkgconfig:$PKG_CONFIG_PATH \ 67 | CMAKE_PREFIX_PATH=/usr/local/dolfinx-real/lib/cmake:$CMAKE_PREFIX_PATH \ 68 | PETSC_ARCH=linux-gnu-real64-32-cuda \ 69 | PYTHONPATH=/usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages:$PYTHONPATH \ 70 | LD_LIBRARY_PATH=/usr/local/dolfinx-real/lib:$LD_LIBRARY_PATH 71 | 72 | RUN cd cuda-dolfinx && \ 73 | mkdir -p build-real && \ 74 | cd build-real && \ 75 | PETSC_ARCH=linux-gnu-real64-32-cuda cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local/dolfinx-real -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} ../cpp && \ 76 | ninja install && \ 77 | cd ../python && \ 78 | PETSC_ARCH=linux-gnu-real64-32-cuda pip -v install \ 79 | --config-settings=cmake.build-type="${DOLFINX_CMAKE_BUILD_TYPE}" --config-settings=install.strip=false --no-build-isolation --check-build-dependencies \ 80 | --target /usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages --no-dependencies --no-cache-dir '.'
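# A hypothetical smoke test that could be inserted at this point to check
# that the bindings at least import (left commented out: it is not part of
# the original image definition, and importing may require a visible GPU):
# RUN python3 -c "import cudolfinx"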
81 | 82 | # Prepending /usr/local to paths is needed to make the correct version of MPI be used (not the one that comes with NVHPC) 83 | # Since this container doesn't currently install GPU aware MPI, PETSc needs the gpu aware MPI option turned off 84 | # TODO: fix the base container to install GPU-aware MPI 85 | ENV PETSC_OPTIONS="-use_gpu_aware_mpi 0" \ 86 | LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \ 87 | PATH=/usr/local/bin:$PATH 88 | -------------------------------------------------------------------------------- /docker/Dockerfile.test-env: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/nvhpc:24.9-devel-cuda12.6-ubuntu24.04 as cudolfinx-dev-env 2 | 3 | ARG ADIOS2_VERSION=2.10.2 4 | ARG DOXYGEN_VERSION=1_13_2 5 | ARG GMSH_VERSION=4_13_1 6 | ARG HDF5_VERSION=1.14.6 7 | ARG KAHIP_VERSION=3.18 8 | # NOTE: The NumPy version (https://pypi.org/project/numpy/#history) 9 | # should be pinned to the most recent NumPy release that is supported by 10 | # the most recent Numba release, see 11 | # https://numba.readthedocs.io/en/stable/user/installing.html#version-support-information 12 | ARG NUMPY_VERSION=2.1.3 13 | ARG PETSC_VERSION=3.22.4 14 | ARG SLEPC_VERSION=3.22.2 15 | ARG SPDLOG_VERSION=1.15.1 16 | 17 | ARG MPICH_VERSION=4.2.3 18 | ARG OPENMPI_SERIES=5.0 19 | ARG OPENMPI_PATCH=7 20 | 21 | ######################################## 22 | 23 | LABEL maintainer="Benjamin Pachev " 24 | LABEL description="Modified FEniCS dev environment with CUDA PETSc installed." 25 | 26 | ARG ADIOS2_VERSION 27 | ARG DOXYGEN_VERSION 28 | ARG GMSH_VERSION 29 | ARG HDF5_VERSION 30 | ARG KAHIP_VERSION 31 | ARG PETSC_VERSION 32 | ARG SLEPC_VERSION 33 | ARG SPDLOG_VERSION 34 | ARG NUMPY_VERSION 35 | ARG MPICH_VERSION 36 | ARG OPENMPI_SERIES 37 | ARG OPENMPI_PATCH 38 | 39 | # The following ARGS are used in the dev-env layer. 40 | # They are safe defaults. They can be overridden by the user. 41 | # Compiler optimisation flags for SLEPc and PETSc, all languages. 42 | ARG PETSC_SLEPC_OPTFLAGS="-O2" 43 | # Turn on PETSc and SLEPc debugging. "yes" or "no". 44 | ARG PETSC_SLEPC_DEBUGGING="no" 45 | 46 | # MPI variant. "mpich" or "openmpi". 47 | ARG MPI="openmpi" 48 | 49 | # Number of build threads to use with make 50 | ARG BUILD_NP=4 51 | 52 | WORKDIR /tmp 53 | 54 | # Environment variables 55 | ENV OPENBLAS_NUM_THREADS=1 \ 56 | OPENBLAS_VERBOSE=0 57 | 58 | # Install dependencies available via apt-get. 59 | # - First set of packages are required to build and run FEniCS. 60 | # - Second set of packages are recommended and/or required to build 61 | # documentation or tests. 62 | # - Third set of packages are optional, but required to run gmsh 63 | # pre-built binaries. 
64 | RUN export DEBIAN_FRONTEND=noninteractive && \ 65 | apt-get -qq update && \ 66 | apt-get -yq --with-new-pkgs -o Dpkg::Options::="--force-confold" upgrade && \ 67 | apt-get -y install \ 68 | cmake \ 69 | g++ \ 70 | gfortran \ 71 | libboost-dev \ 72 | liblapack-dev \ 73 | libopenblas-dev \ 74 | libpugixml-dev \ 75 | ninja-build \ 76 | pkg-config \ 77 | python3-dev \ 78 | python3-pip \ 79 | python3-venv && \ 80 | # 81 | apt-get -y install \ 82 | catch2 \ 83 | git \ 84 | graphviz \ 85 | libeigen3-dev \ 86 | valgrind \ 87 | wget && \ 88 | # 89 | apt-get -y install \ 90 | libglu1 \ 91 | libxcursor-dev \ 92 | libxft2 \ 93 | libxinerama1 \ 94 | libfltk1.3-dev \ 95 | libfreetype6-dev \ 96 | libgl1-mesa-dev \ 97 | libocct-foundation-dev \ 98 | libocct-data-exchange-dev && \ 99 | apt-get clean && \ 100 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 101 | 102 | # Install spdlog from source - Ubuntu version is incompatible with CUDA 12. 103 | RUN wget -nc --quiet https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz && \ 104 | tar xfz v${SPDLOG_VERSION}.tar.gz && \ 105 | cd spdlog-${SPDLOG_VERSION} && \ 106 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DSPDLOG_BUILD_SHARED=ON -DSPDLOG_BUILD_PIC=ON -B build-dir . && \ 107 | cmake --build build-dir && \ 108 | cmake --install build-dir && \ 109 | rm -rf /tmp/* 110 | 111 | # Install Doxygen 112 | RUN apt-get -qq update && \ 113 | apt-get -y install bison flex && \ 114 | wget -nc --quiet https://github.com/doxygen/doxygen/archive/refs/tags/Release_${DOXYGEN_VERSION}.tar.gz && \ 115 | tar xfz Release_${DOXYGEN_VERSION}.tar.gz && \ 116 | cd doxygen-Release_${DOXYGEN_VERSION} && \ 117 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -B build-dir . && \ 118 | cmake --build build-dir && \ 119 | cmake --install build-dir && \ 120 | apt-get -y purge bison flex && \ 121 | apt-get -y autoremove && \ 122 | apt-get clean && \ 123 | rm -rf /tmp/* 124 | 125 | # Install MPI 126 | RUN if [ "$MPI" = "mpich" ]; then \ 127 | wget https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \ 128 | tar xfz mpich-${MPICH_VERSION}.tar.gz && \ 129 | cd mpich-${MPICH_VERSION} && \ 130 | ./configure && \ 131 | make -j${BUILD_NP} install; \ 132 | else \ 133 | wget https://download.open-mpi.org/release/open-mpi/v${OPENMPI_SERIES}/openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH}.tar.gz && \ 134 | tar xfz openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH}.tar.gz && \ 135 | cd openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH} && \ 136 | ./configure && \ 137 | make -j${BUILD_NP} install; \ 138 | fi && \ 139 | ldconfig && \ 140 | rm -rf /tmp/* 141 | 142 | ENV VIRTUAL_ENV=/dolfinx-env 143 | ENV PATH=/dolfinx-env/bin:$PATH 144 | RUN python3 -m venv ${VIRTUAL_ENV} 145 | 146 | # Install Python packages (via pip) 147 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ 148 | pip install --no-cache-dir cython numpy==${NUMPY_VERSION} && \ 149 | CFLAGS="-noswitcherror" pip install --no-cache-dir mpi4py 150 | 151 | # Install KaHIP 152 | RUN wget -nc --quiet https://github.com/kahip/kahip/archive/v${KAHIP_VERSION}.tar.gz && \ 153 | tar -xf v${KAHIP_VERSION}.tar.gz && \ 154 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DNONATIVEOPTIMIZATIONS=on -B build-dir -S KaHIP-${KAHIP_VERSION} && \ 155 | cmake --build build-dir && \ 156 | cmake --install build-dir && \ 157 | rm -rf /tmp/* 158 | 159 | # Install HDF5 160 | # Note: HDF5 CMake install has numerous bugs and inconsistencies. Test carefully. 
161 | # HDF5 overrides CMAKE_INSTALL_PREFIX by default, hence it is set 162 | # below to ensure that HDF5 is installed into a path where it can be 163 | # found. 164 | RUN wget -nc --quiet https://github.com/HDFGroup/hdf5/archive/refs/tags/hdf5_${HDF5_VERSION}.tar.gz && \ 165 | tar xfz hdf5_${HDF5_VERSION}.tar.gz && \ 166 | cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_BUILD_TYPE=Release -DHDF5_ENABLE_PARALLEL=on -DHDF5_ENABLE_Z_LIB_SUPPORT=on -B build-dir -S hdf5-hdf5_${HDF5_VERSION} && \ 167 | cmake --build build-dir && \ 168 | cmake --install build-dir && \ 169 | rm -rf /tmp/* 170 | 171 | # Install ADIOS2 (Python interface in /usr/local/lib), same as GMSH 172 | RUN wget -nc --quiet https://github.com/ornladios/ADIOS2/archive/v${ADIOS2_VERSION}.tar.gz -O adios2-v${ADIOS2_VERSION}.tar.gz && \ 173 | mkdir -p adios2-v${ADIOS2_VERSION} && \ 174 | tar -xf adios2-v${ADIOS2_VERSION}.tar.gz -C adios2-v${ADIOS2_VERSION} --strip-components 1 && \ 175 | cmake -G Ninja -DADIOS2_USE_HDF5=on -DCMAKE_INSTALL_PYTHONDIR=/usr/local/lib/ -DADIOS2_USE_Fortran=off -DBUILD_TESTING=off -DADIOS2_BUILD_EXAMPLES=off -DADIOS2_USE_ZeroMQ=off -B build-dir -S ./adios2-v${ADIOS2_VERSION} && \ 176 | cmake --build build-dir && \ 177 | cmake --install build-dir && \ 178 | rm -rf /tmp/* 179 | 180 | # Install GMSH 181 | RUN git clone -b gmsh_${GMSH_VERSION} --single-branch --depth 1 https://gitlab.onelab.info/gmsh/gmsh.git && \ 182 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DENABLE_BUILD_DYNAMIC=1 -DENABLE_OPENMP=1 -B build-dir -S gmsh && \ 183 | cmake --build build-dir && \ 184 | cmake --install build-dir && \ 185 | rm -rf /tmp/* 186 | 187 | # GMSH installs python library in /usr/local/lib, see: https://gitlab.onelab.info/gmsh/gmsh/-/issues/1414 188 | ENV PYTHONPATH=/usr/local/lib:$PYTHONPATH 189 | 190 | # Install PETSc and petsc4py with real and complex types 191 | ENV PETSC_DIR=/usr/local/petsc SLEPC_DIR=/usr/local/slepc 192 | RUN ln -sf /opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/libcuda.so /opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/libcuda.so.1 193 | RUN apt-get -qq update && \ 194 | apt-get -y install bison flex && \ 195 | git clone --depth=1 -b v${PETSC_VERSION} https://gitlab.com/petsc/petsc.git ${PETSC_DIR} && \ 196 | cd ${PETSC_DIR} && \ 197 | # Real64, 32-bit int with CUDA 198 | LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/ ./configure \ 199 | PETSC_ARCH=linux-gnu-real64-32-cuda \ 200 | --COPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 201 | --CXXOPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 202 | --FOPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 203 | --with-64-bit-indices=no \ 204 | --with-debugging=${PETSC_SLEPC_DEBUGGING} \ 205 | --with-fortran-bindings=no \ 206 | --with-shared-libraries \ 207 | --download-hypre \ 208 | --download-metis \ 209 | --download-mumps-avoid-mpi-in-place \ 210 | --download-mumps \ 211 | --download-ptscotch \ 212 | --download-scalapack \ 213 | --with-cuda\ 214 | --download-spai \ 215 | --download-suitesparse \ 216 | --with-scalar-type=real \ 217 | --with-precision=double && \ 218 | make PETSC_ARCH=linux-gnu-real64-32-cuda ${MAKEFLAGS} all 219 | 220 | # Install petsc4py 221 | RUN cd ${PETSC_DIR}/src/binding/petsc4py && \ 222 | PETSC_ARCH=linux-gnu-real64-32-cuda CFLAGS="-noswitcherror" pip -v install --no-cache-dir --no-build-isolation . 
&& \ 223 | # Cleanup 224 | apt-get -y purge bison flex && \ 225 | apt-get -y autoremove && \ 226 | apt-get clean && \ 227 | rm -rf \ 228 | ${PETSC_DIR}/**/tests/ \ 229 | ${PETSC_DIR}/**/obj/ \ 230 | ${PETSC_DIR}/**/externalpackages/ \ 231 | ${PETSC_DIR}/CTAGS \ 232 | ${PETSC_DIR}/RDict.log \ 233 | ${PETSC_DIR}/TAGS \ 234 | ${PETSC_DIR}/docs/ \ 235 | ${PETSC_DIR}/share/ \ 236 | ${PETSC_DIR}/src/ \ 237 | ${PETSC_DIR}/systems/ && \ 238 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 239 | 240 | WORKDIR /root 241 | 242 | -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(cudolfinx_nanobind) 4 | 5 | # Set C++ standard 6 | set(CMAKE_CXX_STANDARD 20) 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 8 | set(CMAKE_CXX_EXTENSIONS OFF) 9 | 10 | find_package( 11 | Python 12 | COMPONENTS Interpreter Development 13 | REQUIRED 14 | ) 15 | 16 | # Detect the installed nanobind package and import it into CMake 17 | execute_process( 18 | COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir 19 | OUTPUT_STRIP_TRAILING_WHITESPACE 20 | OUTPUT_VARIABLE NB_DIR 21 | ) 22 | list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}") 23 | find_package(nanobind CONFIG REQUIRED) 24 | 25 | execute_process( 26 | COMMAND 27 | ${Python_EXECUTABLE} -c 28 | "import os, sys, basix; sys.stdout.write(os.path.dirname(basix.__file__))" 29 | OUTPUT_VARIABLE BASIX_PY_DIR 30 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 31 | ERROR_VARIABLE BASIX_ERROR_OUT 32 | OUTPUT_STRIP_TRAILING_WHITESPACE 33 | ) 34 | find_package(Basix REQUIRED CONFIG HINTS ${BASIX_PY_DIR}) 35 | 36 | if(Basix_FOUND) 37 | message(STATUS "Found Basix at ${Basix_DIR}") 38 | endif() 39 | 40 | find_package(DOLFINX REQUIRED CONFIG) 41 | 42 | if(DOLFINX_FOUND) 43 | message(STATUS "Found DOLFINx at ${DOLFINX_DIR}") 44 | endif() 45 | 46 | find_package(CUDOLFINX REQUIRED CONFIG) 47 | 48 | if(CUDOLFINX_FOUND) 49 | message(STATUS "Found CUDOLFINx at ${CUDOLFINX_DIR}") 50 | endif() 51 | 52 | find_package(CUDAToolkit REQUIRED) 53 | 54 | # Create the binding library nanobind handles its own calls to 55 | # target_link_libraries 56 | nanobind_add_module( 57 | cpp 58 | NOMINSIZE 59 | cudolfinx/wrappers/cudolfinx.cpp 60 | cudolfinx/wrappers/fem.cpp 61 | cudolfinx/wrappers/petsc.cpp 62 | ) 63 | 64 | # Add strict compiler flags include(CheckCXXCompilerFlag) 65 | # check_cxx_compiler_flag("-Wall -Werror -pedantic" HAVE_PEDANTIC) 66 | 67 | # if(HAVE_PEDANTIC) # target_compile_options(cpp PRIVATE 68 | # -Wall;-Werror;-pedantic) endif() 69 | 70 | # Add DOLFINx libraries 71 | target_link_libraries(cpp PRIVATE dolfinx) 72 | target_link_libraries(cpp PRIVATE cudolfinx) 73 | target_link_libraries(cpp PRIVATE CUDA::cuda_driver CUDA::nvrtc CUDA::cupti) 74 | target_include_directories(cpp SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 75 | 76 | # Check for petsc4py 77 | execute_process( 78 | COMMAND ${Python_EXECUTABLE} -c 79 | "import petsc4py; print(petsc4py.get_include())" 80 | OUTPUT_VARIABLE PETSC4PY_INCLUDE_DIR 81 | RESULT_VARIABLE PETSC4PY_COMMAND_RESULT 82 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 83 | ) 84 | 85 | if(NOT PETSC4PY_COMMAND_RESULT) 86 | message(STATUS "Found petsc4py include directory at ${PETSC4PY_INCLUDE_DIR}") 87 | target_include_directories(cpp PRIVATE ${PETSC4PY_INCLUDE_DIR}) 88 | else() 89 | message(FATAL_ERROR "petsc4py could not be found.") 90 | endif() 91 | 92 | # Check for mpi4py 93 |
execute_process( 94 | COMMAND "${Python_EXECUTABLE}" -c "import mpi4py; print(mpi4py.get_include())" 95 | OUTPUT_VARIABLE MPI4PY_INCLUDE_DIR 96 | RESULT_VARIABLE MPI4PY_COMMAND_RESULT 97 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 98 | ) 99 | 100 | if(NOT MPI4PY_COMMAND_RESULT) 101 | message(STATUS "Found mpi4py include directory at ${MPI4PY_INCLUDE_DIR}") 102 | target_include_directories(cpp PRIVATE ${MPI4PY_INCLUDE_DIR}) 103 | else() 104 | message(FATAL_ERROR "mpi4py could not be found.") 105 | endif() 106 | 107 | set_target_properties(cpp PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) 108 | 109 | install(TARGETS cpp DESTINATION cudolfinx) 110 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # CUDOLFINx Python interface installation 2 | 3 | Below is guidance for building the CUDOLFINx Python interface. 4 | 5 | 1. Build and install the CUDOLFINx C++ library. 6 | 7 | 2. Ensure the Python interface build requirements are installed: 8 | 9 | pip install -r build-requirements.txt 10 | 11 | 3. Build the CUDOLFINx Python interface: 12 | 13 | pip install --check-build-dependencies --no-build-isolation . 14 | 15 | To build in debug and editable mode for development: 16 | 17 | pip -v install --check-build-dependencies --config-settings=build-dir="build" --config-settings=cmake.build-type="Debug" --config-settings=install.strip=false --no-build-isolation -e . 18 | -------------------------------------------------------------------------------- /python/build-requirements.txt: -------------------------------------------------------------------------------- 1 | nanobind>=1.8.0 2 | scikit-build-core[pyproject] 3 | petsc4py 4 | mpi4py 5 | -------------------------------------------------------------------------------- /python/cudolfinx/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | """Main module for CUDOLFINx""" 8 | 9 | from cudolfinx.assemble import CUDAAssembler 10 | from cudolfinx.form import form 11 | from cudolfinx.mesh import ghost_layer_mesh, ghost_layer_meshtags 12 | -------------------------------------------------------------------------------- /python/cudolfinx/bcs.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from cudolfinx import cpp as _cucpp 8 | from dolfinx import cpp as _cpp 9 | from dolfinx.fem.bcs import DirichletBC 10 | import typing 11 | 12 | class CUDADirichletBC: 13 | """Represents a collection of boundary conditions 14 | """ 15 | 16 | def __init__(self, ctx, bcs: typing.List[DirichletBC]): 17 | """Initialize a collection of boundary conditions 18 | """ 19 | 20 | self.bcs = bcs 21 | self._function_spaces = [] 22 | self._bc_lists = [] 23 | self._device_bcs = [] 24 | self._ctx = ctx 25 | 26 | for bc in bcs: 27 | V = bc.function_space 28 | try: 29 | i = self._function_spaces.index(V) 30 | except ValueError: 31 | self._function_spaces.append(V) 32 | self._bc_lists.append([]) 33 | i = -1 # the newly appended function space is the last entry 34 | self._bc_lists[i].append(bc._cpp_object) 35 | 36 | for V, cpp_bcs in zip(self._function_spaces, self._bc_lists): 37 | _cpp_bc_obj = self._make_device_bc(V, cpp_bcs) 38 |
self._device_bcs.append(_cpp_bc_obj) 39 | 40 | def _make_device_bc(self, 41 | V: typing.Union[_cpp.fem.FunctionSpace_float32, _cpp.fem.FunctionSpace_float64], 42 | cpp_bcs: typing.List[typing.Union[_cpp.fem.DirichletBC_float32, _cpp.fem.DirichletBC_float64]] 43 | ): 44 | """Create device bc object wrapping a list of bcs for the same function space""" 45 | 46 | if type(V) is _cpp.fem.FunctionSpace_float32: 47 | return _cucpp.fem.CUDADirichletBC_float32(self._ctx, V, cpp_bcs) 48 | elif type(V) is _cpp.fem.FunctionSpace_float64: 49 | return _cucpp.fem.CUDADirichletBC_float64(self._ctx, V, cpp_bcs) 50 | else: 51 | raise TypeError(f"Invalid type for cpp FunctionSpace object '{type(V)}'") 52 | 53 | def _get_cpp_bcs(self, V: typing.Union[_cpp.fem.FunctionSpace_float32, _cpp.fem.FunctionSpace_float64]): 54 | """Get cpp CUDADirichletBC object 55 | """ 56 | 57 | # Use this to avoid needing hashes (which might not be supported) 58 | # Usually there will be a max of two function spaces associated with a set of bcs 59 | try: 60 | i = self._function_spaces.index(V) 61 | return self._device_bcs[i] 62 | except ValueError: 63 | # return empty collection 64 | return self._make_device_bc(V, []) 65 | 66 | def update(self, bcs: typing.Optional[typing.List[DirichletBC]] = None): 67 | """Update a subset of the boundary conditions. 68 | 69 | Used for cases with time-varying boundary conditions whose device-side values 70 | need to be updated. By default, all boundary conditions are updated. 71 | """ 72 | 73 | if bcs is None: 74 | bcs = self.bcs 75 | _bcs_to_update = [bc._cpp_object for bc in bcs] 76 | 77 | for _cpp_bc, V in zip(self._device_bcs, self._function_spaces): 78 | # filter out anything not contained in the right function space 79 | _cpp_bc.update([_bc for _bc in _bcs_to_update if V.contains(_bc.function_space)]) 80 | 81 | -------------------------------------------------------------------------------- /python/cudolfinx/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from petsc4py import PETSc 8 | from cudolfinx import cpp as _cucpp 9 | 10 | _device = None 11 | 12 | def _init_device(): 13 | """Initialize PETSc device 14 | """ 15 | global _device 16 | d = PETSc.Device() 17 | d.create(PETSc.Device.Type.CUDA) 18 | _device = d 19 | 20 | def get_device(): 21 | """Return PETSc device 22 | """ 23 | 24 | global _device 25 | if _device is None: 26 | _init_device() 27 | return _device 28 | 29 | def get_cuda_context(): 30 | """Return the CUDA context, initializing it if needed 31 | """ 32 | global _device 33 | if _device is None: 34 | _init_device() 35 | return _cucpp.fem.CUDAContext() 36 | -------------------------------------------------------------------------------- /python/cudolfinx/form.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import collections 8 | from cudolfinx.context import get_cuda_context 9 | from cudolfinx import cpp as _cucpp, jit 10 | from dolfinx import fem as fe 11 | from dolfinx import cpp as _cpp 12 | import numpy as np 13 | import typing 14 | import ufl 15 | 16 | class CUDAForm: 17 | """CUDA wrapper class for a dolfinx.fem.Form 18 | """ 19 | 20 | def __init__(self, form: fe.Form): 21 | """Initialize the wrapper 22 | """
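# Typical construction goes through cudolfinx.form() defined later in this
# module, rather than through this class directly. Rough usage sketch (the
# CUDAAssembler construction and assemble_matrix call are assumptions based
# on the package __init__, not taken from this file):
#
#   import cudolfinx
#   a = cudolfinx.form(ufl_form)        # wraps fe.form(ufl_form)
#   assembler = cudolfinx.CUDAAssembler()
#   A = assembler.assemble_matrix(a)    # device-side assembly (assumed API)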
23 | 24 | self._ctx = get_cuda_context() 25 | self._cuda_mesh = _create_mesh_on_device(form.mesh, self._ctx) 26 | 27 | self._dolfinx_form = form 28 | self._wrapped_tabulate_tensors = jit.get_wrapped_tabulate_tensors(form) 29 | ufcx_form_addr = form.module.ffi.cast("uintptr_t", form.module.ffi.addressof(form.ufcx_form)) 30 | 31 | cpp_form = form._cpp_object 32 | if type(cpp_form) is _cpp.fem.Form_float32: 33 | form_cls = _cucpp.fem.CUDAForm_float32 34 | elif type(cpp_form) is _cpp.fem.Form_float64: 35 | form_cls = _cucpp.fem.CUDAForm_float64 36 | else: 37 | raise ValueError(f"Cannot instantiate CUDAForm for Form of type {type(cpp_form)}!") 38 | 39 | _tabulate_tensor_names = [] 40 | _tabulate_tensor_sources = [] 41 | for name, source in self._wrapped_tabulate_tensors: 42 | _tabulate_tensor_names.append(name) 43 | _tabulate_tensor_sources.append(source) 44 | 45 | self._cuda_form = form_cls( 46 | self._ctx, 47 | cpp_form, 48 | ufcx_form_addr, 49 | _tabulate_tensor_names, 50 | _tabulate_tensor_sources 51 | ) 52 | 53 | # TODO expose these parameters to the user 54 | self._cuda_form.compile(self._ctx, max_threads_per_block=1024, min_blocks_per_multiprocessor=1) 55 | 56 | def to_device(self): 57 | """Copy host-side coefficients and constants to the device 58 | """ 59 | 60 | self._cuda_form.to_device(self._ctx) 61 | 62 | @property 63 | def cuda_form(self): 64 | """Return the underlying cpp CUDAForm 65 | """ 66 | 67 | return self._cuda_form 68 | 69 | @property 70 | def cuda_mesh(self): 71 | """Return the underlying cpp CUDAMesh""" 72 | 73 | return self._cuda_mesh 74 | 75 | @property 76 | def dolfinx_form(self): 77 | """Return the underlying Dolfinx form 78 | """ 79 | 80 | return self._dolfinx_form 81 | 82 | @property 83 | def function_spaces(self): 84 | """Return a list of FunctionSpaces corresponding to the form 85 | """ 86 | 87 | return self._dolfinx_form.function_spaces 88 | 89 | class BlockCUDAForm: 90 | """Data structure containing multiple CUDA forms to be used in block assembly.""" 91 | 92 | def __init__( 93 | self, forms: typing.Union[list[CUDAForm], list[list[CUDAForm]]], 94 | restrictions: typing.Optional[ 95 | typing.Union[ 96 | list[np.typing.NDArray[np.int32]], 97 | tuple[list[np.typing.NDArray[np.int32]], list[np.typing.NDArray[np.int32]]] 98 | ]] = None 99 | ): 100 | """Initialize the data structure.""" 101 | 102 | self._forms = forms 103 | self._restrictions = restrictions 104 | 105 | if not len(forms): raise ValueError("Must provide at least one form!") 106 | if type(forms[0]) is CUDAForm: self._init_vector() 107 | else: self._init_matrix() 108 | 109 | def _init_vector(self): 110 | """Initialize vector form.""" 111 | 112 | offset = 0 113 | offsets = [offset] 114 | for i, form in enumerate(self._forms): 115 | # note in dolfinx 0.10.0 dofmap is replaced with dofmaps 116 | # which means this portion will require reworking 117 | dofmap = form.function_spaces[0].dofmap 118 | local_size = dofmap.index_map.size_local 119 | if self._restrictions is not None: 120 | restriction_inds = self._restrictions[i] 121 | # ignore ghosts 122 | restriction_inds = restriction_inds[restriction_inds < local_size] 123 | local_size = len(restriction_inds) 124 | else: 125 | restriction_inds = np.arange(local_size, dtype=np.int32) 126 | target_inds = offset + np.arange(local_size, dtype=np.int32) 127 | offset += local_size * dofmap.index_map_bs 128 | offsets.append(offset) 129 | form.cuda_form.set_restriction([restriction_inds], [target_inds]) 130 | 131 | self._offsets = offsets 132 | comm = 
self._forms[0].dolfinx_form.mesh.comm 133 | self._global_size = comm.allreduce(offsets[-1]) 134 | 135 | 136 | def _init_matrix(self): 137 | """Initialize matrix form.""" 138 | 139 | raise NotImplementedError("Block matrix assembly is not yet implemented!") 140 | 141 | @property 142 | def forms(self): 143 | """Return the list of forms.""" 144 | 145 | return self._forms 146 | 147 | @property 148 | def dolfinx_forms(self): 149 | """Return list of underlying dolfinx forms.""" 150 | 151 | return [f.dolfinx_form for f in self._forms] 152 | 153 | @property 154 | def offsets(self): 155 | """Return list of offsets.""" 156 | 157 | return self._offsets 158 | 159 | @property 160 | def local_size(self): 161 | """Return size of local vector.""" 162 | 163 | return self._offsets[-1] 164 | 165 | @property 166 | def global_size(self): 167 | """Return size of global vector.""" 168 | 169 | return self._global_size 170 | 171 | def form( 172 | form: typing.Union[ufl.Form, typing.Iterable[ufl.Form]], 173 | restriction: typing.Optional[typing.Iterable[np.typing.NDArray[np.int32]]] = None, 174 | **kwargs): 175 | """Create a CUDAForm from a ufl form.""" 176 | 177 | def _create_form(form): 178 | """Recursively convert ufl.Forms to CUDAForm.""" 179 | 180 | if isinstance(form, ufl.Form): 181 | dolfinx_form = fe.form(form, **kwargs) 182 | return CUDAForm(dolfinx_form) 183 | elif isinstance(form, collections.abc.Iterable): 184 | return [_create_form(sub_form) for sub_form in form] 185 | else: 186 | raise TypeError(f"Expected form to be a ufl.Form or an iterable, got type '{type(form)}'!") 187 | 188 | cuda_form = _create_form(form) 189 | # TODO: properly handle restriction for a single form 190 | if isinstance(form, collections.abc.Iterable): 191 | return BlockCUDAForm(cuda_form, restriction) 192 | else: return cuda_form 193 | 194 | def _create_mesh_on_device(cpp_mesh: typing.Union[_cpp.mesh.Mesh_float32, _cpp.mesh.Mesh_float64], ctx: _cucpp.fem.CUDAContext): 195 | """Create device-side mesh data 196 | """ 197 | 198 | if type(cpp_mesh) is _cpp.mesh.Mesh_float32: 199 | return _cucpp.fem.CUDAMesh_float32(ctx, cpp_mesh) 200 | elif type(cpp_mesh) is _cpp.mesh.Mesh_float64: 201 | return _cucpp.fem.CUDAMesh_float64(ctx, cpp_mesh) 202 | else: 203 | raise ValueError(f"Cannot instantiate CUDAMesh for Mesh of type {type(cpp_mesh)}!") 204 | 205 | -------------------------------------------------------------------------------- /python/cudolfinx/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | """Routines for manipulating generated FFCX code 8 | """ 9 | 10 | from dolfinx import fem, cpp 11 | import numpy as np 12 | import pathlib 13 | from typing import Any 14 | 15 | def get_tabulate_tensor_sources(form: fem.Form): 16 | """Given a compiled fem.Form, extract the C source code of the tabulate tensors 17 | """ 18 | 19 | module_file = pathlib.Path(form.module.__file__) 20 | source_filename = module_file.name.split(".")[0] + ".c" 21 | source_file = module_file.parent.joinpath(source_filename) 22 | if not source_file.is_file(): 23 | raise IOError(f"Could not find generated ffcx source file '{source_file}'!") 24 | 25 | tabulate_tensors = [] 26 | parsing_tabulate = False 27 | parsing_header = False 28 | bracket_count = 0 29 | with open(source_file) as fp: 30 | for line in fp: 31 | if "tabulate_tensor_integral" in line and
line.strip().startswith("void"): 32 | parsing_tabulate = True 33 | parsing_header = True 34 | tabulate_id = line.strip().split()[1].split("_")[-1].split("(")[0] 35 | tabulate_body = [] 36 | elif parsing_header: 37 | if line.startswith("{"): 38 | parsing_header = False 39 | bracket_count += 1 40 | elif parsing_tabulate: 41 | if line.startswith("{"): bracket_count += 1 42 | elif line.startswith("}"): bracket_count -= 1 43 | if not bracket_count: 44 | tabulate_tensors.append((tabulate_id, "".join(tabulate_body))) 45 | parsing_tabulate = False 46 | else: 47 | tabulate_body.append(line) 48 | elif "form_integrals_form" in line: 49 | if "{" in line: 50 | arr = line.split("{")[-1].split("}")[0] 51 | ordered_integral_ids = [ 52 | part.strip().split("_")[-1] for part in arr.split(",") 53 | ] 54 | 55 | id_order = {integral_id: i for i, integral_id in enumerate(ordered_integral_ids)} 56 | return sorted(tabulate_tensors, key=lambda tabulate: id_order[tabulate[0]]) 57 | 58 | cuda_tabulate_tensor_header = """ 59 | #define alignas(x) 60 | #define restrict __restrict__ 61 | 62 | typedef unsigned char uint8_t; 63 | typedef unsigned int uint32_t; 64 | typedef double ufc_scalar_t; 65 | 66 | extern "C" __global__ 67 | void tabulate_tensor_{factory_name}({scalar_type}* restrict A, 68 | const {scalar_type}* restrict w, 69 | const {scalar_type}* restrict c, 70 | const {geom_type}* restrict coordinate_dofs, 71 | const int* restrict entity_local_index, 72 | const uint8_t* restrict quadrature_permutation 73 | ) 74 | """ 75 | 76 | def _convert_dtype_to_str(dtype: Any): 77 | """Convert numpy dtype to named C type 78 | """ 79 | 80 | if dtype == np.float32: 81 | return "float" 82 | elif dtype == np.float64: 83 | return "double" 84 | else: 85 | raise TypeError(f"Unsupported dtype: '{dtype}'") 86 | 87 | def get_wrapped_tabulate_tensors(form: fem.Form, backend="cuda"): 88 | """Given a fem.Form, wrap the tabulate tensors for use on a GPU 89 | """ 90 | 91 | if backend != "cuda": 92 | raise NotImplementedError(f"Backend '{backend}' not yet supported.") 93 | 94 | # for now assume same type for form and mesh 95 | # this is typically the default 96 | geom_type = scalar_type = _convert_dtype_to_str(form.dtype) 97 | 98 | res = [] 99 | sources = get_tabulate_tensor_sources(form) 100 | for id, body in sources: 101 | factory_name = "integral_" + id 102 | name = "tabulate_tensor_" + factory_name 103 | header = cuda_tabulate_tensor_header.format( 104 | scalar_type=scalar_type, 105 | geom_type=geom_type, 106 | factory_name=factory_name 107 | ) 108 | wrapped_source = header + "{\n" + body + "}\n" 109 | res.append((name, wrapped_source)) 110 | 111 | return res 112 | 113 | -------------------------------------------------------------------------------- /python/cudolfinx/la.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from cudolfinx import cpp as _cucpp 8 | 9 | class CUDAVector: 10 | """Vector on device 11 | """ 12 | 13 | def __init__(self, ctx, vec): 14 | """Initialize the vector 15 | """ 16 | 17 | self._petsc_vec = vec 18 | self._ctx = ctx 19 | self._cpp_object = _cucpp.fem.CUDAVector(ctx, self._petsc_vec) 20 | 21 | @property 22 | def vector(self): 23 | """Return underlying PETSc vector 24 | """ 25 | 26 | return self._petsc_vec 27 | 28 | def to_host(self): 29 | """Copy device-side values to host 30 | """ 31 | 32 | self._cpp_object.to_host(self._ctx) 33 | 
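# Hypothetical usage sketch (names are illustrative, not part of this module):
#   vec = CUDAVector(ctx, petsc_vec)   # wrap an existing PETSc Vec on the device
#   ...device-side assembly fills the vector...
#   vec.to_host()                      # copy the device values back to the host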
34 | def __del__(self): 35 | """Delete the vector and free up GPU resources 36 | """ 37 | 38 | # Ensure that the cpp CUDAVector is destroyed BEFORE the PETSc vector 39 | del self._cpp_object 40 | 41 | class CUDAMatrix: 42 | """Matrix on device 43 | """ 44 | 45 | def __init__(self, ctx, petsc_mat): 46 | """Initialize the matrix 47 | """ 48 | 49 | self._petsc_mat = petsc_mat 50 | self._ctx = ctx 51 | self._cpp_object = _cucpp.fem.CUDAMatrix(ctx, petsc_mat) 52 | 53 | @property 54 | def mat(self): 55 | """Return underlying PETSc matrix 56 | """ 57 | 58 | return self._petsc_mat 59 | 60 | def assemble(self): 61 | """Call assemble on the underlying PETSc matrix. 62 | 63 | If the PETSc matrix is not a CUDA matrix, then matrix 64 | values will be explicitly copied to the host. 65 | """ 66 | 67 | self._cpp_object.to_host(self._ctx) 68 | 69 | def __del__(self): 70 | """Delete the matrix and free up GPU resources 71 | """ 72 | 73 | # make sure we delete the CUDAMatrix before the PETSc matrix 74 | del self._cpp_object 75 | 76 | 77 | -------------------------------------------------------------------------------- /python/cudolfinx/mesh.py: -------------------------------------------------------------------------------- 1 | from cudolfinx import cpp as _cucpp 2 | from dolfinx import mesh 3 | 4 | def ghost_layer_mesh(domain: mesh.Mesh): 5 | """Add a ghost layer of cells to the given mesh 6 | """ 7 | _ghost_mesh = _cucpp.fem.ghost_layer_mesh(domain._cpp_object, domain._geometry._cpp_object.cmap) 8 | return mesh.Mesh( 9 | _ghost_mesh, 10 | domain._ufl_domain) 11 | 12 | def ghost_layer_meshtags(meshtags: mesh.MeshTags, ghosted_mesh: mesh.Mesh): 13 | """Transfer meshtags to ghost layer mesh.""" 14 | 15 | _cpp_meshtags = _cucpp.fem.ghost_layer_meshtags(meshtags._cpp_object, ghosted_mesh.topology._cpp_object) 16 | return mesh.MeshTags(_cpp_meshtags) 17 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/caster_petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D.
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #ifdef HAS_PETSC 10 | 11 | #include <nanobind/nanobind.h> 12 | #include <petsc4py/petsc4py.h> 13 | #include <petscmat.h> 14 | #include <petscvec.h> 15 | 16 | // nanobind casters for PETSc/petsc4py objects 17 | 18 | namespace nb = nanobind; 19 | 20 | // Import petsc4py on demand 21 | #define VERIFY_PETSC4PY_FROMPY(func) \ 22 | if (!func) \ 23 | { \ 24 | if (import_petsc4py() != 0) \ 25 | return false; \ 26 | } 27 | 28 | #define VERIFY_PETSC4PY_FROMCPP(func) \ 29 | if (!func) \ 30 | { \ 31 | if (import_petsc4py() != 0) \ 32 | return {}; \ 33 | } 34 | 35 | // Macro for casting between PETSc and petsc4py objects 36 | #define PETSC_CASTER_MACRO(TYPE, P4PYTYPE, NAME) \ 37 | template <> \ 38 | class type_caster<_p_##TYPE> \ 39 | { \ 40 | public: \ 41 | NB_TYPE_CASTER(TYPE, const_name(#NAME)) \ 42 | bool from_python(handle src, uint8_t, cleanup_list*) noexcept \ 43 | { \ 44 | VERIFY_PETSC4PY_FROMPY(PyPetsc##P4PYTYPE##_Get); \ 45 | if (PyObject_TypeCheck(src.ptr(), &PyPetsc##P4PYTYPE##_Type) != 0) \ 46 | { \ 47 | value = PyPetsc##P4PYTYPE##_Get(src.ptr()); \ 48 | return true; \ 49 | } \ 50 | else \ 51 | return false; \ 52 | } \ 53 | \ 54 | static handle from_cpp(TYPE src, rv_policy policy, \ 55 | cleanup_list* /*cleanup*/) noexcept \ 56 | { \ 57 | VERIFY_PETSC4PY_FROMCPP(PyPetsc##P4PYTYPE##_New); \ 58 | if (policy == rv_policy::take_ownership) \ 59 | { \ 60 | PyObject* obj = PyPetsc##P4PYTYPE##_New(src); \ 61 | PetscObjectDereference((PetscObject)src); \ 62 | return nb::handle(obj); \ 63 | } \ 64 | else if (policy == rv_policy::automatic_reference \ 65 | or policy == rv_policy::reference) \ 66 | { \ 67 | PyObject* obj = PyPetsc##P4PYTYPE##_New(src); \ 68 | return nb::handle(obj); \ 69 | } \ 70 | else \ 71 | { \ 72 | return {}; \ 73 | } \ 74 | } \ 75 | \ 76 | operator TYPE() { return value; } \ 77 | } 78 | 79 | namespace nanobind::detail 80 | { 81 | PETSC_CASTER_MACRO(Mat, Mat, mat); 82 | PETSC_CASTER_MACRO(Vec, Vec, vec); 83 | } // namespace nanobind::detail 84 | #endif 85 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/cudolfinx.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include <nanobind/nanobind.h> 8 | 9 | namespace nb = nanobind; 10 | 11 | namespace cudolfinx_wrappers 12 | { 13 | void fem(nb::module_& m); 14 | void petsc(nb::module_& m_fem); 15 | } // namespace cudolfinx_wrappers 16 | 17 | NB_MODULE(cpp, m) 18 | { 19 | // Create module for C++ wrappers 20 | m.doc() = "DOLFINx CUDA Python interface"; 21 | m.attr("__version__") = CUDOLFINX_VERSION; 22 | 23 | #ifdef NDEBUG 24 | nanobind::set_leak_warnings(false); 25 | #endif 26 | // Create fem submodule [fem] 27 | nb::module_ fem = m.def_submodule("fem", "FEM module"); 28 | cudolfinx_wrappers::fem(fem); 29 | cudolfinx_wrappers::petsc(fem); 30 | } 31 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/petsc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D.
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "caster_petsc.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace 23 | { 24 | 25 | void petsc_fem_module(nb::module_& m) 26 | { 27 | m.def("create_cuda_matrix", dolfinx::fem::petsc::create_cuda_matrix, 28 | nb::rv_policy::take_ownership, nb::arg("a"), 29 | "Create a PETSc CUDA Mat for a bilinear form."); 30 | } 31 | 32 | } // namespace 33 | 34 | namespace cudolfinx_wrappers 35 | { 36 | void petsc(nb::module_& m_fem) 37 | { 38 | nb::module_ petsc_fem_mod 39 | = m_fem.def_submodule("petsc", "PETSc-specific finite element module"); 40 | petsc_fem_module(petsc_fem_mod); 41 | } 42 | } // namespace cudolfinx_wrappers 43 | 44 | -------------------------------------------------------------------------------- /python/examples/poisson.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import argparse as ap 8 | from mpi4py import MPI 9 | from petsc4py import PETSc 10 | import cudolfinx as cufem 11 | from dolfinx import fem as fe, mesh 12 | from dolfinx.fem import petsc as fe_petsc 13 | import numpy as np 14 | import ufl 15 | import time 16 | from ufl import dx, ds, grad, inner 17 | 18 | def create_mesh(res: int = 10, dim: int = 3): 19 | """Create a uniform simplicial mesh on the unit cube (dim=3) or unit square (dim=2). 20 | 21 | Parameters 22 | ---------- 23 | res - Number of subdivisions along each dimension 24 | dim - Geometric dimension of mesh 25 | 26 | Returns 27 | ---------- 28 | mesh - The mesh object. 29 | """ 30 | 31 | if dim == 3: 32 | return mesh.create_box( 33 | comm = MPI.COMM_WORLD, 34 | points = ((0,0,0), (1, 1, 1)), 35 | n = (res, res, res), 36 | cell_type = mesh.CellType.tetrahedron 37 | ) 38 | elif dim == 2: 39 | return mesh.create_unit_square(MPI.COMM_WORLD, res, res) 40 | 41 | def main(res, cuda=True, degree=1, dim=3): 42 | """Assembles a stiffness matrix for the Poisson problem with the given resolution.
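Parameters
----------
res - Number of subdivisions along each dimension
cuda - If True (default), assemble on the GPU; otherwise assemble on the host
degree - Polynomial degree of the Lagrange space
dim - Geometric dimension of the mesh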
43 | """ 44 | 45 | domain = create_mesh(res, dim=dim) 46 | comm = domain.comm 47 | if cuda and comm.size > 1: 48 | if comm.rank == 0: 49 | print("Using ghost layer mesh for CUDA Assembly") 50 | domain = cufem.ghost_layer_mesh(domain) 51 | 52 | V = fe.functionspace(domain, ("Lagrange", degree)) 53 | u = ufl.TrialFunction(V) 54 | v = ufl.TestFunction(V) 55 | x = ufl.SpatialCoordinate(domain) 56 | if dim == 3: 57 | f = 10*ufl.exp(-((x[0]-.05)**2 + (x[1]-.05)**2 + (x[2]-.05)**2) / .02) 58 | elif dim == 2: 59 | f = 10*ufl.exp(-((x[0]-.05)**2 + (x[1]-.05)**2) / .02) 60 | g = ufl.sin(5*x[0])*ufl.sin(5*x[1]) 61 | a = inner(grad(u), grad(v)) * dx 62 | L = inner(f, v) * dx + inner(g, v) * ds 63 | 64 | facets = mesh.locate_entities_boundary( 65 | domain, 66 | dim=(domain.topology.dim - 1), 67 | marker=lambda x: np.isclose(x[0], 0.0) | np.isclose(x[0], 2.0), 68 | ) 69 | 70 | dofs = fe.locate_dofs_topological(V=V, entity_dim=domain.topology.dim-1, entities=facets) 71 | bc = fe.dirichletbc(value=PETSc.ScalarType(0), dofs=dofs, V=V) 72 | 73 | if cuda: 74 | a = cufem.form(a) 75 | L = cufem.form(L) 76 | asm = cufem.CUDAAssembler() 77 | A = asm.create_matrix(a) 78 | b = asm.create_vector(L) 79 | device_bcs = asm.pack_bcs([bc]) 80 | else: 81 | a = fe.form(a, jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]}) 82 | L = fe.form(L, jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]}) 83 | A = fe_petsc.create_matrix(a) 84 | b = fe_petsc.create_vector(L) 85 | start = time.time() 86 | if cuda: 87 | asm.assemble_matrix(a, A, bcs=device_bcs) 88 | A.assemble() 89 | else: 90 | fe_petsc.assemble_matrix(A, a, bcs=[bc]) 91 | A.assemble() 92 | elapsed = time.time()-start 93 | 94 | timing = comm.gather(elapsed, root=0) 95 | if comm.rank == 0: 96 | timing = np.asarray(timing) 97 | timing = np.max(timing) 98 | # show max over all MPI processes, as that's the rate-limiter 99 | print(f"Res={res}, Num cells", domain.topology.index_map(domain.topology.dim).size_global) 100 | print(f"Assembly timing: {timing}, Dofs: {V.dofmap.index_map.size_global}") 101 | 102 | if __name__ == "__main__": 103 | parser = ap.ArgumentParser() 104 | parser.add_argument("--res", default=10, type=int, help="Number of subdivisions in each dimension.") 105 | parser.add_argument("--degree", default=1, type=int, help="Polynomial degree.") 106 | parser.add_argument("--dim", default=3, type=int, help="Geometric dimension.") 107 | parser.add_argument("--no-cuda", default=False, action="store_true", help="Disable GPU acceleration.") 108 | args = parser.parse_args() 109 | 110 | main(res=args.res, cuda = not args.no_cuda, degree=args.degree, dim=args.dim) 111 | -------------------------------------------------------------------------------- /python/examples/poisson_sum_factorization.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import argparse as ap 8 | from mpi4py import MPI 9 | from petsc4py import PETSc 10 | try: 11 | import cudolfinx as cufem 12 | except ImportError: 13 | print("Must have cudolfinx to test CUDA assembly.") 14 | 15 | from dolfinx import fem as fe, mesh 16 | from dolfinx.fem import petsc as fe_petsc 17 | import numpy as np 18 | import ufl 19 | import time 20 | from ufl import dx, ds, grad, inner 21 | import basix 22 | 23 | def create_mesh(res: int = 10): 24 | """Create a uniform tetrahedral mesh on the unit cube. 
25 | 26 | Parameters 27 | ---------- 28 | res - Number of subdivisions along each dimension 29 | 30 | Returns 31 | ---------- 32 | mesh - The mesh object. 33 | """ 34 | 35 | return mesh.create_box( 36 | comm = MPI.COMM_WORLD, 37 | points = ((0,0,0), (1, 1, 1)), 38 | n = (res, res, res), 39 | cell_type = mesh.CellType.hexahedron, 40 | ghost_mode = mesh.GhostMode.none, 41 | dtype = np.float64 42 | ) 43 | 44 | def main(res, cuda=True, sum_factorization=True, degree=1): 45 | """Assembles a stiffness matrix for the Poisson problem with the given resolution. 46 | """ 47 | 48 | domain = create_mesh(res) 49 | # Tensor product element 50 | family = basix.ElementFamily.P 51 | variant = basix.LagrangeVariant.gll_warped 52 | cell_type = domain.basix_cell() 53 | 54 | basix_element = basix.create_tp_element( 55 | family, cell_type, degree, variant 56 | ) # doesn't work with tp element, why? 57 | element = basix.ufl._BasixElement(basix_element) # basix ufl element 58 | V = fe.functionspace(domain, element) 59 | u = ufl.TrialFunction(V) 60 | v = ufl.TestFunction(V) 61 | x = ufl.SpatialCoordinate(domain) 62 | f = 10*ufl.exp(-((x[0]-.5)**2 + (x[1]-.5)**2 + (x[2]-.5)**2) / .02) 63 | g = ufl.sin(5*x[0])*ufl.sin(5*x[1]) 64 | a = inner(grad(u), grad(v)) * dx 65 | L = inner(f, v) * dx + inner(g, v) * ds 66 | 67 | facets = mesh.locate_entities_boundary( 68 | domain, 69 | dim=(domain.topology.dim - 1), 70 | marker=lambda x: np.isclose(x[0], 0.0) | np.isclose(x[0], 2.0), 71 | ) 72 | 73 | dofs = fe.locate_dofs_topological(V=V, entity_dim=domain.topology.dim-1, entities=facets) 74 | bc = fe.dirichletbc(value=PETSc.ScalarType(0), dofs=dofs, V=V) 75 | 76 | form_compiler_options = {"sum_factorization": sum_factorization} 77 | 78 | if cuda: 79 | a = cufem.form(a, form_compiler_options=form_compiler_options) 80 | asm = cufem.CUDAAssembler() 81 | A = asm.create_matrix(a) 82 | device_bcs = asm.pack_bcs([bc]) 83 | else: 84 | a = fe.form( 85 | a, 86 | form_compiler_options=form_compiler_options, 87 | jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]} 88 | ) 89 | A = fe_petsc.create_matrix(a) 90 | 91 | start = time.time() 92 | if cuda: 93 | asm.assemble_matrix(a, A, bcs=device_bcs) 94 | else: 95 | fe_petsc.assemble_matrix(A, a, bcs=[bc]) 96 | A.assemble() 97 | elapsed = time.time()-start 98 | 99 | timing = MPI.COMM_WORLD.gather(elapsed, root=0) 100 | if MPI.COMM_WORLD.rank == 0: 101 | timing = np.asarray(timing) 102 | timing = np.max(timing) 103 | # show max over all MPI processes, as that's the rate-limiter 104 | print(f"Res={res}, Num cells", domain.topology.index_map(domain.topology.dim).size_global) 105 | print(f"Assembly timing: {timing}, Dofs: {V.dofmap.index_map.size_global}") 106 | 107 | if __name__ == "__main__": 108 | parser = ap.ArgumentParser() 109 | parser.add_argument("--res", default=10, type=int, help="Number of subdivisions in each dimension.") 110 | parser.add_argument("--degree", default=1, type=int, help="Polynomial degree.") 111 | parser.add_argument("--no-sum-factorization", default=False, action="store_true", help="Disable sum factorization") 112 | parser.add_argument("--no-cuda", default=False, action="store_true", help="Disable GPU acceleration.") 113 | args = parser.parse_args() 114 | 115 | main( 116 | res = args.res, 117 | cuda = not args.no_cuda, 118 | sum_factorization = not args.no_sum_factorization, 119 | degree = args.degree 120 | ) 121 | -------------------------------------------------------------------------------- /python/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | # The CUDOLFINx Python interface must be built without build isolation (PEP517) 2 | # due to its runtime and build time dependency on system built petsc4py and 3 | # mpi4py. 4 | # pip install -r build-requirements.txt 5 | [build-system] 6 | requires = [ 7 | "scikit-build-core[pyproject]>=0.5", 8 | "nanobind>=1.8.0", 9 | "petsc4py", 10 | "mpi4py", 11 | ] 12 | build-backend = "scikit_build_core.build" 13 | 14 | [project] 15 | name = "fenics-cudolfinx" 16 | version = "0.9.0" 17 | description = "CUDA DOLFINx Python interface" 18 | readme = "../README.md" 19 | requires-python = ">=3.9.0" 20 | license = { file = "../COPYING.LESSER" } 21 | authors = [ 22 | { email = "benjaminpachev@gmail.com" }, 23 | { name = "Benjamin Pachev" }, 24 | ] 25 | dependencies = [ 26 | "numpy>=1.21", 27 | "cffi", 28 | "petsc4py", 29 | "mpi4py", 30 | "fenics-basix>=0.9.0,<0.10.0", 31 | "fenics-dolfinx>=0.9.0,<0.10.0", 32 | "fenics-ffcx>=0.9.0,<0.10.0", 33 | "fenics-ufl>=2024.2.0,<2024.3.0", 34 | ] 35 | 36 | [project.optional-dependencies] 37 | docs = ["markdown", "pyyaml", "sphinx", "sphinx_rtd_theme"] 38 | lint = ["ruff"] 39 | optional = ["numba"] 40 | test = ["pytest", "sympy", "scipy", "matplotlib", "fenics-dolfinx[optional]"] 41 | ci = [ 42 | "mypy", 43 | "pytest-xdist", 44 | "types-setuptools", 45 | "fenics-dolfinx[build]", 46 | "fenics-dolfinx[docs]", 47 | "fenics-dolfinx[lint]", 48 | "fenics-dolfinx[optional]", 49 | "fenics-dolfinx[test]", 50 | ] 51 | 52 | [tool.scikit-build] 53 | wheel.packages = ["cudolfinx"] 54 | sdist.exclude = ["*.cpp"] 55 | cmake.build-type = "Release" 56 | wheel.license-files = ["../COPYING*"] 57 | 58 | [tool.pytest] 59 | junit_family = "xunit2" 60 | 61 | [tool.pytest.ini_options] 62 | markers = ["skip_in_parallel: marks tests that should be run in serial only."] 63 | 64 | [tool.mypy] 65 | # Suggested at https://blog.wolt.com/engineering/2021/09/30/professional-grade-mypy-configuration/ 66 | # Goal would be to make all of the below True long-term 67 | disallow_untyped_defs = false 68 | disallow_any_unimported = false 69 | no_implicit_optional = false 70 | check_untyped_defs = false 71 | warn_return_any = false 72 | warn_unused_ignores = false 73 | show_error_codes = true 74 | ignore_missing_imports = true 75 | 76 | 77 | [tool.ruff] 78 | line-length = 100 79 | indent-width = 4 80 | 81 | [tool.ruff.lint] 82 | select = [ 83 | "E", # pycodestyle 84 | "W", # pycodestyle 85 | "F", # pyflakes 86 | "I", # isort - use standalone isort 87 | "RUF", # Ruff-specific rules 88 | "UP", # pyupgrade 89 | "ICN", # flake8-import-conventions 90 | "NPY", # numpy-specific rules 91 | "FLY", # use f-string not static joins 92 | ] 93 | ignore = ["UP007", "RUF012"] 94 | allowed-confusables = ["σ"] 95 | 96 | [tool.ruff.lint.isort] 97 | known-first-party = ["basix", "dolfinx", "ffcx", "ufl", "cudolfinx"] 98 | known-third-party = ["gmsh", "numba", "numpy", "pytest", "pyvista"] 99 | section-order = [ 100 | "future", 101 | "standard-library", 102 | "mpi", 103 | "third-party", 104 | "first-party", 105 | "local-folder", 106 | ] 107 | 108 | [tool.ruff.lint.isort.sections] 109 | "mpi" = ["mpi4py", "petsc4py"] 110 | -------------------------------------------------------------------------------- /python/test/test_cuda_assembly.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: 
LGPL-3.0-or-later 6 | 7 | import petsc4py 8 | from petsc4py import PETSc 9 | from mpi4py import MPI 10 | from dolfinx import fem as fe, mesh 11 | from dolfinx.fem import petsc 12 | import ufl 13 | import numpy as np 14 | import cudolfinx as cufem 15 | from cudolfinx.form import BlockCUDAForm 16 | from basix.ufl import element, mixed_element 17 | 18 | """ 19 | @author Benjamin Pachev 20 | @copyright 2024 21 | 22 | A set of simple variational forms to test the correctness of CUDA-accelerated assembly. 23 | """ 24 | 25 | 26 | def make_mixed_form(): 27 | """Test compilation of a mixed form. 28 | """ 29 | 30 | domain = mesh.create_unit_square(MPI.COMM_WORLD, 10, 10, mesh.CellType.triangle) 31 | el = element("P", domain.basix_cell(), 1) 32 | 33 | V = fe.functionspace(domain, el) 34 | u = ufl.TrialFunction(V) 35 | p = ufl.TestFunction(V) 36 | A = ufl.dot(ufl.grad(u), ufl.grad(p)) * ufl.dx 37 | F = fe.form(A) 38 | mat = fe.assemble_matrix(F) 39 | 40 | def make_test_domain(): 41 | """Make a test domain 42 | """ 43 | 44 | n = 19 45 | m = 27 46 | return mesh.create_unit_square(MPI.COMM_WORLD, n, m, mesh.CellType.triangle) 47 | 48 | def make_ufl(domain=None): 49 | """Create the UFL needed for making the forms 50 | """ 51 | 52 | if domain is None: 53 | domain = make_test_domain() 54 | 55 | V = fe.functionspace(domain, ("P", 1)) 56 | V_dg = fe.functionspace(domain, ("DG", 1)) 57 | u = fe.Function(V) 58 | p = ufl.TestFunction(V) 59 | p_dg = ufl.TestFunction(V_dg) 60 | n = ufl.FacetNormal(domain) 61 | u.interpolate(lambda x: x[0]**2 + x[1]) 62 | u_dg = fe.Function(V_dg) 63 | u_dg.interpolate(lambda x: x[0]**2 + x[1]) 64 | kappa = fe.Function(V) 65 | kappa.interpolate(lambda x: np.sin(x[0])*np.cos(x[1])) 66 | 67 | cell_residual = (ufl.exp(u)*p*kappa + ufl.dot(ufl.grad(u), ufl.grad(p))) * ufl.dx 68 | exterior_facet_residual = u*kappa*p * ufl.dot(ufl.grad(u), n) * ufl.ds 69 | interior_facet_residual = ufl.avg(p_dg) * ufl.avg(kappa) * ufl.avg(u_dg**2) * ufl.dS 70 | 71 | cell_jac = ufl.derivative(cell_residual, u) 72 | exterior_jac = ufl.derivative(exterior_facet_residual, u) 73 | interior_jac = ufl.derivative(interior_facet_residual, u_dg) 74 | 75 | f = fe.Function(V) 76 | f.interpolate(lambda x: x[0] +x[1]) 77 | dofs = fe.locate_dofs_geometrical(V, lambda x: np.isclose(x[0], 0)) 78 | bc = fe.dirichletbc(f, dofs) 79 | 80 | return { 81 | "coeff": kappa, 82 | "bcs": [bc], 83 | "vector": [cell_residual, exterior_facet_residual, interior_facet_residual], 84 | "matrix": [cell_jac, exterior_jac, interior_jac]} 85 | 86 | def test_assembly(): 87 | """Test correctness of assembly 88 | """ 89 | 90 | ufl_forms = make_ufl() 91 | 92 | for i, form in enumerate(ufl_forms["vector"]): 93 | fenics_form = fe.form(form) 94 | vec = petsc.create_vector(fenics_form) 95 | petsc.assemble_vector(vec, fenics_form) 96 | 97 | for i, form in enumerate(ufl_forms["matrix"]): 98 | fenics_form = fe.form(form) 99 | mat = petsc.create_matrix(fenics_form) 100 | mat.zeroEntries() 101 | petsc.assemble_matrix(mat, fenics_form) 102 | mat.assemble() 103 | 104 | def compare_mats(matcsr, matpetsc): 105 | """Compare a native FEniCS MatrixCSR to a PETSc matrix 106 | """ 107 | 108 | indptr, indices, data = matpetsc.getValuesCSR() 109 | bad = np.where(~np.isclose(matcsr.data, data))[0] 110 | assert np.allclose(matcsr.data, data) 111 | 112 | def compare_vecs(vecfenics, vecpetsc): 113 | assert np.allclose(vecfenics.array, vecpetsc.array) 114 | 115 | def test_cuda_assembly(): 116 | """Check assembly on GPU 117 | """ 118 | 119 | 120 | ufl_forms = make_ufl() 
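# Assemble each residual and Jacobian with both the host (dolfinx) and device (cudolfinx) code paths, then compare the results entrywise below.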
121 | asm = cufem.CUDAAssembler() 122 | 123 | for i, form in enumerate(ufl_forms['vector']): 124 | if i == 0: continue 125 | f = fe.form(form) 126 | vec1 = fe.assemble_vector(f) 127 | vec2 = asm.assemble_vector(cufem.form(form)) 128 | compare_vecs(vec1, vec2.vector) 129 | 130 | for i, form in enumerate(ufl_forms['matrix']): 131 | f = fe.form(form) 132 | Mat1 = fe.assemble_matrix(f, bcs=ufl_forms['bcs']) 133 | Mat2 = asm.assemble_matrix(cufem.form(form), bcs=ufl_forms['bcs']) 134 | Mat2.assemble() 135 | # now we need to compare the two 136 | compare_mats(Mat1, Mat2.mat) 137 | 138 | def test_reassembly(): 139 | """Ensure correct assembly when coefficients are updated 140 | """ 141 | 142 | ufl_forms = make_ufl() 143 | coeff = ufl_forms["coeff"] 144 | cuda_vec_form = cufem.form(ufl_forms["vector"][0]) 145 | vec_form = cuda_vec_form.dolfinx_form 146 | #mat_form = fe.form(ufl_forms["matrix"][0]) 147 | asm = cufem.CUDAAssembler() 148 | vec_cuda = asm.assemble_vector(cuda_vec_form) 149 | vec_fe = fe.assemble_vector(vec_form) 150 | compare_vecs(vec_fe, vec_cuda.vector) 151 | 152 | for d in [2,3]: 153 | coeff.interpolate(lambda x: x[0]**d + x[1]**d) 154 | vec_fe.array[:] = 0 155 | cuda_vec_form.to_device() 156 | fe.assemble_vector(vec_fe.array, vec_form) 157 | asm.assemble_vector(cuda_vec_form, vec_cuda) 158 | 159 | compare_vecs(vec_fe, vec_cuda.vector) 160 | 161 | def test_lifting(): 162 | """Ensure lifting and bc setting work correctly 163 | """ 164 | 165 | ufl_forms = make_ufl() 166 | asm = cufem.CUDAAssembler() 167 | for vec_form, mat_form in zip(ufl_forms['vector'][1:2], ufl_forms['matrix'][1:2]): 168 | L = fe.form(vec_form) 169 | vec_cuda = asm.assemble_vector(cufem.form(vec_form)) 170 | vec_fe = fe.assemble_vector(L) 171 | cuda_a = cufem.form(mat_form) 172 | a = cuda_a.dolfinx_form 173 | compare_vecs(vec_fe, vec_cuda.vector) 174 | fe.set_bc(vec_fe.array, ufl_forms['bcs']) 175 | asm.set_bc(vec_cuda, ufl_forms['bcs'], L.function_spaces[0]) 176 | compare_vecs(vec_fe, vec_cuda.vector) 177 | fe.apply_lifting(vec_fe.array, [a], [ufl_forms['bcs']]) 178 | asm.apply_lifting(vec_cuda, [cuda_a], [ufl_forms['bcs']]) 179 | compare_vecs(vec_fe, vec_cuda.vector) 180 | 181 | def test_block_assembly(): 182 | """Test that basic block assembly works properly.""" 183 | 184 | domain = make_test_domain() 185 | V1 = fe.functionspace(domain, ("P", 1)) 186 | V2 = fe.functionspace(domain, ("P", 1)) 187 | p1, p2 = ufl.TestFunction(V1), ufl.TestFunction(V2) 188 | 189 | u1, u2 = fe.Function(V1), fe.Function(V2) 190 | u1.interpolate(lambda x: x[0]**2 + x[1]**3) 191 | u2.interpolate(lambda x: 1 + x[0] + x[1]**2) 192 | b1 = ufl.dot(ufl.grad(u1), ufl.grad(p1)) * ufl.dx 193 | b2 = ufl.dot(ufl.grad(u2), ufl.grad(p2)) * ufl.dx 194 | 195 | asm = cufem.CUDAAssembler() 196 | cuda_L = cufem.form([b1,b2]) 197 | 198 | vec_cuda = asm.create_vector_block(cuda_L) 199 | asm.assemble_vector_block(cuda_L, vec_cuda) 200 | 201 | vec_fe = fe.petsc.create_vector_block(cuda_L.dolfinx_forms) 202 | # TODO - update this when switching to DOLFINx v0.10.0 203 | fe.petsc.assemble_vector_block(vec_fe, cuda_L.dolfinx_forms, [[None], [None]]) 204 | compare_vecs(vec_fe, vec_cuda.vector) 205 | 206 | -------------------------------------------------------------------------------- /python/test/test_multigpu_assembly.py: -------------------------------------------------------------------------------- 1 | from test_cuda_assembly import make_test_domain, make_ufl 2 | from mpi4py import MPI 3 | import cudolfinx as cufem 4 | from dolfinx import fem as fe 5 | 
from dolfinx.fem import petsc as fe_petsc 6 | import numpy as np 7 | from petsc4py import PETSc 8 | import json 9 | 10 | def compute_universal_dofmap(mesh, V, res=1000): 11 | """Map the global array of dofs to unique geometric information 12 | 13 | This is needed to compute maps between DG dofs on meshes with different partitioning schemes 14 | """ 15 | 16 | num_local_dofs = V.dofmap.index_map.size_local 17 | 18 | c_to_dofs = V.dofmap.map() 19 | dofs_to_cells = np.zeros(num_local_dofs, dtype=int) 20 | for i, cell in enumerate(c_to_dofs): 21 | for dof in cell: 22 | if dof >= num_local_dofs: continue 23 | dofs_to_cells[dof] = i 24 | dof_coords = V.tabulate_dof_coordinates()[:num_local_dofs] 25 | cell_coords = mesh.geometry.x[mesh.geometry.dofmap] 26 | dof_cell_coords = cell_coords[dofs_to_cells] 27 | dof_coords = mesh.comm.gather(dof_coords, root=0) 28 | dof_cell_coords = mesh.comm.gather(dof_cell_coords, root=0) 29 | if mesh.comm.rank == 0: 30 | dof_coords = (res*np.concatenate(dof_coords)).astype(int) 31 | dof_cell_coords = (res*np.concatenate(dof_cell_coords)).astype(int) 32 | i = 0 33 | keys_to_dofs = {} 34 | keys = [] 35 | for d_coords, d_cell_coords in zip(dof_coords, dof_cell_coords): 36 | k = (tuple(d_coords.tolist()), tuple(sorted([tuple(arr.tolist()) for arr in d_cell_coords]))) 37 | keys_to_dofs[k] = i 38 | keys.append(k) 39 | i += 1 40 | 41 | return keys, keys_to_dofs 42 | 43 | def compare_parallel_matrices(mat1, mat2): 44 | """Compare two distributed PETSc matrices 45 | """ 46 | 47 | _, _, data1 = mat1.getValuesCSR() 48 | _, _, data2 = mat2.getValuesCSR() 49 | sum1 = MPI.COMM_WORLD.gather(data1.sum(), root=0) 50 | sum2 = MPI.COMM_WORLD.gather(data2.sum(), root=0) 51 | if MPI.COMM_WORLD.rank == 0: 52 | sum1, sum2 = sum(sum1), sum(sum2) 53 | print(sum1, sum2, np.allclose(sum1, sum2)) 54 | return np.allclose(sum1, sum2) 55 | 56 | def compare_parallel_vectors(vec1, vec2): 57 | """Compare two distributed PETSc vectors 58 | """ 59 | 60 | sum1 = MPI.COMM_WORLD.gather(vec1.array[:].sum(), root=0) 61 | sum2 = MPI.COMM_WORLD.gather(vec2.array[:].sum(), root=0) 62 | if MPI.COMM_WORLD.rank == 0: 63 | sum1, sum2 = sum(sum1), sum(sum2) 64 | print(sum1, sum2, np.allclose(sum1, sum2)) 65 | return np.allclose(sum1, sum2) 66 | 67 | def test_multigpu_assembly(): 68 | """Check assembly operations across multiple GPUs 69 | """ 70 | 71 | domain = make_test_domain() 72 | regular_ufl = make_ufl() 73 | ghosted_domain = cufem.ghost_layer_mesh(domain) 74 | ghosted_ufl = make_ufl(ghosted_domain) 75 | asm = cufem.CUDAAssembler() 76 | for form1, form2 in zip(regular_ufl['matrix'], ghosted_ufl['matrix']): 77 | form1 = fe.form(form1) 78 | form2 = cufem.form(form2) 79 | regular_mat = fe_petsc.create_matrix(form1) 80 | regular_mat.zeroEntries() 81 | fe_petsc.assemble_matrix(regular_mat, form1, bcs=regular_ufl['bcs']) 82 | regular_mat.assemble() 83 | cuda_mat = asm.assemble_matrix(form2, bcs=ghosted_ufl['bcs']) 84 | cuda_mat.assemble() 85 | compare_parallel_matrices(regular_mat, cuda_mat.mat) 86 | 87 | for form1, form2 in zip(regular_ufl['vector'], ghosted_ufl['vector']): 88 | form1 = fe.form(form1) 89 | form2 = cufem.form(form2) 90 | regular_vec = fe_petsc.create_vector(form1) 91 | with regular_vec.localForm() as loc: 92 | loc.set(0) 93 | fe_petsc.assemble_vector(regular_vec, form1) 94 | regular_vec.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE) 95 | cuda_vec = asm.assemble_vector(form2) 96 | good = compare_parallel_vectors(regular_vec, cuda_vec.vector) 97 | 98 | if __name__ == "__main__": 99 | 100
| test_multigpu_assembly() 101 | -------------------------------------------------------------------------------- /spack/packages/cuda-dolfinx/package.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2024 Lawrence Livermore National Security, LLC and other 2 | # Spack Project Developers. See the top-level COPYRIGHT file for details. 3 | # 4 | # SPDX-License-Identifier: (Apache-2.0 OR MIT) 5 | 6 | from spack.package import * 7 | 8 | 9 | class CudaDolfinx(CMakePackage): 10 | """CUDA accelerated extension of DOLFINx from the FEniCS project.""" 11 | 12 | homepage = "https://github.com/bpachev/cuda-dolfinx" 13 | git = "https://github.com/bpachev/cuda-dolfinx.git" 14 | url = "https://github.com/bpachev/cuda-dolfinx/archive/refs/tags/v0.9.0.zip" 15 | 16 | maintainers("bpachev") 17 | license("LGPL-3.0-or-later", checked_by="bpachev") 18 | 19 | version("main", branch="main") 20 | version("0.9.0", sha256="5c93155e58eee139985e9e9341cf7d8b29f8c9cbc51ccdf05134cdfb70ae105d") 21 | 22 | depends_on("cxx", type="build") 23 | 24 | depends_on("fenics-dolfinx@0.9", when="@0.9:") 25 | depends_on("py-fenics-dolfinx@0.9", when="@0.9:") 26 | depends_on("petsc+shared+mpi+cuda") 27 | 28 | root_cmakelists_dir = "cpp" 29 | 30 | def cmake_args(self): 31 | return [self.define("CUDOLFINX_SKIP_BUILD_TESTS", True)] 32 | -------------------------------------------------------------------------------- /spack/packages/py-cuda-dolfinx/package.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2024 Lawrence Livermore National Security, LLC and other 2 | # Spack Project Developers. See the top-level COPYRIGHT file for details. 3 | # 4 | # SPDX-License-Identifier: (Apache-2.0 OR MIT) 5 | 6 | from spack.package import * 7 | 8 | 9 | class PyCudaDolfinx(PythonPackage): 10 | """Python interface for CUDA acceleration of DOLFINx in the FEniCS project.""" 11 | 12 | homepage = "https://github.com/bpachev/cuda-dolfinx" 13 | url = "https://github.com/bpachev/cuda-dolfinx/archive/refs/tags/v0.9.0.zip" 14 | git = "https://github.com/bpachev/cuda-dolfinx.git" 15 | 16 | maintainers("bpachev") 17 | 18 | license("LGPL-3.0-or-later") 19 | version("main", branch="main") 20 | version("0.9.0", sha256="5c93155e58eee139985e9e9341cf7d8b29f8c9cbc51ccdf05134cdfb70ae105d") 21 | 22 | depends_on("cxx", type="build") 23 | depends_on("cmake@3.21:", when="@0.9:", type="build") 24 | depends_on("cuda-dolfinx@main", when="@main") 25 | depends_on("cuda-dolfinx@0.9.0", when="@0.9.0") 26 | depends_on("pkgconfig", type="build") 27 | depends_on("py-nanobind@2:", when="@0.9:", type="build") 28 | depends_on("py-scikit-build-core+pyproject@0.5:", when="@0.9:", type="build") 29 | 30 | build_directory = "python" 31 | 32 | -------------------------------------------------------------------------------- /spack/repo.yaml: -------------------------------------------------------------------------------- 1 | repo: 2 | namespace: 'cudolfinx' 3 | --------------------------------------------------------------------------------