├── COPYING ├── COPYING.LESSER ├── README.md ├── cpp ├── CMakeLists.txt ├── cmake │ ├── modules │ │ └── FindUFCx.cmake │ ├── post-install │ │ └── CMakeLists.txt │ └── templates │ │ ├── CUDOLFINXConfig.cmake.in │ │ ├── cmake_uninstall.cmake.in │ │ ├── cudolfinx.conf.in │ │ └── cudolfinx.pc.in └── cudolfinx │ ├── CMakeLists.txt │ ├── common │ ├── CMakeLists.txt │ ├── CUDA.cpp │ ├── CUDA.h │ ├── CUDAStore.h │ └── version.h.in │ ├── cudolfinx.h │ ├── fem │ ├── CMakeLists.txt │ ├── CUDAAssembler.cpp │ ├── CUDAAssembler.h │ ├── CUDACoefficient.h │ ├── CUDADirichletBC.h │ ├── CUDADofMap.cpp │ ├── CUDADofMap.h │ ├── CUDAForm.h │ ├── CUDAFormCoefficients.h │ ├── CUDAFormConstants.h │ ├── CUDAFormIntegral.cpp │ ├── CUDAFormIntegral.h │ └── petsc.h │ ├── la │ ├── CMakeLists.txt │ ├── CUDAMatrix.cpp │ ├── CUDAMatrix.h │ ├── CUDASeqMatrix.cpp │ ├── CUDASeqMatrix.h │ ├── CUDAVector.cpp │ ├── CUDAVector.h │ ├── petsc.cpp │ └── petsc.h │ └── mesh │ ├── CMakeLists.txt │ ├── CUDAMesh.h │ ├── CUDAMeshEntities.h │ ├── util.cpp │ └── util.h ├── docker ├── Dockerfile.end-user └── Dockerfile.test-env ├── python ├── CMakeLists.txt ├── README.md ├── build-requirements.txt ├── cudolfinx │ ├── __init__.py │ ├── assemble.py │ ├── bcs.py │ ├── context.py │ ├── form.py │ ├── jit.py │ ├── la.py │ ├── mesh.py │ └── wrappers │ │ ├── caster_petsc.h │ │ ├── cudolfinx.cpp │ │ ├── fem.cpp │ │ └── petsc.cpp ├── examples │ ├── poisson.py │ └── poisson_sum_factorization.py ├── pyproject.toml └── test │ ├── test_cuda_assembly.py │ └── test_multigpu_assembly.py └── spack ├── packages ├── cuda-dolfinx │ └── package.py └── py-cuda-dolfinx │ └── package.py └── repo.yaml /COPYING.LESSER: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 
36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. 
A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # About 2 | 3 | This repository is an add-on extension to the DOLFINx library providing CUDA accelerated assembly routines. It complements the CUDA linear solvers in PETSc to enable fully GPU-accelerated DOLFINx codes. 
It is designed to enable GPU acceleration for existing codes with minimal changes. 4 | 5 | # Basic Usage 6 | 7 | ``` 8 | import cudolfinx as cufem 9 | 10 | # given UFL forms A and L representing a stiffness matrix and right-hand side 11 | cuda_A = cufem.form(A) 12 | cuda_L = cufem.form(L) 13 | asm = cufem.CUDAAssembler() 14 | # returns a custom type CUDAMatrix 15 | mat = asm.assemble_matrix(cuda_A) 16 | mat.assemble() 17 | # get PETSc matrix 18 | petsc_mat = mat.mat() 19 | # returns a custom type CUDAVector 20 | vec = asm.assemble_vector(cuda_L) 21 | # get PETSc vector 22 | petsc_vec = vec.vector() 23 | ```
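The assembled objects are ordinary PETSc handles, so they can be passed straight to PETSc's CUDA-capable solvers. A minimal sketch of the solve step, assuming `petsc4py` and the `petsc_mat`/`petsc_vec` handles from above (the CG/Jacobi solver choice is illustrative, not prescribed by the library):

```
from petsc4py import PETSc

# Solve A x = b with a Krylov solver; a vector created from the assembled
# matrix inherits its layout (and device-side type).
ksp = PETSc.KSP().create(petsc_mat.comm)
ksp.setOperators(petsc_mat)
ksp.setType("cg")
ksp.getPC().setType("jacobi")
x = petsc_mat.createVecRight()
ksp.solve(petsc_vec, x)
```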
24 | 25 | # Dependencies 26 | 27 | - dolfinx 0.9.0 28 | - PETSc with CUDA support 29 | - CUDA Toolkit 12.x 30 | 31 | # Installation 32 | 33 | There are three ways to install `cudolfinx`, in increasing order of difficulty. Currently, it is not possible to use `cudolfinx` with the existing Conda and Docker distributions of `dolfinx`, because these force installation of PETSc without CUDA support. Consequently, installing `cudolfinx` requires a custom build of the `dolfinx` dependency stack with CUDA-enabled PETSc. 34 | 35 | ## Docker 36 | 37 | Using Docker is by far the easiest approach. 38 | 39 | ``` 40 | docker run --gpus all -it benpachev/cudolfinx:v0.9.0-cuda12.6 41 | ``` 42 | You may experience errors with the prebuilt container due to a CUDA Toolkit or MPI version mismatch between the host and container. In this case, the Dockerfiles in `docker/` can be modified to use a different CUDA Toolkit or MPI version to build a container that will work with your system. Note that on HPC systems Docker is typically not available, but Docker containers can be converted to Apptainer/Singularity containers. 43 | 44 | ``` 45 | apptainer pull docker://benpachev/cudolfinx:v0.9.0-cuda12.6 46 | apptainer run --nv cudolfinx_v0.9.0-cuda12.6.sif 47 | ``` 48 | 49 | ## Spack 50 | 51 | Spack is a package management tool for HPC software that allows great flexibility in how code is compiled and dependencies are selected. It has somewhat of a learning curve, and typically doesn't work out of the box without some manual configuration. However, it can be a good choice for HPC systems without Apptainer installed, or if more control over the compilation process and dependencies is desired. To install with Spack: 52 | 53 | ``` 54 | git clone https://github.com/spack/spack.git 55 | . spack/share/spack/setup-env.sh 56 | spack env create cudolfinx-env 57 | spacktivate cudolfinx-env 58 | git clone https://github.com/bpachev/cuda-dolfinx.git 59 | spack repo add cuda-dolfinx/spack 60 | spack add cuda-dolfinx py-cuda-dolfinx 61 | spack install 62 | ``` 63 | 64 | If this leads to errors, it is likely because either (a) Spack is unable to find a suitable compiler or to properly configure your existing compiler, or (b) Spack is trying to build a poorly supported low-level package from source. To resolve (a), you can usually run `spack compiler add`. Especially on HPC systems, [additional configuration](https://spack-tutorial.readthedocs.io/en/latest/tutorial_configuration.html#compiler-configuration) is usually needed. To resolve (b), you will often need to [force Spack to use existing](https://spack-tutorial.readthedocs.io/en/latest/tutorial_configuration.html#external-packages) low-level software on your system instead of letting it build everything from source. 65 | 66 | ## From Source 67 | 68 | The difficult part of the install is the dependencies. The Dockerfiles under `docker/` provide a template for how to install the dependencies on Debian-based systems (and using Docker is by far the easiest way to get a development environment). Once that is taken care of, the installation of `cuda-dolfinx` itself is simple. 69 | 70 | ### C++ Core 71 | ``` 72 | cd cpp 73 | mkdir build && cd build 74 | cmake .. -DCUDOLFINX_SKIP_BUILD_TESTS=YES 75 | make install 76 | ``` 77 | 78 | ### Python Bindings 79 | ``` 80 | cd python 81 | pip install --check-build-dependencies --no-build-isolation . 82 | ``` 83 |
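To check that the bindings work, you can run the bundled Poisson example from `python/examples/` (this assumes a CUDA-capable GPU is visible to the process):

```
cd examples
python poisson.py
```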
84 | For help with installing or using the library, feel free to contact me at benjaminpachev@gmail.com. 85 | -------------------------------------------------------------------------------- /cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | # Top level CMakeLists.txt file for CUDOLFINx 3 | cmake_minimum_required(VERSION 3.19) 4 | 5 | # ------------------------------------------------------------------------------ 6 | # Set project name and version number 7 | project(CUDOLFINX VERSION "0.9.0") 8 | 9 | set(DOXYGEN_CUDOLFINX_VERSION 10 | ${CUDOLFINX_VERSION} 11 | CACHE STRING "Version for Doxygen" FORCE 12 | ) 13 | 14 | # ------------------------------------------------------------------------------ 15 | # Use C++20 16 | set(CMAKE_CXX_STANDARD 20) 17 | 18 | # Require C++20 19 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 20 | 21 | # Do not enable compiler-specific extensions 22 | set(CMAKE_CXX_EXTENSIONS OFF) 23 | 24 | # ------------------------------------------------------------------------------ 25 | # Get GIT changeset, if available 26 | find_program(GIT_FOUND git) 27 | 28 | if(GIT_FOUND) 29 | # Get the commit hash of the working branch 30 | execute_process( 31 | COMMAND git rev-parse HEAD 32 | WORKING_DIRECTORY ${CMAKE_SOURCE_DIR} 33 | OUTPUT_VARIABLE GIT_COMMIT_HASH 34 | OUTPUT_STRIP_TRAILING_WHITESPACE 35 | ) 36 | else() 37 | set(GIT_COMMIT_HASH "unknown") 38 | endif() 39 | 40 | # ------------------------------------------------------------------------------ 41 | # General configuration 42 | 43 | # Set location of our FindFoo.cmake modules 44 | set(CMAKE_MODULE_PATH "${CUDOLFINX_SOURCE_DIR}/cmake/modules") 45 | 46 | # Make sure CMake uses the correct DOLFINConfig.cmake for tests and demos 47 | set(CMAKE_PREFIX_PATH ${CMAKE_PREFIX_PATH} ${CMAKE_CURRENT_BINARY_DIR}/cudolfinx) 48 | 49 | # ------------------------------------------------------------------------------ 50 | # Configurable options for how we want to build 51 | include(FeatureSummary) 52 | 53 | option(BUILD_SHARED_LIBS "Build CUDOLFINx with shared libraries." ON) 54 | add_feature_info( 55 | BUILD_SHARED_LIBS BUILD_SHARED_LIBS "Build CUDOLFINx with shared libraries." 56 | ) 57 | 58 | option(CUDOLFINX_SKIP_BUILD_TESTS 59 | "Skip build tests for testing usability of dependency packages." OFF 60 | ) 61 | add_feature_info( 62 | CUDOLFINX_SKIP_BUILD_TESTS CUDOLFINX_SKIP_BUILD_TESTS 63 | "Skip build tests for testing usability of dependency packages." 64 | ) 65 | 66 | # Add shared library paths so shared libs in non-system paths are found 67 | option(CMAKE_INSTALL_RPATH_USE_LINK_PATH 68 | "Add paths to linker search and installed rpath." ON 69 | ) 70 | add_feature_info( 71 | CMAKE_INSTALL_RPATH_USE_LINK_PATH CMAKE_INSTALL_RPATH_USE_LINK_PATH 72 | "Add paths to linker search and installed rpath." 73 | ) 74 | 75 | # Control UFCx discovery 76 | option( 77 | CUDOLFINX_UFCX_PYTHON 78 | "Enable UFCx discovery using Python. Disable if UFCx should be found using CMake." 79 | ON 80 | ) 81 | add_feature_info( 82 | CUDOLFINX_UFCX_PYTHON 83 | CUDOLFINX_UFCX_PYTHON 84 | "Enable UFCx discovery using Python. Disable if UFCx should be found using a CMake config file." 85 | ) 86 | 87 | # ------------------------------------------------------------------------------ 88 | # Enable or disable optional packages 89 | 90 | 91 | if(CUDOLFINX_ENABLE_PETSC) 92 | set(_REQUIRE_PETSC 93 | TRUE 94 | CACHE BOOL "Is PETSc REQUIRED?" 95 | ) 96 | else() 97 | set(_REQUIRE_PETSC 98 | FALSE 99 | CACHE BOOL "Is PETSc REQUIRED?" 100 | ) 101 | endif() 102 | 103 | option(CUDOLFINX_ENABLE_PETSC "Compile with support for PETSc." ON) 104 | set_package_properties( 105 | PETSc PROPERTIES 106 | TYPE RECOMMENDED 107 | DESCRIPTION "Portable, Extensible Toolkit for Scientific Computation" 108 | URL "https://petsc.org/" 109 | PURPOSE "Linear and nonlinear solvers" 110 | ) 111 | 112 | 113 | # ------------------------------------------------------------------------------ 114 | # Check for MPI 115 | find_package(MPI 3 REQUIRED) 116 | 117 | # ------------------------------------------------------------------------------ 118 | # Compiler flags 119 | 120 | # Default build type (can be overridden by user) 121 | if(NOT CMAKE_BUILD_TYPE) 122 | set(CMAKE_BUILD_TYPE 123 | "RelWithDebInfo" 124 | CACHE 125 | STRING 126 | "Choose the type of build, options are: Debug Developer MinSizeRel Release RelWithDebInfo." 127 | FORCE 128 | ) 129 | endif() 130 | 131 | # Check for some compiler flags 132 | include(CheckCXXCompilerFlag) 133 | check_cxx_compiler_flag(-pipe HAVE_PIPE) 134 | 135 | if(HAVE_PIPE) 136 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -pipe) 137 | endif() 138 | 139 | # Add some strict compiler checks 140 | check_cxx_compiler_flag("-Wall -Werror -Wextra -pedantic" HAVE_PEDANTIC) 141 | 142 | if(HAVE_PEDANTIC) 143 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -Wall;-Werror;-Wextra;-pedantic) 144 | endif() 145 | 146 | # Debug flags 147 | check_cxx_compiler_flag(-g HAVE_DEBUG) 148 | 149 | if(HAVE_DEBUG) 150 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -g) 151 | endif() 152 | 153 | # Optimisation 154 | check_cxx_compiler_flag(-O2 HAVE_O2_OPTIMISATION) 155 | 156 | if(HAVE_O2_OPTIMISATION) 157 | list(APPEND CUDOLFINX_CXX_DEVELOPER_FLAGS -O2) 158 | endif() 159 | 160 | # ------------------------------------------------------------------------------ 161 | # Find required packages 162 | 163 | # pugixml 164 | find_package(pugixml REQUIRED) 165 | 166 | # Note: When updating Boost version, also update CUDOLFINXConfig.cmake.in 167 | if(DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) 168 | set(Boost_NO_SYSTEM_PATHS on) 169 | endif() 170 | 171 | set(Boost_USE_MULTITHREADED $ENV{BOOST_USE_MULTITHREADED}) 172 | set(Boost_VERBOSE TRUE) 173 | find_package(Boost 1.70 REQUIRED timer) 174 | set_package_properties( 175 | Boost PROPERTIES 176 | TYPE REQUIRED 177 | DESCRIPTION "Boost C++ libraries" 178 | URL "http://www.boost.org" 179 | ) 180 | 181 | # Use Python for detecting UFCx and Basix 182 | find_package( 183 | Python3 184 | COMPONENTS Interpreter 185 | QUIET 186 | ) 187 | 188 | # Check for Basix Note: Basix may be installed as a standalone C++ library, or 189 | # in the Basix Python module tree 190 | if(Python3_Interpreter_FOUND) 191 | message(STATUS "Checking for basix hints with ${Python3_EXECUTABLE}") 192 | execute_process( 193 | COMMAND 194 |
${Python3_EXECUTABLE} -c 195 | "import basix, os, sys; sys.stdout.write(os.path.dirname(basix.__file__))" 196 | OUTPUT_VARIABLE BASIX_PY_DIR 197 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 198 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 199 | ) 200 | 201 | if(BASIX_PY_DIR) 202 | message(STATUS "Adding ${BASIX_PY_DIR} to Basix search hints") 203 | 204 | # Basix installed from manylinux wheel 205 | if(IS_DIRECTORY ${BASIX_PY_DIR}/../fenics_basix.libs) 206 | set(CMAKE_INSTALL_RPATH ${BASIX_PY_DIR}/../fenics_basix.libs) 207 | endif() 208 | endif() 209 | endif() 210 | 211 | find_package(Basix 0.8 REQUIRED CONFIG HINTS ${BASIX_PY_DIR}) 212 | set_package_properties( 213 | basix PROPERTIES 214 | TYPE REQUIRED 215 | DESCRIPTION "FEniCS tabulation library" 216 | URL "https://github.com/fenics/basix" 217 | ) 218 | 219 | find_package(DOLFINX 0.8 REQUIRED CONFIG) 220 | set_package_properties( 221 | DOLFINX PROPERTIES 222 | TYPE REQUIRED 223 | DESCRIPTION "Dynamic Object-oriented Library for FINite element computation" 224 | URL "https://github.com/fenics/dolfinx" 225 | ) 226 | 227 | # Check for HDF5 228 | set(HDF5_PREFER_PARALLEL TRUE) 229 | set(HDF5_FIND_DEBUG TRUE) 230 | find_package(HDF5 REQUIRED COMPONENTS C) 231 | 232 | if(NOT HDF5_IS_PARALLEL) 233 | message( 234 | FATAL_ERROR 235 | "Found serial HDF5 build, MPI HDF5 build required, try setting HDF5_DIR or HDF5_ROOT" 236 | ) 237 | endif() 238 | 239 | set_package_properties( 240 | HDF5 PROPERTIES 241 | TYPE REQUIRED 242 | DESCRIPTION "Hierarchical Data Format 5 (HDF5)" 243 | URL "https://www.hdfgroup.org/HDF5" 244 | ) 245 | 246 | # Check for UFC Note: we use the case (ufcx vs UFCx) elsewhere to determine by 247 | # which method UFCx was found 248 | if(NOT CUDOLFINX_UFCX_PYTHON) 249 | # Check in CONFIG mode, i.e.
look for installed ufcxConfig.cmake 250 | find_package(ufcx 0.8 REQUIRED CONFIG) 251 | else() 252 | # Check in MODULE mode (using FindUFCX.cmake) 253 | find_package( 254 | Python3 255 | COMPONENTS Interpreter 256 | REQUIRED 257 | ) 258 | find_package(UFCx 0.8 REQUIRED MODULE) 259 | endif() 260 | 261 | set_package_properties( 262 | UFCx PROPERTIES 263 | TYPE REQUIRED 264 | DESCRIPTION "Interface for form-compilers (part of FFCx)" 265 | URL "https://github.com/fenics/ffcx" 266 | ) 267 | 268 | find_package(CUDAToolkit REQUIRED) 269 | 270 | set_package_properties(CUDAToolkit PROPERTIES TYPE REQUIRED 271 | DESCRIPTION "Parallel computing platform for GPUs" 272 | URL "https://developer.nvidia.com/cuda-toolkit" 273 | PURPOSE "Enables GPU-accelerated computing" 274 | ) 275 | 276 | 277 | # ------------------------------------------------------------------------------ 278 | # Find optional packages 279 | 280 | if(CUDOLFINX_ENABLE_PETSC) 281 | find_package(PkgConfig REQUIRED) 282 | set(ENV{PKG_CONFIG_PATH} 283 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 284 | ) 285 | if(_REQUIRE_PETSC) 286 | pkg_search_module(PETSC REQUIRED IMPORTED_TARGET PETSc>=3.15 petsc>=3.15) 287 | else() 288 | pkg_search_module(PETSC IMPORTED_TARGET PETSc>=3.15 petsc>=3.15) 289 | endif() 290 | 291 | # Setting for FeatureSummary 292 | if(PETSC_FOUND) 293 | message( 294 | STATUS "Found PETSc version ${PETSC_VERSION}, prefix: ${PETSC_PREFIX}" 295 | ) 296 | set_property(GLOBAL APPEND PROPERTY PACKAGES_FOUND PETSc) 297 | else() 298 | set_property(GLOBAL APPEND PROPERTY PACKAGES_NOT_FOUND PETSc) 299 | endif() 300 | endif() 301 | 302 | # ------------------------------------------------------------------------------ 303 | # Print summary of found and not found optional packages 304 | feature_summary(WHAT ALL) 305 | 306 | 307 | 308 | # ------------------------------------------------------------------------------ 309 | # Installation of CUDOLFINx library 310 | add_subdirectory(cudolfinx) 311 | 312 | # ------------------------------------------------------------------------------ 313 | # Generate and install helper file cudolfinx.conf 314 | 315 | # FIXME: Can CMake provide the library path name variable?
316 | if(APPLE) 317 | set(OS_LIBRARY_PATH_NAME "DYLD_LIBRARY_PATH") 318 | else() 319 | set(OS_LIBRARY_PATH_NAME "LD_LIBRARY_PATH") 320 | endif() 321 | 322 | # FIXME: not cross-platform compatible Create and install cudolfinx.conf file 323 | configure_file( 324 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/cudolfinx.conf.in 325 | ${CMAKE_BINARY_DIR}/cudolfinx.conf @ONLY 326 | ) 327 | install( 328 | FILES ${CMAKE_BINARY_DIR}/cudolfinx.conf 329 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cudolfinx 330 | COMPONENT Development 331 | ) 332 | 333 | # ------------------------------------------------------------------------------ 334 | # Add "make uninstall" target 335 | configure_file( 336 | "${CUDOLFINX_SOURCE_DIR}/cmake/templates/cmake_uninstall.cmake.in" 337 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY 338 | ) 339 | 340 | add_custom_target( 341 | uninstall "${CMAKE_COMMAND}" -P 342 | "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" 343 | ) 344 | 345 | # ------------------------------------------------------------------------------ 346 | # Print post-install message 347 | add_subdirectory(cmake/post-install) 348 | 349 | # ------------------------------------------------------------------------------ 350 | -------------------------------------------------------------------------------- /cpp/cmake/modules/FindUFCx.cmake: -------------------------------------------------------------------------------- 1 | #============================================================================= 2 | # - Try to find UFCx by interrogating the Python module FFCx 3 | # Once done this will define 4 | # 5 | # UFCX_FOUND - system has UFCx 6 | # UFCX_INCLUDE_DIRS - include directories for UFCx 7 | # UFCX_SIGNATURE - signature for UFCx 8 | # UFCX_VERSION - version for UFCx 9 | # 10 | #============================================================================= 11 | # Copyright (C) 2010-2021 Johannes Ring and Garth N. Wells 12 | # All rights reserved. 13 | # 14 | # Redistribution and use in source and binary forms, with or without 15 | # modification, are permitted provided that the following conditions 16 | # are met: 17 | # 18 | # 1. Redistributions of source code must retain the above copyright 19 | # notice, this list of conditions and the following disclaimer. 20 | # 2. Redistributions in binary form must reproduce the above copyright 21 | # notice, this list of conditions and the following disclaimer in 22 | # the documentation and/or other materials provided with the 23 | # distribution. 24 | # 25 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 28 | # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 29 | # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 30 | # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 31 | # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 32 | # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 33 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 35 | # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 36 | # POSSIBILITY OF SUCH DAMAGE. 37 | #============================================================================= 38 | 39 | message( 40 | STATUS 41 | "Asking Python module FFCx for location of UFC... 
(Python executable: ${Python3_EXECUTABLE})" 42 | ) 43 | 44 | # Get include path 45 | execute_process( 46 | COMMAND 47 | ${Python3_EXECUTABLE} -c 48 | "import ffcx.codegeneration, sys; sys.stdout.write(ffcx.codegeneration.get_include_path())" 49 | OUTPUT_VARIABLE UFCX_INCLUDE_DIR 50 | ) 51 | 52 | # Get ufcx.h version 53 | if(UFCX_INCLUDE_DIR) 54 | set(UFCX_INCLUDE_DIRS 55 | ${UFCX_INCLUDE_DIR} 56 | CACHE STRING "Where to find ufcx.h" 57 | ) 58 | execute_process( 59 | COMMAND ${Python3_EXECUTABLE} -c 60 | "import ffcx, sys; sys.stdout.write(ffcx.__version__)" 61 | OUTPUT_VARIABLE UFCX_VERSION 62 | ) 63 | endif() 64 | 65 | # Compute hash of ufcx.h 66 | find_file(_UFCX_HEADER "ufcx.h" ${UFCX_INCLUDE_DIR}) 67 | if(_UFCX_HEADER) 68 | file(SHA1 ${_UFCX_HEADER} UFCX_SIGNATURE) 69 | endif() 70 | 71 | mark_as_advanced(UFCX_VERSION UFCX_INCLUDE_DIRS UFCX_SIGNATURE) 72 | find_package_handle_standard_args( 73 | UFCx 74 | REQUIRED_VARS UFCX_INCLUDE_DIRS UFCX_SIGNATURE UFCX_VERSION 75 | VERSION_VAR UFCX_VERSION HANDLE_VERSION_RANGE REASON_FAILURE_MESSAGE 76 | "UFCx could not be found." 77 | ) 78 | -------------------------------------------------------------------------------- /cpp/cmake/post-install/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | install( 2 | CODE "MESSAGE( 3 | \"---------------------------------------------------------------------------- 4 | CUDOLFINx has now been installed in 5 | 6 | ${CMAKE_INSTALL_PREFIX} 7 | 8 | 9 | Don't forget to update your environment variables. This can be done 10 | easily using the helper file 'cudolfinx.conf' which sets the appropriate 11 | variables (for users of the Bash shell). 12 | 13 | To update your environment variables, run the following command: 14 | 15 | source ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}/cudolfinx/cudolfinx.conf 16 | 17 | ----------------------------------------------------------------------------\")" 18 | ) 19 | -------------------------------------------------------------------------------- /cpp/cmake/templates/CUDOLFINXConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # * Build details for CUDOLFINx: CUDA extension for DOLFINX 2 | # 3 | # This file has been automatically generated. 
4 | 5 | # FIXME: Check that naming conforms to CMake standards 6 | 7 | @PACKAGE_INIT@ 8 | include(CMakeFindDependencyMacro) 9 | 10 | find_dependency(MPI REQUIRED) 11 | find_dependency(pugixml) 12 | 13 | # Check for Boost 14 | if(DEFINED ENV{BOOST_ROOT} OR DEFINED BOOST_ROOT) 15 | set(Boost_NO_SYSTEM_PATHS on) 16 | endif() 17 | set(Boost_USE_MULTITHREADED $ENV{BOOST_USE_MULTITHREADED}) 18 | set(Boost_VERBOSE TRUE) 19 | find_dependency(Boost 1.70 REQUIRED COMPONENTS timer filesystem) 20 | 21 | if(@ufcx_FOUND@) 22 | find_dependency(ufcx) 23 | endif() 24 | 25 | # Basix 26 | find_package(Python3 COMPONENTS Interpreter) 27 | if(Python3_Interpreter_FOUND) 28 | execute_process( 29 | COMMAND 30 | ${Python3_EXECUTABLE} -c 31 | "import basix, os, sys; sys.stdout.write(os.path.dirname(basix.__file__))" 32 | OUTPUT_VARIABLE BASIX_PY_DIR 33 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 34 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 35 | ) 36 | endif() 37 | if(BASIX_PY_DIR) 38 | message(STATUS "Adding ${BASIX_PY_DIR} to Basix search hints") 39 | endif() 40 | find_dependency(Basix CONFIG HINTS ${BASIX_PY_DIR}) 41 | 42 | # HDF5 43 | if(NOT TARGET hdf5::hdf5) 44 | set(HDF5_PREFER_PARALLEL TRUE) 45 | set(HDF5_FIND_DEBUG TRUE) 46 | find_dependency(HDF5 COMPONENTS C) 47 | if(HDF5_FOUND AND NOT HDF5_IS_PARALLEL) 48 | message(FATAL_ERROR "Found serial HDF5 build, MPI HDF5 build required") 49 | endif() 50 | endif() 51 | 52 | if(@PETSC_FOUND@) 53 | if(NOT TARGET PkgConfig::PETSC) 54 | find_package(PkgConfig REQUIRED) 55 | set(ENV{PKG_CONFIG_PATH} 56 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 57 | ) 58 | pkg_search_module(PETSC REQUIRED IMPORTED_TARGET PETSc petsc) 59 | endif() 60 | endif() 61 | 62 | if(@SLEPC_FOUND@) 63 | if(NOT TARGET PkgConfig::SLEPC) 64 | find_package(PkgConfig REQUIRED) 65 | set(ENV{PKG_CONFIG_PATH} 66 | "$ENV{SLEPC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{SLEPC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 67 | ) 68 | set(ENV{PKG_CONFIG_PATH} 69 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}/lib/pkgconfig:$ENV{PETSC_DIR}/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}" 70 | ) 71 | set(ENV{PKG_CONFIG_PATH} 72 | "$ENV{PETSC_DIR}/$ENV{PETSC_ARCH}:$ENV{PETSC_DIR}:$ENV{PKG_CONFIG_PATH}" 73 | ) 74 | pkg_search_module(SLEPC REQUIRED IMPORTED_TARGET SLEPc slepc) 75 | endif() 76 | endif() 77 | 78 | if(NOT TARGET cudolfinx) 79 | include("${CMAKE_CURRENT_LIST_DIR}/CUDOLFINXTargets.cmake") 80 | endif() 81 | 82 | check_required_components(CUDOLFINX) 83 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cmake_uninstall.cmake.in: -------------------------------------------------------------------------------- 1 | if(NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 2 | message( 3 | FATAL_ERROR 4 | "Cannot find install manifest: \"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt\"" 5 | ) 6 | endif() 7 | 8 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) 9 | string(REGEX REPLACE "\n" ";" files "${files}") 10 | foreach(file ${files}) 11 | message(STATUS "Uninstalling \"$ENV{DESTDIR}${file}\"") 12 | if(EXISTS "$ENV{DESTDIR}${file}") 13 | exec_program( 14 | "@CMAKE_COMMAND@" ARGS 15 | "-E remove \"$ENV{DESTDIR}${file}\"" 16 | OUTPUT_VARIABLE rm_out 17 | RETURN_VALUE rm_retval 18 | ) 19 | if(NOT "${rm_retval}" STREQUAL 0) 20 | message(FATAL_ERROR "Problem when removing \"$ENV{DESTDIR}${file}\"") 21 | endif() 22 | else() 23 | message(STATUS "File \"$ENV{DESTDIR}${file}\" does not exist.") 24 | 
endif() 25 | endforeach() 26 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cudolfinx.conf.in: -------------------------------------------------------------------------------- 1 | # Helper file for setting non-default CUDOLFINx environment variables 2 | 3 | # Common Unix variables 4 | export @OS_LIBRARY_PATH_NAME@=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@:$@OS_LIBRARY_PATH_NAME@ 5 | export PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_BINDIR@:$PATH 6 | export PKG_CONFIG_PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/pkgconfig:$PKG_CONFIG_PATH 7 | export CMAKE_PREFIX_PATH=@CMAKE_INSTALL_PREFIX@/@CMAKE_INSTALL_LIBDIR@/cmake:$CMAKE_PREFIX_PATH 8 | 9 | # Special macOS variables 10 | export DYLD_FRAMEWORK_PATH=/opt/local/Library/Frameworks:$DYLD_FRAMEWORK_PATH 11 | -------------------------------------------------------------------------------- /cpp/cmake/templates/cudolfinx.pc.in: -------------------------------------------------------------------------------- 1 | # pkg-config configuration for CUDOLFINx 2 | prefix=@CMAKE_INSTALL_PREFIX@ 3 | exec_prefix=@CMAKE_INSTALL_PREFIX@ 4 | libdir=${exec_prefix}/@CMAKE_INSTALL_LIBDIR@ 5 | includedir=${prefix}/@CMAKE_INSTALL_INCLUDEDIR@ 6 | compiler=@CMAKE_CXX_COMPILER@ 7 | definitions=@PKG_DEFINITIONS@ 8 | extlibs=@CUDOLFINX_EXT_LIBS@ 9 | 10 | Name: CUDOLFINx 11 | Description: CUDA extension for DOLFINX 12 | Version: @CUDOLFINX_VERSION@ 13 | Requires: @PKG_REQUIRES@ 14 | Conflicts: 15 | Libs: @PKG_LINKFLAGS@ -L${libdir} -lcudolfinx 16 | Cflags: @PKG_CXXFLAGS@ -DCUDOLFINX_VERSION=\"@CUDOLFINX_VERSION@\" ${definitions} -I${includedir} @PKG_INCLUDES@ 17 | -------------------------------------------------------------------------------- /cpp/cudolfinx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------ 2 | include(GNUInstallDirs) 3 | 4 | # ------------------------------------------------------------------------------ 5 | # Declare the library (target) 6 | add_library(cudolfinx) 7 | 8 | # ------------------------------------------------------------------------------ 9 | # Add source files to the target 10 | set(CUDOLFINX_DIRS 11 | common 12 | fem 13 | la 14 | mesh 15 | ) 16 | 17 | # Add source to the cudolfinx target, and get sets of header files 18 | foreach(DIR ${CUDOLFINX_DIRS}) 19 | add_subdirectory(${DIR}) 20 | endforeach() 21 | 22 | # Set target include location (for build and installed) 23 | target_include_directories( 24 | cudolfinx 25 | PUBLIC 26 | $<INSTALL_INTERFACE:include> 27 | "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR};${CMAKE_CURRENT_SOURCE_DIR}/..>" 28 | ) 29 | 30 | # ------------------------------------------------------------------------------ 31 | # Configure the common/version.h file 32 | configure_file( 33 | ${CMAKE_CURRENT_SOURCE_DIR}/common/version.h.in common/version.h @ONLY 34 | ) 35 | 36 | # ------------------------------------------------------------------------------ 37 | # Set target properties 38 | set_target_properties( 39 | cudolfinx 40 | PROPERTIES VERSION ${CUDOLFINX_VERSION} 41 | SOVERSION ${CUDOLFINX_VERSION_MAJOR}.${CUDOLFINX_VERSION_MINOR} 42 | ) 43 | 44 | # Add git revision flag to the one affected file 45 | #set_source_files_properties( 46 | # common/defines.cpp 47 | # PROPERTIES 48 | # COMPILE_DEFINITIONS 49 | # "UFCX_SIGNATURE=\"${UFCX_SIGNATURE}\";CUDOLFINX_GIT_COMMIT_HASH=\"${GIT_COMMIT_HASH}\"" 50 | #) 51 | 52 | # ------------------------------------------------------------------------------ 53 | # Set compiler options and
definitions 54 | 55 | # Set 'Developer' build type flags 56 | target_compile_options( 57 | cudolfinx PRIVATE $<$<CONFIG:Developer>:${CUDOLFINX_CXX_DEVELOPER_FLAGS}> 58 | ) 59 | 60 | # Add version to definitions (public) 61 | target_compile_definitions(cudolfinx PUBLIC CUDOLFINX_VERSION="${CUDOLFINX_VERSION}") 62 | 63 | # ------------------------------------------------------------------------------ 64 | # Add include directories and libraries of required packages 65 | 66 | # UFCx 67 | if(TARGET ufcx::ufcx) 68 | target_link_libraries(cudolfinx PUBLIC ufcx::ufcx) 69 | else() 70 | target_include_directories(cudolfinx SYSTEM PUBLIC ${UFCX_INCLUDE_DIRS}) 71 | endif() 72 | 73 | # Basix 74 | target_link_libraries(cudolfinx PUBLIC Basix::basix) 75 | 76 | # Boost 77 | target_link_libraries(cudolfinx PUBLIC Boost::headers) 78 | target_link_libraries(cudolfinx PUBLIC Boost::timer) 79 | 80 | # MPI 81 | target_link_libraries(cudolfinx PUBLIC MPI::MPI_CXX) 82 | 83 | # HDF5 84 | target_link_libraries(cudolfinx PUBLIC hdf5::hdf5) 85 | 86 | # CUDA Toolkit 87 | target_link_libraries(cudolfinx PRIVATE CUDA::cuda_driver CUDA::nvrtc CUDA::cupti) 88 | target_include_directories(cudolfinx SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 89 | 90 | # Dolfinx 91 | target_link_libraries(cudolfinx PUBLIC dolfinx) 92 | 93 | # ------------------------------------------------------------------------------ 94 | # Optional packages 95 | 96 | # PETSc 97 | if(CUDOLFINX_ENABLE_PETSC AND PETSC_FOUND) 98 | target_link_libraries(cudolfinx PUBLIC PkgConfig::PETSC) 99 | target_compile_definitions(cudolfinx PUBLIC HAS_PETSC) 100 | endif() 101 | 102 | 103 | # ------------------------------------------------------------------------------ 104 | # Install cudolfinx library and header files 105 | install( 106 | TARGETS cudolfinx 107 | EXPORT CUDOLFINXTargets 108 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT RuntimeExecutables 109 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT RuntimeLibraries 110 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT Development 111 | ) 112 | 113 | # Generate CUDOLFINXTargets.cmake 114 | install(EXPORT CUDOLFINXTargets DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx) 115 | 116 | # Install the header files 117 | install( 118 | FILES cudolfinx.h 119 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} 120 | COMPONENT Development 121 | ) 122 | 123 | foreach(DIR ${CUDOLFINX_DIRS}) 124 | install( 125 | FILES ${HEADERS_${DIR}} 126 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudolfinx/${DIR} 127 | COMPONENT Development 128 | ) 129 | endforeach() 130 | 131 | install( 132 | FILES ${CMAKE_CURRENT_BINARY_DIR}/common/version.h 133 | DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/cudolfinx/common 134 | COMPONENT Development 135 | ) 136 | 137 | # ------------------------------------------------------------------------------ 138 | # Generate CMake config files (CUDOLFINXConfig{,Version}.cmake) 139 | include(CMakePackageConfigHelpers) 140 | write_basic_package_version_file( 141 | ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfigVersion.cmake 142 | VERSION ${CUDOLFINX_VERSION} 143 | COMPATIBILITY AnyNewerVersion 144 | ) 145 | 146 | configure_package_config_file( 147 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/CUDOLFINXConfig.cmake.in 148 | ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfig.cmake 149 | INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx 150 | ) 151 | 152 | # Install CMake helper files 153 | install( 154 | FILES ${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfig.cmake 155 |
${CMAKE_BINARY_DIR}/cudolfinx/CUDOLFINXConfigVersion.cmake 156 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cudolfinx 157 | COMPONENT Development 158 | ) 159 | 160 | # ------------------------------------------------------------------------------ 161 | # Generate pkg-config file and install it 162 | 163 | # Define packages that should be required by pkg-config file 164 | set(PKG_REQUIRES "") 165 | 166 | # Get link libraries and includes 167 | get_target_property( 168 | PKGCONFIG_CUDOLFINX_TARGET_LINK_LIBRARIES cudolfinx INTERFACE_LINK_LIBRARIES 169 | ) 170 | get_target_property( 171 | PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES cudolfinx 172 | INTERFACE_SYSTEM_INCLUDE_DIRECTORIES 173 | ) 174 | 175 | # Add imported targets to lists for creating pkg-config file 176 | set(PKGCONFIG_CUDOLFINX_LIBS) 177 | 178 | foreach(_target ${PKGCONFIG_CUDOLFINX_TARGET_LINK_LIBRARIES}) 179 | if("${_target}" MATCHES "^[^<>]+$") # Skip "$<LINK_ONLY:...>", which we get with 180 | # static libs 181 | if("${_target}" MATCHES "^.*::.*$") 182 | # Get include paths 183 | get_target_property(_inc_dirs ${_target} INTERFACE_INCLUDE_DIRECTORIES) 184 | 185 | if(_inc_dirs) 186 | list(APPEND PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES ${_inc_dirs}) 187 | endif() 188 | 189 | # Get libraries 190 | get_target_property(_libs ${_target} INTERFACE_LINK_LIBRARIES) 191 | 192 | if(_libs) 193 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_libs}) 194 | endif() 195 | 196 | else() 197 | # 'regular' libs, i.e. not imported targets 198 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_target}) 199 | endif() 200 | 201 | # Special handling for compiled Boost imported targets 202 | if(("${_target}" MATCHES "^.*Boost::.*$") AND NOT "${_target}" STREQUAL 203 | "Boost::headers" 204 | ) 205 | get_target_property(_libs ${_target} IMPORTED_LOCATION_RELEASE) 206 | 207 | if(_libs) 208 | list(APPEND PKGCONFIG_CUDOLFINX_LIBS ${_libs}) 209 | endif() 210 | endif() 211 | endif() 212 | endforeach() 213 | 214 | # Join include lists and remove duplicates 215 | list(REMOVE_DUPLICATES PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES) 216 | list(REMOVE_DUPLICATES PKGCONFIG_CUDOLFINX_LIBS) 217 | 218 | # Convert include dirs to -I form 219 | foreach(_inc_dir ${PKGCONFIG_CUDOLFINX_INCLUDE_DIRECTORIES}) 220 | set(PKG_INCLUDES "-I${_inc_dir} ${PKG_INCLUDES}") 221 | endforeach() 222 | 223 | # Get cudolfinx definitions 224 | get_target_property( 225 | PKG_CUDOLFINX_DEFINITIONS cudolfinx INTERFACE_COMPILE_DEFINITIONS 226 | ) 227 | set(PKG_DEFINITIONS) 228 | 229 | foreach(_def ${PKG_CUDOLFINX_DEFINITIONS}) 230 | set(PKG_DEFINITIONS "${PKG_DEFINITIONS} -D${_def}") 231 | endforeach() 232 | 233 | # Get basix definitions (this is required to propagate Basix definition to the 234 | # pkg-config file, in the future Basix should create its own basix.pc file, see 235 | # https://github.com/FEniCS/basix/issues/204) 236 | get_target_property( 237 | PKG_BASIX_DEFINITIONS Basix::basix INTERFACE_COMPILE_DEFINITIONS 238 | ) 239 | 240 | foreach(_def ${PKG_BASIX_DEFINITIONS}) 241 | set(PKG_DEFINITIONS "${PKG_DEFINITIONS} -D${_def}") 242 | endforeach() 243 | 244 | # Convert compiler flags and definitions into space separated strings 245 | string(REPLACE ";" " " PKG_CXXFLAGS "${CMAKE_CXX_FLAGS}") 246 | string(REPLACE ";" " " PKG_LINKFLAGS "${CMAKE_EXE_LINKER_FLAGS}") 247 | 248 | # Convert libraries to -L -l form 249 | foreach(_lib ${PKGCONFIG_CUDOLFINX_LIBS}) 250 | # Add -Wl,option directives 251 | if("${_lib}" MATCHES "-Wl,[^ ]*") 252 | set(PKG_LINKFLAGS "${_lib} ${PKG_LINKFLAGS}") 253 | else() 254 | get_filename_component(_path
${_lib} DIRECTORY) 255 | get_filename_component(_name ${_lib} NAME_WE) 256 | string(REPLACE "lib" "" _name "${_name}") 257 | 258 | # Add libraries that match the form -L<libdir> -l<libname> 259 | if(NOT "${_path}" STREQUAL "") 260 | set(PKG_LINKFLAGS "-L${_path} -l${_name} ${PKG_LINKFLAGS}") 261 | endif() 262 | endif() 263 | endforeach() 264 | 265 | # Remove duplicated link flags 266 | separate_arguments(PKG_LINKFLAGS) 267 | list(REMOVE_DUPLICATES PKG_LINKFLAGS) 268 | string(REPLACE ";" " " PKG_LINKFLAGS "${PKG_LINKFLAGS}") 269 | 270 | # Add additional link flags 271 | foreach(_linkflag ${CUDOLFINX_LINK_FLAGS}) 272 | set(PKG_LINKFLAGS "${PKG_LINKFLAGS} ${_linkflag}") 273 | endforeach() 274 | 275 | # Boost include dir (used as pkg-config variable) 276 | get_target_property( 277 | BOOST_INCLUDE_DIR Boost::headers INTERFACE_INCLUDE_DIRECTORIES 278 | ) 279 | 280 | # Configure and install pkg-config file 281 | configure_file( 282 | ${CUDOLFINX_SOURCE_DIR}/cmake/templates/cudolfinx.pc.in 283 | ${CMAKE_BINARY_DIR}/cudolfinx.pc @ONLY 284 | ) 285 | install( 286 | FILES ${CMAKE_BINARY_DIR}/cudolfinx.pc 287 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig 288 | COMPONENT Development 289 | ) 290 | 291 | # ------------------------------------------------------------------------------ 292 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_common 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDA.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAStore.h 4 | PARENT_SCOPE 5 | ) 6 | 7 | target_sources( 8 | cudolfinx 9 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDA.cpp 10 | ) 11 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CUDA.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cuda.h> 10 | #include <nvrtc.h> 11 | #include <string> 12 | #include <vector> 13 | 14 | namespace dolfinx 15 | { 16 | 17 | namespace CUDA 18 | { 19 | class Module; 20 | class Kernel; 21 | 22 | /// This class is a wrapper around a CUDA device context 23 | class Context 24 | { 25 | public: 26 | /// Create a CUDA device context 27 | Context(); 28 | 29 | /// Destructor 30 | ~Context(); 31 | 32 | /// Copy constructor 33 | /// @param[in] context The object to be copied 34 | Context(const Context& context) = delete; 35 | 36 | /// Move constructor 37 | /// @param[in] context The object to be moved 38 | Context(Context&& context) = delete; 39 | 40 | /// Assignment operator 41 | /// @param[in] context The object to assign from 42 | Context& operator=(const Context& context) = delete; 43 | 44 | /// Move assignment operator 45 | /// @param[in] context The object to assign from 46 | Context& operator=(Context&& context) = delete; 47 | 48 | /// Return underlying CUDA device 49 | const CUdevice& device() const; 50 | 51 | /// Return underlying CUDA context 52 | CUcontext& context(); 53 | 54 | private: 55 | CUdevice _device; 56 | CUcontext _context; 57 | }; 58 | 59 | /// This class is a wrapper around a module, which is obtained by 60 | /// compiling PTX assembly to CUDA device code.
61 | class Module 62 | { 63 | public: 64 | /// Create an empty module 65 | Module(); 66 | 67 | /// Create a module 68 | Module( 69 | const CUDA::Context& cuda_context, 70 | const std::string& ptx, 71 | CUjit_target target, 72 | int num_module_load_options, 73 | CUjit_option* module_load_options, 74 | void** module_load_option_values, 75 | bool verbose, 76 | bool debug); 77 | 78 | /// Destructor 79 | ~Module(); 80 | 81 | /// Copy constructor 82 | /// @param[in] module The object to be copied 83 | Module(const Module& module) = delete; 84 | 85 | /// Move constructor 86 | /// @param[in] module The object to be moved 87 | Module(Module&& module); 88 | 89 | /// Assignment operator 90 | /// @param[in] module The object to assign from 91 | Module& operator=(const Module& module) = delete; 92 | 93 | /// Move assignment operator 94 | /// @param[in] module The object to assign from 95 | Module& operator=(Module&& module); 96 | 97 | /// Get a device-side function from a loaded module 98 | CUfunction get_device_function( 99 | const std::string& device_function_name) const; 100 | 101 | /// Get info log for a loaded module 102 | const char* info_log() const { 103 | return _info_log; } 104 | 105 | /// Get error log for a loaded module 106 | const char* error_log() const { 107 | return _error_log; } 108 | 109 | private: 110 | /// Handle to the CUDA module 111 | CUmodule _module; 112 | 113 | /// Size of the buffer for informational log messages 114 | size_t _info_log_size; 115 | 116 | /// Informational log messages related to loading the module 117 | char* _info_log; 118 | 119 | /// Size of the buffer for error log messages 120 | size_t _error_log_size; 121 | 122 | /// Error log messages related to loading the module 123 | char* _error_log; 124 | }; 125 | 126 | /// Use the NVIDIA CUDA Runtime Compilation (nvrtc) library to compile 127 | /// device-side code for a given CUDA program. 128 | std::string compile_cuda_cpp_to_ptx( 129 | const char* program_name, 130 | int num_program_headers, 131 | const char** program_headers, 132 | const char** program_include_names, 133 | int num_compile_options, 134 | const char** compile_options, 135 | const char* program_src, 136 | const char* cudasrcdir, 137 | bool verbose); 138 | 139 | void safeMemAlloc(CUdeviceptr* dptr, size_t bytesize); 140 | void safeMemcpyDtoH(void* dstHost, CUdeviceptr srcDevice, size_t ByteCount); 141 | void safeMemcpyHtoD(CUdeviceptr dstDevice, const void* srcHost, size_t ByteCount); 142 | void safeDeviceGetAttribute(int * res, CUdevice_attribute attr, CUdevice dev); 143 | void safeCtxSynchronize(); 144 | void safeStreamCreate(CUstream* streamptr, unsigned int flags); 145 | 146 | template void safeVectorCreate(CUdeviceptr* dptr, std::vector arr) { 147 | size_t bytesize = sizeof(T) * arr.size(); 148 | safeMemAlloc(dptr, bytesize); 149 | safeMemcpyHtoD(*dptr, (void *)arr.data(), bytesize); 150 | } 151 | 152 | CUjit_target get_cujit_target(const Context& cuda_context); 153 | 154 | } // namespace CUDA 155 | 156 | 157 | } // namespace dolfinx 158 | -------------------------------------------------------------------------------- /cpp/cudolfinx/common/CUDAStore.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. 
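For orientation, here is an illustrative sketch (not part of the library) of how these wrappers compose; the kernel name and source are hypothetical, and the calls follow the signatures declared above:

```
// Illustrative sketch only: compile a toy kernel to PTX with NVRTC and load
// it through the wrappers declared in CUDA.h.
#include <cudolfinx/common/CUDA.h>

void load_example_kernel()
{
  dolfinx::CUDA::Context ctx; // acquires a CUDA device and context

  // Hypothetical kernel source
  const char* src =
    "extern \"C\" __global__ void scale(double* x, double a, int n)\n"
    "{ int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) x[i] *= a; }\n";

  // Compile CUDA C++ to PTX (no extra headers or compile options)
  std::string ptx = dolfinx::CUDA::compile_cuda_cpp_to_ptx(
      "scale.cu", 0, nullptr, nullptr, 0, nullptr, src, nullptr, false);

  // JIT the PTX for the current device and fetch the kernel handle
  CUjit_target target = dolfinx::CUDA::get_cujit_target(ctx);
  dolfinx::CUDA::Module module(ctx, ptx, target, 0, nullptr, nullptr,
                               false, false);
  CUfunction kernel = module.get_device_function("scale");
  (void)kernel; // launch via cuLaunchKernel in real code
}
```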
-------------------------------------------------------------------------------- /cpp/cudolfinx/common/CUDAStore.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | #include <map> 9 | #include <memory> 10 | 11 | namespace dolfinx::common 12 | { 13 | /// @brief This class represents an abstract mapping between host-side 14 | /// and device-side objects. Its purpose is to prevent creation of duplicate 15 | /// copies of host-side objects on the device. 16 | 17 | template <class H, class D> 18 | class CUDAStore 19 | { 20 | public: 21 | 22 | /// @brief Empty constructor 23 | CUDAStore() 24 | { 25 | } 26 | 27 | /// @brief Return the stored device object, creating and caching it if absent 28 | /// @param[in] host_object Raw pointer to the host-side object 29 | std::shared_ptr<D> get_device_object(const H* host_object) { 30 | auto it = _map.find(host_object); 31 | if (it != _map.end()) return it->second; 32 | auto device_object = std::make_shared<D>(host_object); 33 | _map[host_object] = device_object; 34 | return device_object; 35 | } 36 | 37 | private: 38 | std::map<const H*, std::shared_ptr<D>> _map; 39 | }; 40 | } 41 | 42 |
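A hypothetical usage sketch: any host/device type pair works as long as `D` is constructible from `const H*` (as `get_device_object()` requires); the `CUDADofMap(const DofMap*)` constructor is assumed here for illustration.

```
// Illustrative sketch only: de-duplicating device mirrors of host objects.
#include <cassert>
#include <cudolfinx/common/CUDAStore.h>
#include <cudolfinx/fem/CUDADofMap.h>
#include <dolfinx/fem/DofMap.h>

void cache_example(const dolfinx::fem::DofMap* dofmap)
{
  dolfinx::common::CUDAStore<dolfinx::fem::DofMap, dolfinx::fem::CUDADofMap> store;
  auto d1 = store.get_device_object(dofmap); // created on first request
  auto d2 = store.get_device_object(dofmap); // served from the cache
  assert(d1 == d2); // same shared_ptr: no duplicate device copy
}
```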
-------------------------------------------------------------------------------- /cpp/cudolfinx/common/version.h.in: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define CUDOLFINX_VERSION_RELEASE @CUDOLFINX_VERSION_RELEASE@ 4 | #define CUDOLFINX_VERSION_MAJOR @CUDOLFINX_VERSION_MAJOR@ 5 | #define CUDOLFINX_VERSION_MINOR @CUDOLFINX_VERSION_MINOR@ 6 | #define CUDOLFINX_VERSION_MICRO @CUDOLFINX_VERSION_MICRO_STRIPPED@ 7 | #define CUDOLFINX_VERSION_STRING "@CUDOLFINX_VERSION@" 8 | #define CUDOLFINX_VERSION_GIT "@GIT_COMMIT_HASH@" 9 | #define UFCX_SIGNATURE "@UFCX_SIGNATURE@" 10 | -------------------------------------------------------------------------------- /cpp/cudolfinx/cudolfinx.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | namespace cudolfinx 10 | { 11 | } 12 | 13 | // TODO: actually develop a C++ interface, currently the target is Python 14 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_fem 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAAssembler.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDACoefficient.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADirichletBC.h 5 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADofMap.h 6 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAForm.h 7 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormConstants.h 8 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormCoefficients.h 9 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormIntegral.h 10 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.h 11 | PARENT_SCOPE 12 | ) 13 | 14 | target_sources( 15 | cudolfinx 16 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDAAssembler.cpp 17 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDADofMap.cpp 18 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAFormIntegral.cpp 19 | ) 20 | -------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDACoefficient.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <dolfinx/fem/Function.h> 11 | 12 | namespace dolfinx::fem 13 | { 14 | /// @brief A wrapper around a Function 15 | template <typename T, 16 | typename U = dolfinx::scalar_value_type_t<T>> 17 | class CUDACoefficient 18 | { 19 | public: 20 | 21 | /// @brief Construct a new CUDACoefficient 22 | CUDACoefficient(std::shared_ptr<const Function<T, U>> f) { 23 | _f = f; 24 | _x = f->x(); 25 | _dvalues_size = _x->bs() * (_x->index_map()->size_local()+_x->index_map()->num_ghosts()) * sizeof(T); 26 | CUDA::safeMemAlloc(&_dvalues, _dvalues_size); 27 | copy_host_values_to_device(); 28 | } 29 | 30 | /// Copy the host-side values to device memory allocated by the constructor 31 | void copy_host_values_to_device() 32 | { 33 | CUDA::safeMemcpyHtoD(_dvalues, (void*)(_x->array().data()), _dvalues_size); 34 | } 35 | 36 | /// Get pointer to vector data on device 37 | CUdeviceptr device_values() const 38 | { 39 | return _dvalues; 40 | } 41 | 42 | ~CUDACoefficient() 43 | { 44 | if (_dvalues) 45 | cuMemFree(_dvalues); 46 | } 47 | 48 | private: 49 | 50 | // Device-side coefficient array 51 | CUdeviceptr _dvalues; 52 | // Size of coefficient array 53 | size_t _dvalues_size; 54 | // Pointer to host-side Function 55 | std::shared_ptr<const Function<T, U>> _f; 56 | // Pointer to host-side coefficient vector 57 | std::shared_ptr<const la::Vector<T>> _x; 58 | }; 59 | 60 | } 61 |
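A minimal usage sketch under the reconstructed signatures above (illustrative, not library code):

```
// Illustrative sketch only: mirror a coefficient Function on the device and
// re-sync it after the host-side values change.
#include <cudolfinx/fem/CUDACoefficient.h>
#include <dolfinx/fem/Function.h>
#include <memory>

void sync_example(std::shared_ptr<const dolfinx::fem::Function<double>> f)
{
  // Allocates device memory and copies the current values
  dolfinx::fem::CUDACoefficient<double> coeff(f);

  // ... update f's underlying vector on the host ...

  coeff.copy_host_values_to_device();         // push the new values
  CUdeviceptr values = coeff.device_values(); // raw pointer for kernels
  (void)values;
}
```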
55 | /// @param[in] bcs The boundary conditions to copy to device memory 56 | CUDADirichletBC( 57 | const CUDA::Context& cuda_context, 58 | const dolfinx::fem::FunctionSpace<U>& V, 59 | const std::vector<std::shared_ptr<const DirichletBC<T, U>>>& bcs) 60 | : _num_dofs() 61 | , _num_owned_boundary_dofs() 62 | , _num_boundary_dofs() 63 | , _ddof_markers(0) 64 | , _ddof_indices(0) 65 | , _ddof_values(0) 66 | { 67 | CUresult cuda_err; 68 | const char * cuda_err_description; 69 | 70 | // Count the number of degrees of freedom 71 | const dolfinx::fem::DofMap& dofmap = *(V.dofmap()); 72 | const common::IndexMap& index_map = *dofmap.index_map; 73 | // Looks like index_map no longer has block_size 74 | const int block_size = dofmap.index_map_bs(); 75 | _num_dofs = block_size * ( 76 | index_map.size_local() + index_map.num_ghosts()); 77 | 78 | // Count the number of degrees of freedom affected by boundary 79 | // conditions 80 | _num_owned_boundary_dofs = 0; 81 | _num_boundary_dofs = 0; 82 | 83 | // Build dof markers, indices and values 84 | signed char* dof_markers = nullptr; 85 | std::vector<std::int32_t> dof_indices; 86 | std::vector<std::int32_t> ghost_dof_indices; 87 | for (auto const& bc : bcs) { 88 | if (V.contains(*bc->function_space())) { 89 | if (!dof_markers) { 90 | dof_markers = new signed char[_num_dofs]; 91 | for (int i = 0; i < _num_dofs; i++) { 92 | dof_markers[i] = 0; 93 | } 94 | _dof_values.assign(_num_dofs, 0.0); 95 | } 96 | 97 | bc->mark_dofs(std::span(dof_markers, _num_dofs)); 98 | auto const [dofs, range] = bc->dof_indices(); 99 | for (std::int32_t i = 0; i < dofs.size(); i++) { 100 | if (i < range) dof_indices.push_back(dofs[i]); 101 | else ghost_dof_indices.push_back(dofs[i]); 102 | } 103 | bc->set(std::span(_dof_values), {}, 1); 104 | } 105 | } 106 | _num_owned_boundary_dofs = dof_indices.size(); 107 | _num_boundary_dofs = _num_owned_boundary_dofs + ghost_dof_indices.size(); 108 | dof_indices.insert(dof_indices.end(), ghost_dof_indices.begin(), ghost_dof_indices.end()); 109 | // Allocate device-side storage for dof markers 110 | if (dof_markers && _num_dofs > 0) { 111 | size_t ddof_markers_size = _num_dofs * sizeof(char); 112 | cuda_err = cuMemAlloc(&_ddof_markers, ddof_markers_size); 113 | if (cuda_err != CUDA_SUCCESS) { 114 | delete[] dof_markers; 115 | cuGetErrorString(cuda_err, &cuda_err_description); 116 | throw std::runtime_error( 117 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 118 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 119 | } 120 | 121 | // Copy dof markers to device 122 | cuda_err = cuMemcpyHtoD( 123 | _ddof_markers, dof_markers, ddof_markers_size); 124 | if (cuda_err != CUDA_SUCCESS) { 125 | cuMemFree(_ddof_markers); 126 | delete[] dof_markers; 127 | cuGetErrorString(cuda_err, &cuda_err_description); 128 | throw std::runtime_error( 129 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 130 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 131 | } 132 | } 133 | if (dof_markers) 134 | delete[] dof_markers; 135 | 136 | // Allocate device-side storage for dof indices 137 | if (_num_boundary_dofs > 0) { 138 | size_t ddof_indices_size = dof_indices.size() * sizeof(std::int32_t); 139 | cuda_err = cuMemAlloc(&_ddof_indices, ddof_indices_size); 140 | if (cuda_err != CUDA_SUCCESS) { 141 | if (_ddof_markers) 142 | cuMemFree(_ddof_markers); 143 | cuGetErrorString(cuda_err, &cuda_err_description); 144 | throw std::runtime_error( 145 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 146 | " at " + std::string(__FILE__) + ":" +
std::to_string(__LINE__)); 147 | } 148 | 149 | // Copy dof indices to device 150 | cuda_err = cuMemcpyHtoD( 151 | _ddof_indices, dof_indices.data(), ddof_indices_size); 152 | if (cuda_err != CUDA_SUCCESS) { 153 | cuMemFree(_ddof_indices); 154 | if (_ddof_markers) 155 | cuMemFree(_ddof_markers); 156 | cuGetErrorString(cuda_err, &cuda_err_description); 157 | throw std::runtime_error( 158 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 159 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 160 | } 161 | } 162 | 163 | 164 | // Allocate device-side storage for dof values 165 | if (dof_markers && _num_dofs > 0) { 166 | size_t ddof_values_size = _num_dofs * sizeof(T); 167 | cuda_err = cuMemAlloc(&_ddof_values, ddof_values_size); 168 | if (cuda_err != CUDA_SUCCESS) { 169 | if (_ddof_indices) 170 | cuMemFree(_ddof_indices); 171 | if (_ddof_markers) 172 | cuMemFree(_ddof_markers); 173 | cuGetErrorString(cuda_err, &cuda_err_description); 174 | throw std::runtime_error( 175 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 176 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 177 | } 178 | 179 | // Copy dof values to device 180 | cuda_err = cuMemcpyHtoD( 181 | _ddof_values, _dof_values.data(), ddof_values_size); 182 | if (cuda_err != CUDA_SUCCESS) { 183 | cuMemFree(_ddof_values); 184 | if (_ddof_indices) 185 | cuMemFree(_ddof_indices); 186 | if (_ddof_markers) 187 | cuMemFree(_ddof_markers); 188 | cuGetErrorString(cuda_err, &cuda_err_description); 189 | throw std::runtime_error( 190 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 191 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 192 | } 193 | } 194 | } 195 | //----------------------------------------------------------------------------- 196 | /// Destructor 197 | ~CUDADirichletBC() 198 | { 199 | if (_ddof_values) 200 | cuMemFree(_ddof_values); 201 | if (_ddof_indices) 202 | cuMemFree(_ddof_indices); 203 | if (_ddof_markers) 204 | cuMemFree(_ddof_markers); 205 | } 206 | //----------------------------------------------------------------------------- 207 | /// Copy constructor 208 | /// @param[in] bc The object to be copied 209 | CUDADirichletBC(const CUDADirichletBC& bc) = delete; 210 | 211 | /// Move constructor 212 | /// @param[in] bc The object to be moved 213 | CUDADirichletBC(CUDADirichletBC&& bc) 214 | : _num_dofs(bc._num_dofs) 215 | , _num_owned_boundary_dofs(bc._num_owned_boundary_dofs) 216 | , _num_boundary_dofs(bc._num_boundary_dofs), _dof_values(std::move(bc._dof_values)) 217 | , _ddof_markers(bc._ddof_markers) 218 | , _ddof_indices(bc._ddof_indices) 219 | , _ddof_values(bc._ddof_values) 220 | { 221 | bc._num_dofs = 0; 222 | bc._num_owned_boundary_dofs = 0; 223 | bc._num_boundary_dofs = 0; 224 | bc._ddof_markers = 0; 225 | bc._ddof_indices = 0; 226 | bc._ddof_values = 0; 227 | } 228 | //----------------------------------------------------------------------------- 229 | /// Assignment operator 230 | /// @param[in] bc Another CUDADirichletBC object 231 | CUDADirichletBC& operator=(const CUDADirichletBC& bc) = delete; 232 | 233 | /// Move assignment operator 234 | /// @param[in] bc Another CUDADirichletBC object 235 | CUDADirichletBC& operator=(CUDADirichletBC&& bc) 236 | { 237 | _num_dofs = bc._num_dofs; 238 | _num_owned_boundary_dofs = bc._num_owned_boundary_dofs; 239 | _num_boundary_dofs = bc._num_boundary_dofs; _dof_values = std::move(bc._dof_values); 240 | _ddof_markers = bc._ddof_markers; 241 | _ddof_indices = bc._ddof_indices; 242 | _ddof_values = bc._ddof_values; 243 | bc._num_dofs = 0; 244 |
bc._num_owned_boundary_dofs = 0; 245 | bc._num_boundary_dofs = 0; 246 | bc._ddof_markers = 0; 247 | bc._ddof_indices = 0; 248 | bc._ddof_values = 0; 249 | return *this; 250 | } 251 | //----------------------------------------------------------------------------- 252 | 253 | /// Update device-side values for all provided boundary conditions 254 | /// The user is responsible for ensuring the provided conditions are in the original list 255 | void update(const std::vector<std::shared_ptr<const DirichletBC<T, U>>>& bcs) { 256 | for (auto const& bc: bcs) { 257 | bc->set(std::span(_dof_values), {}); 258 | } 259 | 260 | CUDA::safeMemcpyHtoD(_ddof_values, _dof_values.data(), _num_dofs * sizeof(T)); 261 | } 262 | 263 | /// Get the number of degrees of freedom 264 | int32_t num_dofs() const { return _num_dofs; } 265 | 266 | /// Get a handle to the device-side dof markers 267 | CUdeviceptr dof_markers() const { return _ddof_markers; } 268 | 269 | /// Get the number of owned degrees of freedom subject to boundary 270 | /// conditions 271 | int32_t num_owned_boundary_dofs() const { return _num_owned_boundary_dofs; } 272 | 273 | /// Get the number of degrees of freedom subject to boundary 274 | /// conditions 275 | int32_t num_boundary_dofs() const { return _num_boundary_dofs; } 276 | 277 | /// Get a handle to the device-side dof indices 278 | CUdeviceptr dof_indices() const { return _ddof_indices; } 279 | 280 | /// Get a handle to the device-side dofs for the values 281 | CUdeviceptr dof_value_indices() const { return _ddof_indices; } 282 | 283 | /// Get a handle to the device-side dof values 284 | CUdeviceptr dof_values() const { return _ddof_values; } 285 | 286 | private: 287 | /// The number of degrees of freedom 288 | int32_t _num_dofs; 289 | 290 | /// The number of degrees of freedom owned by the current process 291 | /// that are subject to the essential boundary conditions. 292 | int32_t _num_owned_boundary_dofs; 293 | 294 | /// The number of degrees of freedom that are subject to the 295 | /// essential boundary conditions, including ghost nodes. 296 | int32_t _num_boundary_dofs; 297 | 298 | /// A host-side vector with the values for the boundary conditions 299 | /// Used for cases when the boundary condition values change 300 | std::vector<T> _dof_values; 301 | 302 | /// Markers for each degree of freedom, indicating whether or not 303 | /// they are subject to essential boundary conditions 304 | CUdeviceptr _ddof_markers; 305 | 306 | /// Indices of the degrees of freedom that are subject to essential 307 | /// boundary conditions 308 | CUdeviceptr _ddof_indices; 309 | 310 | /// Device-side boundary condition values for each degree of freedom; 311 | /// only entries subject to essential boundary conditions are meaningful 312 | CUdeviceptr _ddof_values; 313 | }; 314 | 315 | } // namespace fem 316 | } // namespace dolfinx 317 | 318 |
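The constructor and update() above split the work so that the expensive part (building and copying markers and indices) happens once, while value refreshes are cheap. A hedged usage sketch, assuming a CUDA::Context `ctx`, a function space `V`, and host-side conditions `bcs` of type std::vector<std::shared_ptr<const dolfinx::fem::DirichletBC<PetscScalar>>>:

// Sketch only, not part of the library sources.
dolfinx::fem::CUDADirichletBC<PetscScalar> device_bc(ctx, *V, bcs); // markers, indices, values
// ... time-dependent boundary data changes on the host ...
device_bc.update(bcs); // re-packs the values and copies only _dof_values to the device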
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDADofMap.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "CUDADofMap.h" 8 | #include <cudolfinx/common/CUDA.h> 9 | #include <dolfinx/common/IndexMap.h> 10 | #include <dolfinx/fem/DofMap.h> 11 | #include <cuda.h> 12 | 13 | using namespace dolfinx; 14 | using namespace dolfinx::fem; 15 | 16 | //----------------------------------------------------------------------------- 17 | CUDADofMap::CUDADofMap() 18 | : _dofmap(nullptr) 19 | , _num_dofs() 20 | , _num_cells() 21 | , _num_dofs_per_cell(), _block_size() 22 | , _ddofs_per_cell(0) 23 | , _dcells_per_dof_ptr(0) 24 | , _dcells_per_dof(0) 25 | { 26 | } 27 | 28 | CUDADofMap::CUDADofMap( 29 | const dolfinx::fem::DofMap* dofmap) 30 | : CUDADofMap::CUDADofMap(*dofmap, nullptr) 31 | { 32 | } 33 | 34 | CUDADofMap::CUDADofMap( 35 | const dolfinx::fem::DofMap* dofmap, std::map<std::int32_t, std::int32_t>* restriction) 36 | : CUDADofMap::CUDADofMap(*dofmap, restriction) 37 | { 38 | } 39 | 40 | CUDADofMap::CUDADofMap( 41 | const dolfinx::fem::DofMap& dofmap) 42 | : CUDADofMap::CUDADofMap(dofmap, nullptr) 43 | { 44 | } 45 | 46 | //----------------------------------------------------------------------------- 47 | CUDADofMap::CUDADofMap( 48 | const dolfinx::fem::DofMap& dofmap, std::map<std::int32_t, std::int32_t>* restriction) 49 | : _dofmap(&dofmap) 50 | , _num_dofs() 51 | , _num_cells() 52 | , _num_dofs_per_cell() 53 | , _ddofs_per_cell(0) 54 | , _dcells_per_dof_ptr(0) 55 | , _dcells_per_dof(0) 56 | { 57 | CUresult cuda_err; 58 | const char * cuda_err_description; 59 | 60 | auto dofs = dofmap.map(); 61 | auto element_dof_layout = dofmap.element_dof_layout(); 62 | // get block sizes and ensure positivity (sometimes the default is -1) 63 | std::int32_t element_block_size = element_dof_layout.block_size(); 64 | _block_size = dofmap.bs(); 65 | element_block_size = (element_block_size < 0) ? 1 : element_block_size; 66 | _block_size = (_block_size < 0) ?
1 : _block_size; 67 | _num_cells = dofs.extent(0); 68 | _num_dofs_per_cell = element_dof_layout.num_dofs() * element_block_size; 69 | _num_dofs = dofs.size() * _block_size; 70 | if (_num_dofs != _num_cells * _num_dofs_per_cell) { 71 | throw std::runtime_error( 72 | "Num dofs " + std::to_string(_num_dofs) + " != " + std::to_string(_num_cells) + 73 | "*" + std::to_string(_num_dofs_per_cell) + " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 74 | } 75 | // Allocate device-side storage for degrees of freedom 76 | if (_num_cells > 0 && _num_dofs_per_cell > 0) { 77 | size_t ddofs_per_cell_size = _num_dofs * sizeof(int32_t); 78 | cuda_err = cuMemAlloc( 79 | &_ddofs_per_cell, 80 | ddofs_per_cell_size); 81 | if (cuda_err != CUDA_SUCCESS) { 82 | cuGetErrorString(cuda_err, &cuda_err_description); 83 | throw std::runtime_error( 84 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 85 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 86 | } 87 | } 88 | update(restriction); 89 | 90 | // cells_per_dof_ptr and cells_per_dof are only used for 91 | // lookup table computations, which currently aren't in use 92 | /* 93 | // Compute mapping from degrees of freedom to cells 94 | std::vector<std::int32_t> cells_per_dof_ptr(_num_dofs+1); 95 | 96 | // Count the number of cells containing each degree of freedom 97 | for (int32_t i = 0; i < _num_cells; i++) { 98 | auto cell_dofs = dofmap.cell_dofs(i); 99 | for (int32_t l = 0; l < cell_dofs.size(); l++) { 100 | int32_t j = cell_dofs[l]; 101 | cells_per_dof_ptr[j+1]++; 102 | } 103 | } 104 | 105 | // Compute offset to the first cell for each degree of freedom 106 | for (int32_t i = 0; i < _num_dofs; i++) 107 | cells_per_dof_ptr[i+1] += cells_per_dof_ptr[i]; 108 | int32_t num_dof_cells = cells_per_dof_ptr[_num_dofs]; 109 | if (num_dof_cells != _num_cells * _num_dofs_per_cell) { 110 | cuMemFree(_ddofs_per_cell); 111 | throw std::logic_error( 112 | "Expected " + std::to_string(_num_cells) + " cells, " + 113 | std::to_string(_num_dofs_per_cell) + " degrees of freedom per cell, " 114 | "but the mapping from degrees of freedom to cells contains " + 115 | std::to_string(num_dof_cells) + " values" ); 116 | } 117 | 118 | // Allocate storage for and compute the cells containing each degree 119 | // of freedom 120 | std::vector<std::int32_t> cells_per_dof(num_dof_cells); 121 | for (int32_t i = 0; i < _num_cells; i++) { 122 | auto cell_dofs = dofmap.cell_dofs(i); 123 | for (int32_t l = 0; l < cell_dofs.size(); l++) { 124 | int32_t j = cell_dofs[l]; 125 | int32_t p = cells_per_dof_ptr[j]; 126 | cells_per_dof[p] = i; 127 | cells_per_dof_ptr[j]++; 128 | } 129 | } 130 | 131 | // Adjust offsets to first cell 132 | for (int32_t i = _num_dofs; i > 0; i--) 133 | cells_per_dof_ptr[i] = cells_per_dof_ptr[i-1]; 134 | cells_per_dof_ptr[0] = 0; 135 | 136 | // Allocate device-side storage for offsets to the first cell 137 | // containing each degree of freedom 138 | if (_num_dofs > 0) { 139 | size_t dcells_per_dof_ptr_size = (_num_dofs+1) * sizeof(int32_t); 140 | cuda_err = cuMemAlloc( 141 | &_dcells_per_dof_ptr, dcells_per_dof_ptr_size); 142 | if (cuda_err != CUDA_SUCCESS) { 143 | cuGetErrorString(cuda_err, &cuda_err_description); 144 | cuMemFree(_ddofs_per_cell); 145 | throw std::runtime_error( 146 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 147 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 148 | } 149 | 150 | // Copy cell degrees of freedom to device 151 | cuda_err = cuMemcpyHtoD( 152 | _dcells_per_dof_ptr,
cells_per_dof_ptr.data(), dcells_per_dof_ptr_size); 153 | if (cuda_err != CUDA_SUCCESS) { 154 | cuGetErrorString(cuda_err, &cuda_err_description); 155 | cuMemFree(_dcells_per_dof_ptr); 156 | cuMemFree(_ddofs_per_cell); 157 | throw std::runtime_error( 158 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 159 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 160 | } 161 | } 162 | 163 | // Allocate device-side storage for cells containing each degree of freedom 164 | if (_num_cells > 0 && _num_dofs_per_cell > 0) { 165 | size_t dcells_per_dof_size = num_dof_cells * sizeof(int32_t); 166 | cuda_err = cuMemAlloc( 167 | &_dcells_per_dof, 168 | dcells_per_dof_size); 169 | if (cuda_err != CUDA_SUCCESS) { 170 | cuGetErrorString(cuda_err, &cuda_err_description); 171 | cuMemFree(_dcells_per_dof_ptr); 172 | cuMemFree(_ddofs_per_cell); 173 | throw std::runtime_error( 174 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 175 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 176 | } 177 | 178 | // Copy cell degrees of freedom to device 179 | cuda_err = cuMemcpyHtoD( 180 | _dcells_per_dof, cells_per_dof.data(), dcells_per_dof_size); 181 | if (cuda_err != CUDA_SUCCESS) { 182 | cuGetErrorString(cuda_err, &cuda_err_description); 183 | cuMemFree(_dcells_per_dof); 184 | cuMemFree(_dcells_per_dof_ptr); 185 | cuMemFree(_ddofs_per_cell); 186 | throw std::runtime_error( 187 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 188 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 189 | } 190 | }*/ 191 | } 192 | //----------------------------------------------------------------------------- 193 | void CUDADofMap::update(std::map<std::int32_t, std::int32_t>* restriction) 194 | { 195 | std::vector<std::int32_t> unrolled_dofs; 196 | const std::int32_t* dofs_per_cell, *dofs_orig; 197 | auto dofs = _dofmap->map(); 198 | dofs_orig = dofs.data_handle(); 199 | 200 | if (restriction) { 201 | unrolled_dofs.resize(_num_dofs); 202 | for (std::size_t i = 0; i < dofs.size(); i++) { 203 | const std::int32_t dof = dofs_orig[i]; 204 | if (restriction->find(dof) != restriction->end()) { 205 | std::int32_t mapped_dof = (*restriction)[dof]; 206 | for (int j = 0; j < _block_size; j++) 207 | unrolled_dofs[i*_block_size + j] = mapped_dof*_block_size + j; 208 | } 209 | else { 210 | for (int j = 0; j < _block_size; j++) 211 | unrolled_dofs[i*_block_size + j] = -1; // we should not be using this degree of freedom 212 | } 213 | } 214 | dofs_per_cell = unrolled_dofs.data(); 215 | } 216 | else if (_block_size == 1) { 217 | dofs_per_cell = dofs_orig; 218 | } 219 | else { 220 | unrolled_dofs.resize(_num_dofs); 221 | for (std::size_t i = 0; i < _num_dofs; i++) 222 | unrolled_dofs[i] = _block_size*dofs_orig[i/_block_size] + i%_block_size; 223 | 224 | dofs_per_cell = unrolled_dofs.data(); 225 | } 226 | 227 | // Copy cell degrees of freedom to device 228 | if (_num_cells > 0 && _num_dofs_per_cell > 0) { 229 | size_t ddofs_per_cell_size = _num_dofs * sizeof(int32_t); 230 | CUDA::safeMemcpyHtoD(_ddofs_per_cell, dofs_per_cell, ddofs_per_cell_size); 231 | } 232 | 233 | } 234 | //----------------------------------------------------------------------------- 235 | CUDADofMap::~CUDADofMap() 236 | { 237 | if (_dcells_per_dof) 238 | cuMemFree(_dcells_per_dof); 239 | if (_dcells_per_dof_ptr) 240 | cuMemFree(_dcells_per_dof_ptr); 241 | if (_ddofs_per_cell) 242 | cuMemFree(_ddofs_per_cell); 243 | } 244 | //----------------------------------------------------------------------------- 245 |
CUDADofMap::CUDADofMap(CUDADofMap&& dofmap) 246 | : _dofmap(dofmap._dofmap) 247 | , _num_dofs(dofmap._num_dofs) 248 | , _num_cells(dofmap._num_cells) 249 | , _num_dofs_per_cell(dofmap._num_dofs_per_cell), _block_size(dofmap._block_size) 250 | , _ddofs_per_cell(dofmap._ddofs_per_cell) 251 | , _dcells_per_dof_ptr(dofmap._dcells_per_dof_ptr) 252 | , _dcells_per_dof(dofmap._dcells_per_dof) 253 | { 254 | dofmap._dofmap = nullptr; 255 | dofmap._num_dofs = 0; 256 | dofmap._num_cells = 0; 257 | dofmap._num_dofs_per_cell = 0; dofmap._block_size = 0; 258 | dofmap._ddofs_per_cell = 0; 259 | dofmap._dcells_per_dof_ptr = 0; 260 | dofmap._dcells_per_dof = 0; 261 | } 262 | //----------------------------------------------------------------------------- 263 | CUDADofMap& CUDADofMap::operator=(CUDADofMap&& dofmap) 264 | { 265 | _dofmap = dofmap._dofmap; 266 | _num_dofs = dofmap._num_dofs; 267 | _num_cells = dofmap._num_cells; 268 | _num_dofs_per_cell = dofmap._num_dofs_per_cell; _block_size = dofmap._block_size; 269 | _ddofs_per_cell = dofmap._ddofs_per_cell; 270 | _dcells_per_dof_ptr = dofmap._dcells_per_dof_ptr; 271 | _dcells_per_dof = dofmap._dcells_per_dof; 272 | dofmap._dofmap = nullptr; 273 | dofmap._num_dofs = 0; 274 | dofmap._num_cells = 0; 275 | dofmap._num_dofs_per_cell = 0; dofmap._block_size = 0; 276 | dofmap._ddofs_per_cell = 0; 277 | dofmap._dcells_per_dof_ptr = 0; 278 | dofmap._dcells_per_dof = 0; 279 | return *this; 280 | } 281 | //----------------------------------------------------------------------------- 282 |
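The unrolling branch of update() above turns blocked dof indices into scalar indices: entry i of the unrolled array reads blocked dof i/B and adds the component i%B, for block size B. A small self-contained instance with block size 3 (for example, a 3D vector element):

// Sketch only: a worked example of the unrolling arithmetic.
#include <cassert>
#include <cstdint>
#include <vector>

int main()
{
  // Blocked cell dofs {7, 2} with block size 3 unroll to {21, 22, 23, 6, 7, 8}.
  std::vector<std::int32_t> dofs_orig = {7, 2};
  const std::int32_t bs = 3;
  std::vector<std::int32_t> unrolled(dofs_orig.size() * bs);
  for (std::size_t i = 0; i < unrolled.size(); i++)
    unrolled[i] = bs * dofs_orig[i / bs] + i % bs;
  assert((unrolled == std::vector<std::int32_t>{21, 22, 23, 6, 7, 8}));
  return 0;
}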
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDADofMap.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cuda.h> 10 | #include <cstdint> 11 | #include <map> 12 | 13 | namespace dolfinx { 14 | namespace fem { 15 | class DofMap; 16 | 17 | /// A wrapper for a cellwise-to-global mapping of degrees of freedom 18 | /// that is stored in the device memory of a CUDA device. 19 | class CUDADofMap 20 | { 21 | public: 22 | /// Create an empty dofmap 23 | CUDADofMap(); 24 | 25 | /// Create a dofmap 26 | /// 27 | /// @param[in] dofmap The dofmap to copy to device memory 28 | CUDADofMap(const dolfinx::fem::DofMap& dofmap, std::map<std::int32_t, std::int32_t>* restriction); 29 | 30 | // constructors without restriction 31 | CUDADofMap(const dolfinx::fem::DofMap* dofmap); 32 | 33 | CUDADofMap(const dolfinx::fem::DofMap& dofmap); 34 | 35 | /// Alternate constructor 36 | CUDADofMap(const dolfinx::fem::DofMap* dofmap, std::map<std::int32_t, std::int32_t>* restriction); 37 | 38 | /// Destructor 39 | ~CUDADofMap(); 40 | 41 | /// Copy constructor 42 | /// @param[in] dofmap The object to be copied 43 | CUDADofMap(const CUDADofMap& dofmap) = delete; 44 | 45 | /// Move constructor 46 | /// @param[in] dofmap The object to be moved 47 | CUDADofMap(CUDADofMap&& dofmap); 48 | 49 | /// Assignment operator 50 | /// @param[in] dofmap Another CUDADofMap object 51 | CUDADofMap& operator=(const CUDADofMap& dofmap) = delete; 52 | 53 | /// Move assignment operator 54 | /// @param[in] dofmap Another CUDADofMap object 55 | CUDADofMap& operator=(CUDADofMap&& dofmap); 56 | 57 | /// Update the dofmap on the device, possibly with a new restriction 58 | void update(std::map<std::int32_t, std::int32_t>* restriction); 59 | 60 | /// Get the underlying dofmap on the host 61 | const dolfinx::fem::DofMap* dofmap() const { return _dofmap; } 62 | 63 | /// Get the number of degrees of freedom 64 | int32_t num_dofs() const { return _num_dofs; } 65 | 66 | /// Get the number of cells 67 | int32_t num_cells() const { return _num_cells; } 68 | 69 | /// Get the number of dofs per cell 70 | int32_t num_dofs_per_cell() const { 71 | return _num_dofs_per_cell; } 72 | 73 | /// Get a handle to the device-side dofs of each cell 74 | CUdeviceptr dofs_per_cell() const { 75 | return _ddofs_per_cell; } 76 | 77 | /// Get the offsets to the first cell containing each degree of freedom 78 | CUdeviceptr cells_per_dof_ptr() const { 79 | return _dcells_per_dof_ptr; } 80 | 81 | /// Get the cells containing each degree of freedom 82 | CUdeviceptr cells_per_dof() const { 83 | return _dcells_per_dof; } 84 | 85 | private: 86 | /// The underlying dofmap on the host 87 | const dolfinx::fem::DofMap* _dofmap; 88 | 89 | /// The number of degrees of freedom 90 | int32_t _num_dofs; 91 | 92 | /// The number of cells in the mesh 93 | int32_t _num_cells; 94 | 95 | /// The number of degrees of freedom in each cell 96 | int32_t _num_dofs_per_cell; 97 | 98 | /// The block size 99 | int32_t _block_size; 100 | 101 | /// The degrees of freedom of each cell 102 | CUdeviceptr _ddofs_per_cell; 103 | 104 | /// Offsets to the first cell containing each degree of freedom 105 | CUdeviceptr _dcells_per_dof_ptr; 106 | 107 | /// The cells containing each degree of freedom 108 | CUdeviceptr _dcells_per_dof; 109 | }; 110 | 111 | } // namespace fem 112 | } // namespace dolfinx 113 | 114 |
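A sketch of the restriction mechanism declared above, assuming `dm` is a const dolfinx::fem::DofMap* for the host dofmap: dofs present in the map are renumbered, and all others become -1 and are skipped by the assembly kernels.

// Sketch only: keep only host dofs 0 and 2, renumbering them to 0 and 1.
std::map<std::int32_t, std::int32_t> restriction = {{0, 0}, {2, 1}};
dolfinx::fem::CUDADofMap cuda_dofmap(dm, &restriction);
// The restriction can later be changed without reallocating device storage:
cuda_dofmap.update(&restriction);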
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDAForm.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cudolfinx/common/CUDAStore.h> 11 | #include <cudolfinx/fem/CUDADirichletBC.h> 12 | #include <cudolfinx/fem/CUDADofMap.h> 13 | #include <cudolfinx/fem/CUDAFormCoefficients.h> 14 | #include <cudolfinx/fem/CUDAFormConstants.h> 15 | #include <cudolfinx/fem/CUDAFormIntegral.h> 16 | #include <dolfinx/fem/Form.h> 17 | #include <ufcx.h> 18 | #include <map> 19 | #include <memory> 20 | #include <string> 21 | #include <vector> 22 | 23 | namespace dolfinx { 24 | 25 | namespace fem { 26 | 27 | /// Consolidates all form classes into one 28 | template <typename T, 29 | typename U = dolfinx::scalar_value_type_t<T>> 30 | class CUDAForm 31 | { 32 | 33 | public: 34 | /// Create GPU copies of data needed for assembly 35 | /// 36 | /// @param[in] cuda_context A context for a CUDA device 37 | /// @param[in] form Pointer to the variational form 38 | CUDAForm( 39 | const CUDA::Context& cuda_context, 40 | Form<T, U>* form, 41 | ufcx_form* ufcx_form, 42 | std::vector<std::string>& tabulate_tensor_names, 43 | std::vector<std::string>& tabulate_tensor_sources 44 | ) 45 | : _coefficients(cuda_context, form, _dofmap_store) 46 | , _constants(cuda_context, form) 47 | , _form(form) 48 | , _ufcx_form(ufcx_form) 49 | , _compiled(false) 50 | { 51 | _coefficients = CUDAFormCoefficients<T, U>(cuda_context, form, _dofmap_store); 52 | const int* integral_offsets = ufcx_form->form_integral_offsets; 53 | if (integral_offsets[3] != tabulate_tensor_names.size()) { 54 | throw std::runtime_error("UFCx form has " + std::to_string(integral_offsets[3]) 55 | + " integrals, but only " + std::to_string(tabulate_tensor_names.size()) 56 | + " tabulate tensor sources provided to CUDAForm!" 57 | ); 58 | } 59 | for (int i = 0; i < 3; i++) { 60 | for (int offset = integral_offsets[i]; offset < integral_offsets[i+1]; offset++) { 61 | int id = ufcx_form->form_integral_ids[offset]; 62 | _cuda_integrals[i].insert({id, {tabulate_tensor_names[offset], tabulate_tensor_sources[offset]}}); 63 | } 64 | } 65 | } 66 | 67 | /// Compile form on GPU 68 | /// Under the hood, this creates the integrals 69 | void compile( 70 | const CUDA::Context& cuda_context, 71 | int32_t max_threads_per_block, 72 | int32_t min_blocks_per_multiprocessor, 73 | enum assembly_kernel_type assembly_kernel_type) 74 | { 75 | auto cujit_target = CUDA::get_cujit_target(cuda_context); 76 | _integrals = cuda_form_integrals( 77 | cuda_context, cujit_target, *_form, _cuda_integrals, assembly_kernel_type, 78 | max_threads_per_block, min_blocks_per_multiprocessor, false, NULL, false); 79 | _compiled = true; 80 | } 81 | 82 | /// Copy constructor 83 | CUDAForm(const CUDAForm& form) = delete; 84 | 85 | /// Move constructor 86 | CUDAForm(CUDAForm&& form) = default; 87 | 88 | /// Destructor 89 | virtual ~CUDAForm() = default; 90 | 91 | bool compiled() { return _compiled; } 92 | 93 | bool restricted() { return _restricted_dofmaps.size() > 0; } 94 | 95 | std::map<IntegralType, std::vector<CUDAFormIntegral<T, U>>>& integrals() { 96 | if (!_compiled) { 97 | throw std::runtime_error("Cannot access integrals for uncompiled cuda form!"); 98 | } 99 | return _integrals; 100 | } 101 | 102 | CUDAFormCoefficients<T, U>& coefficients() { return _coefficients; } 103 | 104 | const CUDAFormConstants<T>& constants() { return _constants; } 105 | 106 | std::shared_ptr<CUDADofMap> unrestricted_dofmap(size_t i) { 107 | if (i >= _form->function_spaces().size()) throw std::runtime_error("Dofmap index out of bounds!"); 108 | return _dofmap_store.get_device_object(_form->function_spaces()[i]->dofmap().get()); 109 | } 110 | 111 | std::shared_ptr<CUDADofMap> dofmap(size_t i) { 112 | if (!restricted()) return unrestricted_dofmap(i); 113 | if (i >= _restricted_dofmaps.size()) throw std::runtime_error("Dofmap index out of bounds!"); 114 | return _restricted_dofmaps[i]; 115 | } 116 | 117 | Form<T, U>* form() {
return _form; } 118 | 119 | CUDADirichletBC<T, U> bc( 120 | const CUDA::Context& cuda_context, size_t i, 121 | std::vector<std::shared_ptr<const DirichletBC<T, U>>> bcs) 122 | { 123 | return CUDADirichletBC<T, U>(cuda_context, *_form->function_spaces()[i], bcs); 124 | } 125 | 126 | /// Copy the coefficient and constant data to the device 127 | /// This can be necessary if either changes on the host 128 | void to_device(const CUDA::Context& cuda_context) 129 | { 130 | _coefficients.copy_coefficients_to_device(cuda_context); 131 | _constants.update_constant_values(); 132 | } 133 | 134 | void set_restriction(std::vector<std::shared_ptr<std::map<std::int32_t, std::int32_t>>> restriction) 135 | { 136 | if (restriction.size() != _form->function_spaces().size()) { 137 | throw std::runtime_error("Number of restrictions must equal arity of form (1 for vector, 2 for matrix)!"); 138 | } 139 | 140 | if (_restricted_dofmaps.size()) { 141 | // need to update the restriction 142 | for (int i = 0; i < _restricted_dofmaps.size(); i++) { 143 | _restricted_dofmaps[i]->update(restriction[i].get()); 144 | } 145 | } 146 | else { 147 | for (int i = 0; i < restriction.size(); i++) { 148 | _restricted_dofmaps.push_back( 149 | std::make_shared<CUDADofMap>( 150 | _form->function_spaces()[i]->dofmap().get(), 151 | restriction[i].get() 152 | ) 153 | ); 154 | } 155 | } 156 | } 157 | 158 | private: 159 | // Cache of CUDADofMaps 160 | common::CUDAStore<DofMap, CUDADofMap> _dofmap_store; 161 | // Restricted dofmaps 162 | std::vector<std::shared_ptr<CUDADofMap>> _restricted_dofmaps; 163 | // Form coefficients 164 | CUDAFormCoefficients<T, U> _coefficients; 165 | // Form Constants 166 | CUDAFormConstants<T> _constants; 167 | // Compiled CUDA kernels 168 | std::map<IntegralType, std::vector<CUDAFormIntegral<T, U>>> _integrals; 169 | // CUDA tabulate tensors 170 | std::array<std::map<int, std::pair<std::string, std::string>>, 4> _cuda_integrals; 171 | bool _compiled; 172 | Form<T, U>* _form; 173 | ufcx_form* _ufcx_form; 174 | }; 175 | 176 | } // end namespace fem 177 | 178 | } // end namespace dolfinx 179 |
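A typical lifecycle for the class above, sketched under the assumption that the caller has already JIT-compiled the UFCx form and collected the tabulate_tensor kernel names and CUDA sources (the Python layer in python/cudolfinx/form.py plays this role); `ctx`, `form`, `ufcx_form`, `names`, `sources` and `kernel_type` are all assumed inputs:

// Sketch only, not part of the library sources.
dolfinx::fem::CUDAForm<PetscScalar> cuda_form(ctx, form, ufcx_form, names, sources);
cuda_form.compile(ctx, /*max_threads_per_block=*/1024,
                  /*min_blocks_per_multiprocessor=*/1, kernel_type);
// ... host-side coefficient or constant data changes ...
cuda_form.to_device(ctx); // push updated coefficients and constants back to the GPU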
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/CUDAFormConstants.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <dolfinx/fem/Form.h> 11 | #include <dolfinx/fem/utils.h> 12 | #include <cuda.h> 13 | 14 | namespace dolfinx { 15 | namespace fem { 16 | 17 | /// A wrapper for a form constant with data that is stored in the 18 | /// device memory of a CUDA device. 19 | template <typename T> 20 | class CUDAFormConstants 21 | { 22 | public: 23 | 24 | /// Create an empty collection of constant values 25 | CUDAFormConstants() 26 | : _form(nullptr) 27 | , _num_constant_values() 28 | , _dconstant_values(0) 29 | { 30 | } 31 | //----------------------------------------------------------------------------- 32 | /// Create a collection of constant values from a given form 33 | /// 34 | /// @param[in] cuda_context A context for a CUDA device 35 | /// @param[in] form The variational form whose constants are used 36 | CUDAFormConstants( 37 | const CUDA::Context& cuda_context, 38 | const Form<T>* form) 39 | : _form(form) 40 | , _num_constant_values() 41 | , _dconstant_values(0) 42 | { 43 | CUresult cuda_err; 44 | const char * cuda_err_description; 45 | 46 | const std::vector<T> 47 | constant_values = pack_constants(*_form); 48 | 49 | // Allocate device-side storage for constant values 50 | _num_constant_values = constant_values.size(); 51 | if (_num_constant_values > 0) { 52 | size_t dconstant_values_size = 53 | _num_constant_values * sizeof(T); 54 | cuda_err = cuMemAlloc( 55 | &_dconstant_values, dconstant_values_size); 56 | if (cuda_err != CUDA_SUCCESS) { 57 | cuGetErrorString(cuda_err, &cuda_err_description); 58 | throw std::runtime_error( 59 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 60 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 61 | } 62 | 63 | // Copy constant values to device 64 | cuda_err = cuMemcpyHtoD( 65 | _dconstant_values, constant_values.data(), dconstant_values_size); 66 | if (cuda_err != CUDA_SUCCESS) { 67 | cuMemFree(_dconstant_values); 68 | cuGetErrorString(cuda_err, &cuda_err_description); 69 | throw std::runtime_error( 70 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 71 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 72 | } 73 | } 74 | } 75 | //----------------------------------------------------------------------------- 76 | /// Destructor 77 | ~CUDAFormConstants() 78 | { 79 | if (_dconstant_values) 80 | cuMemFree(_dconstant_values); 81 | } 82 | //----------------------------------------------------------------------------- 83 | /// Copy constructor 84 | /// @param[in] form_constant The object to be copied 85 | CUDAFormConstants(const CUDAFormConstants& form_constant) = delete; 86 | 87 | /// Move constructor 88 | /// @param[in] form_constant The object to be moved 89 | CUDAFormConstants(CUDAFormConstants&& constants) 90 | : _form(constants._form) 91 | , _num_constant_values(constants._num_constant_values) 92 | , _dconstant_values(constants._dconstant_values) 93 | { 94 | constants._form = nullptr; 95 | constants._num_constant_values = 0; 96 | constants._dconstant_values = 0; 97 | } 98 | //----------------------------------------------------------------------------- 99 | /// Assignment operator 100 | /// @param[in] form_constant Another CUDAFormConstants object 101 | CUDAFormConstants& operator=(const CUDAFormConstants& form_constant) = delete; 102 | 103 | /// Move assignment operator 104 | /// @param[in] form_constant Another CUDAFormConstants object 105 | CUDAFormConstants& operator=(CUDAFormConstants&& constants) 106 | { 107 | _form = constants._form; 108 | _num_constant_values = constants._num_constant_values; 109 | _dconstant_values = constants._dconstant_values; 110 | constants._form = nullptr; 111 | constants._num_constant_values = 0; 112 | constants._dconstant_values = 0; 113 | return *this; 114 | } 115 |
//----------------------------------------------------------------------------- 116 | /// Get the number of constant values 117 | int32_t num_constant_values() const { return _num_constant_values; } 118 | 119 | /// Get a handle to the device-side constant values 120 | CUdeviceptr constant_values() const { return _dconstant_values; } 121 | 122 | /// Update the constant values by copying values from host to device 123 | void update_constant_values() const 124 | { 125 | CUresult cuda_err; 126 | const char * cuda_err_description; 127 | 128 | // Pack constants into an array 129 | const std::vector<T> 130 | constant_values = pack_constants(*_form); 131 | assert(_num_constant_values == constant_values.size()); 132 | 133 | // Copy constant values to device 134 | if (_num_constant_values > 0) { 135 | size_t dconstant_values_size = 136 | _num_constant_values * sizeof(T); 137 | cuda_err = cuMemcpyHtoD( 138 | _dconstant_values, constant_values.data(), dconstant_values_size); 139 | if (cuda_err != CUDA_SUCCESS) { 140 | cuMemFree(_dconstant_values); 141 | cuGetErrorString(cuda_err, &cuda_err_description); 142 | throw std::runtime_error( 143 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 144 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 145 | } 146 | } 147 | } 148 | //----------------------------------------------------------------------------- 149 | 150 | 151 | private: 152 | // The form that the constants apply to 153 | const Form<T>* _form; 154 | 155 | /// The number of constant values 156 | int32_t _num_constant_values; 157 | 158 | /// The constant values 159 | CUdeviceptr _dconstant_values; 160 | }; 161 | 162 | } // namespace fem 163 | } // namespace dolfinx 164 | 165 |
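Because update_constant_values() re-packs from the host form each time, mutating a dolfinx::fem::Constant on the host is enough; the device copy is then refreshed explicitly. A minimal sketch, assuming `c` is a Constant referenced by the form wrapped in `cuda_form`:

// Sketch only: Constant stores its flattened host-side data in `value`.
c->value[0] = 2.5;
cuda_form.constants().update_constant_values(); // one cuMemcpyHtoD of the packed array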
-------------------------------------------------------------------------------- /cpp/cudolfinx/fem/petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/la/CUDAMatrix.h> 10 | #include <cudolfinx/la/petsc.h> 11 | #include <dolfinx/common/IndexMap.h> 12 | #include <dolfinx/fem/Form.h> 13 | #include <dolfinx/fem/utils.h> 14 | #include <dolfinx/la/SparsityPattern.h> 15 | #include <dolfinx/la/petsc.h> 16 | #include <dolfinx/mesh/Mesh.h> 17 | #include <petscmat.h> 18 | #include <map> 19 | #include <memory> 20 | #include <vector> 21 | 22 | namespace dolfinx::fem 23 | { 24 | 25 | namespace petsc 26 | { 27 | 28 | template <typename T> 29 | Mat create_cuda_matrix(const Form<T>& a) 30 | { 31 | la::SparsityPattern pattern = fem::create_sparsity_pattern(a); 32 | pattern.finalize(); 33 | return la::petsc::create_cuda_matrix(a.mesh()->comm(), pattern); 34 | } 35 | 36 | } // namespace petsc 37 | } // namespace dolfinx::fem 38 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_la 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMatrix.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDASeqMatrix.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAVector.h 5 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.h 6 | PARENT_SCOPE 7 | ) 8 | 9 | target_sources( 10 | cudolfinx 11 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMatrix.cpp 12 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDASeqMatrix.cpp 13 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAVector.cpp 14 | ${CMAKE_CURRENT_SOURCE_DIR}/petsc.cpp 15 | ) 16 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAMatrix.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "CUDAMatrix.h" 8 | #include <cudolfinx/common/CUDA.h> 9 | #include <cudolfinx/la/CUDASeqMatrix.h> 10 | #include <cudolfinx/la/petsc.h> 11 | #include <dolfinx/la/petsc.h> 12 | #include <petscis.h> 13 | #include <petscmat.h> 14 | #include <algorithm> 15 | #include <iostream> 16 | 17 | using namespace dolfinx; 18 | using namespace dolfinx::la; 19 | 20 | //----------------------------------------------------------------------------- 21 | CUDAMatrix::CUDAMatrix() 22 | : _A(nullptr) 23 | , _diag() 24 | , _offdiag() 25 | , _dcolmap(0), _dcolmap_sorted(0), _dcolmap_sorted_indices(0) 26 | , _num_rows() 27 | , _num_columns() 28 | , _local_row_start() 29 | , _local_row_end() 30 | , _num_local_rows() 31 | , _num_local_columns() 32 | , _num_local_offdiag_columns() 33 | { 34 | } 35 | //----------------------------------------------------------------------------- 36 | CUDAMatrix::CUDAMatrix( 37 | const CUDA::Context& cuda_context, 38 | Mat A, 39 | bool page_lock_values, 40 | bool use_seqaijcusparsegetarray) 41 | : _A(A) 42 | , _diag() 43 | , _offdiag() 44 | , _dcolmap(0), _dcolmap_sorted(0), _dcolmap_sorted_indices(0) 45 | , _num_rows() 46 | , _num_columns() 47 | , _local_row_start() 48 | , _local_row_end() 49 | , _num_local_rows() 50 | , _num_local_columns() 51 | , _num_local_offdiag_columns() 52 | { 53 | PetscErrorCode ierr; 54 | CUresult cuda_err; 55 | const char * cuda_err_description; 56 | 57 | // Check the type of matrix 58 | MatType matrix_type; 59 | ierr = MatGetType(A, &matrix_type); 60 | if (ierr != 0) 61 | la::petsc::error(ierr, __FILE__, "MatGetType"); 62 | 63 | // Get the number of matrix rows and columns 64 | ierr = MatGetSize(A, &_num_rows, &_num_columns); 65 | if (ierr != 0) 66 | la::petsc::error(ierr, __FILE__, "MatGetSize"); 67 | 68 | // Get the number of rows and columns owned by the current MPI process 69 | ierr = MatGetLocalSize(A, &_num_local_rows, &_num_local_columns); 70 | if (ierr != 0) 71 | la::petsc::error(ierr, __FILE__, "MatGetLocalSize"); 72 | 73 | // TODO: We might need to do some additional work to handle non-zero 74 | //
local_row_start. 75 | ierr = MatGetOwnershipRange(A, &_local_row_start, &_local_row_end); 76 | if (ierr != 0) 77 | la::petsc::error(ierr, __FILE__, "MatGetOwnershipRange"); 78 | 79 | if (strcmp(matrix_type, MATSEQAIJ) == 0 || 80 | strcmp(matrix_type, MATSEQAIJCUSPARSE) == 0) 81 | { 82 | // A non-distributed matrix only has a diagonal part 83 | _diag = std::make_unique<CUDASeqMatrix>( 84 | cuda_context, A, page_lock_values, use_seqaijcusparsegetarray); 85 | } else if (strcmp(matrix_type, MATMPIAIJ) == 0 || 86 | strcmp(matrix_type, MATMPIAIJCUSPARSE) == 0) 87 | { 88 | // For a distributed matrix, we obtain local diagonal and 89 | // off-diagonal blocks using MatMPIAIJGetSeqAIJ(). 90 | Mat Ad, Ao; 91 | const int * colmap; 92 | ierr = MatMPIAIJGetSeqAIJ(A, &Ad, &Ao, &colmap); 93 | if (ierr != 0) 94 | la::petsc::error(ierr, __FILE__, "MatMPIAIJGetSeqAIJ"); 95 | _diag = std::make_unique<CUDASeqMatrix>( 96 | cuda_context, Ad, page_lock_values, use_seqaijcusparsegetarray); 97 | _offdiag = std::make_unique<CUDASeqMatrix>( 98 | cuda_context, Ao, page_lock_values, use_seqaijcusparsegetarray); 99 | 100 | // Get the number of columns in the off-diagonal part of the local 101 | // matrix. 102 | ierr = MatGetLocalSize(Ao, NULL, &_num_local_offdiag_columns); 103 | if (ierr != 0) 104 | la::petsc::error(ierr, __FILE__, "MatGetLocalSize"); 105 | 106 | // Convert the column map from global numbering to the 107 | // process-local numbering 108 | ISLocalToGlobalMapping cmapping; 109 | ierr = MatGetLocalToGlobalMapping(A, NULL, &cmapping); 110 | if (ierr != 0) 111 | la::petsc::error(ierr, __FILE__, "MatGetLocalToGlobalMapping"); 112 | 113 | std::vector<int> colmap_local(_num_local_offdiag_columns); 114 | ierr = ISGlobalToLocalMappingApply( 115 | cmapping, IS_GTOLM_MASK, _num_local_offdiag_columns, colmap, 116 | NULL, colmap_local.data()); 117 | if (ierr != 0) 118 | la::petsc::error(ierr, __FILE__, "ISGlobalToLocalMappingApply"); 119 | 120 | // Allocate device-side storage for off-diagonal column map 121 | if (_num_local_offdiag_columns > 0) { 122 | std::vector<std::pair<int, int>> combined; 123 | for (int i = 0; i < colmap_local.size(); i++) { 124 | combined.emplace_back(colmap_local[i], i); 125 | } 126 | std::sort(combined.begin(), combined.end(), 127 | [](const std::pair<int, int>& a, const std::pair<int, int>& b) { 128 | return a.first < b.first; 129 | }); 130 | std::vector<int> colmap_sorted(combined.size()); 131 | std::vector<int> colmap_sorted_indices(combined.size()); 132 | 133 | for (int i = 0; i < combined.size(); i++) { 134 | colmap_sorted[i] = combined[i].first; 135 | colmap_sorted_indices[i] = combined[i].second; 136 | } 137 | 138 | dolfinx::CUDA::safeVectorCreate(&_dcolmap, colmap_local); 139 | dolfinx::CUDA::safeVectorCreate(&_dcolmap_sorted, colmap_sorted); 140 | dolfinx::CUDA::safeVectorCreate(&_dcolmap_sorted_indices, colmap_sorted_indices); 141 | } 142 | 143 | } else { 144 | throw std::runtime_error( 145 | "Unsupported matrix type '" + std::string(matrix_type) + "' " 146 | "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 147 | } 148 | } 149 | //----------------------------------------------------------------------------- 150 | CUDAMatrix::~CUDAMatrix() 151 | { if (_dcolmap_sorted_indices) cuMemFree(_dcolmap_sorted_indices); if (_dcolmap_sorted) cuMemFree(_dcolmap_sorted); if (_dcolmap) cuMemFree(_dcolmap); 152 | } 153 | //----------------------------------------------------------------------------- 154 | CUDAMatrix::CUDAMatrix(CUDAMatrix&& matrix) 155 | : _A(matrix._A) 156 | , _diag(std::move(matrix._diag)) 157 | , _offdiag(std::move(matrix._offdiag)) 158 | , _dcolmap(matrix._dcolmap), _dcolmap_sorted(matrix._dcolmap_sorted), _dcolmap_sorted_indices(matrix._dcolmap_sorted_indices) 159 | , _num_rows(matrix._num_rows) 160 | , _num_columns(matrix._num_columns) 161 | , _local_row_start(matrix._local_row_start) 162 | ,
_local_row_end(matrix._local_row_end) 163 | , _num_local_rows(matrix._num_local_rows) 164 | , _num_local_columns(matrix._num_local_columns) 165 | , _num_local_offdiag_columns(matrix._num_local_offdiag_columns) 166 | { 167 | matrix._A = nullptr; 168 | matrix._diag = nullptr; 169 | matrix._offdiag = nullptr; 170 | matrix._dcolmap = 0; matrix._dcolmap_sorted = 0; matrix._dcolmap_sorted_indices = 0; 171 | matrix._num_rows = 0; 172 | matrix._num_columns = 0; 173 | matrix._local_row_start = 0; 174 | matrix._local_row_end = 0; 175 | matrix._num_local_rows = 0; 176 | matrix._num_local_columns = 0; 177 | matrix._num_local_offdiag_columns = 0; 178 | } 179 | //----------------------------------------------------------------------------- 180 | CUDAMatrix& CUDAMatrix::operator=(CUDAMatrix&& matrix) 181 | { 182 | _A = matrix._A; 183 | _diag = std::move(matrix._diag); 184 | _offdiag = std::move(matrix._offdiag); 185 | _dcolmap = matrix._dcolmap; _dcolmap_sorted = matrix._dcolmap_sorted; _dcolmap_sorted_indices = matrix._dcolmap_sorted_indices; 186 | _num_rows = matrix._num_rows; 187 | _num_columns = matrix._num_columns; 188 | _local_row_start = matrix._local_row_start; 189 | _local_row_end = matrix._local_row_end; 190 | _num_local_rows = matrix._num_local_rows; 191 | _num_local_columns = matrix._num_local_columns; 192 | _num_local_offdiag_columns = matrix._num_local_offdiag_columns; 193 | matrix._A = nullptr; 194 | matrix._diag = nullptr; 195 | matrix._offdiag = nullptr; 196 | matrix._dcolmap = 0; matrix._dcolmap_sorted = 0; matrix._dcolmap_sorted_indices = 0; 197 | matrix._num_rows = 0; 198 | matrix._num_columns = 0; 199 | matrix._local_row_start = 0; 200 | matrix._local_row_end = 0; 201 | matrix._num_local_rows = 0; 202 | matrix._num_local_columns = 0; 203 | matrix._num_local_offdiag_columns = 0; 204 | return *this; 205 | } 206 | //----------------------------------------------------------------------------- 207 | void CUDAMatrix::copy_matrix_values_to_host( 208 | const CUDA::Context& cuda_context) 209 | { 210 | if (_diag) 211 | _diag->copy_matrix_values_to_host(cuda_context); 212 | if (_offdiag) 213 | _offdiag->copy_matrix_values_to_host(cuda_context); 214 | } 215 | //----------------------------------------------------------------------------- 216 | void CUDAMatrix::apply(MatAssemblyType type) 217 | { 218 | PetscErrorCode ierr; 219 | ierr = MatAssemblyBegin(_A, type); 220 | if (ierr != 0) 221 | petsc::error(ierr, __FILE__, "MatAssemblyBegin"); 222 | ierr = MatAssemblyEnd(_A, type); 223 | if (ierr != 0) 224 | petsc::error(ierr, __FILE__, "MatAssemblyEnd"); 225 | } 226 | //----------------------------------------------------------------------------- 227 | void CUDAMatrix::debug_dump() 228 | { 229 | if (_diag) { 230 | std::cout << "Dumping diag matrix." << std::endl; 231 | _diag->debug_dump(); 232 | } 233 | if (_offdiag) { 234 | std::cout << "Dumping offdiag matrix." << std::endl; 235 | _offdiag->debug_dump(); 236 | } 237 | } 238 | 239 |
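The constructor above mirrors PETSc's MPIAIJ storage layout: each rank holds a diagonal block (owned rows coupled to owned columns) and an off-diagonal block (owned rows coupled to ghost columns), with the colmap translating off-diagonal column indices back to global numbering. A hedged usage sketch, assuming `ctx` is a CUDA::Context and `A` a PETSc Mat from fem::petsc::create_cuda_matrix() above:

// Sketch only, not part of the library sources.
dolfinx::la::CUDAMatrix cuda_A(ctx, A, /*page_lock_values=*/false,
                               /*use_seqaijcusparsegetarray=*/false);
// ... device-side assembly kernels write into the diag()/offdiag() values ...
cuda_A.copy_matrix_values_to_host(ctx); // only needed for host-resident Mat types
cuda_A.apply(MAT_FINAL_ASSEMBLY);       // MatAssemblyBegin/End on the wrapped Mat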
-------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAMatrix.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cuda.h> 11 | #include <petscmat.h> 12 | #include <memory> 13 | 14 | namespace dolfinx::la 15 | { 16 | 17 | class CUDASeqMatrix; 18 | 19 | /// A wrapper for a matrix in the compressed sparse row (CSR) format 20 | /// that is stored in the device memory of a CUDA device. 21 | 22 | class CUDAMatrix 23 | { 24 | public: 25 | /// Create an empty CUDA matrix 26 | CUDAMatrix(); 27 | 28 | /// Create a matrix from a PETSc Mat object 29 | /// 30 | /// @param[in] cuda_context A context for a CUDA device 31 | /// @param[in] A PETSc matrix to copy to the device 32 | /// @param[in] page_lock_values Whether or not to use page-locked 33 | /// memory for the host-side array of 34 | /// non-zero values. 35 | /// @param[in] use_seqaijcusparsegetarray Whether or not to use the 36 | /// function MatSeqAIJCUSPARSEGetArray(), which is only 37 | /// available in a custom-built version of PETSc. If it 38 | /// is set, this will avoid unnecessary copying of data 39 | /// between host and device for matrices of type 40 | /// MATSEQAIJCUSPARSE whenever a CUDA-based assembler 41 | /// is used. 42 | CUDAMatrix( 43 | const CUDA::Context& cuda_context, 44 | Mat A, 45 | bool page_lock_values, 46 | bool use_seqaijcusparsegetarray); 47 | 48 | /// Destructor 49 | ~CUDAMatrix(); 50 | 51 | /// Copy constructor 52 | /// @param[in] matrix The object to be copied 53 | CUDAMatrix(const CUDAMatrix& matrix) = delete; 54 | 55 | /// Move constructor 56 | /// @param[in] matrix The object to be moved 57 | CUDAMatrix(CUDAMatrix&& matrix); 58 | 59 | /// Assignment operator 60 | /// @param[in] matrix Another CUDAMatrix object 61 | CUDAMatrix& operator=(const CUDAMatrix& matrix) = delete; 62 | 63 | /// Move assignment operator 64 | /// @param[in] matrix Another CUDAMatrix object 65 | CUDAMatrix& operator=(CUDAMatrix&& matrix); 66 | 67 | /// Get the underlying PETSc matrix object 68 | Mat mat() { return _A; } 69 | 70 | /// Get the diagonal block of the local part of the matrix 71 | const CUDASeqMatrix * diag() const { return _diag.get(); } 72 | CUDASeqMatrix * diag() { return _diag.get(); } 73 | 74 | /// Get the off-diagonal block of the local part of the matrix 75 | const CUDASeqMatrix * offdiag() const { return _offdiag.get(); } 76 | CUDASeqMatrix * offdiag() { return _offdiag.get(); } 77 | 78 | /// Methods to get off diagonal column mapping 79 | CUdeviceptr colmap() const { return _dcolmap; } 80 | CUdeviceptr colmap_sorted() const { return _dcolmap_sorted; } 81 | CUdeviceptr colmap_sorted_indices() const { return _dcolmap_sorted_indices; } 82 | 83 | /// Get the number of matrix rows 84 | int32_t num_rows() const { return _num_rows; } 85 | 86 | /// Get the number of matrix columns 87 | int32_t num_columns() const { return _num_columns; } 88 | 89 | /// Get the global index of the first row 90 | int32_t local_row_start() const { return _local_row_start; } 91 | 92 | /// Get the global index of the last row 93 | int32_t local_row_end() const { return _local_row_end; } 94 | 95 | /// Get the number of local matrix rows 96 | int32_t num_local_rows() const { return _num_local_rows; } 97 | 98 | /// Get the number of local matrix columns 99 | int32_t num_local_columns() const { return _num_local_columns; } 100 | 101 | /// Get the number of local matrix columns in the off-diagonal part 102 | int32_t num_local_offdiag_columns() const { return _num_local_offdiag_columns; } 103 | 104 | /// Update the values of the underlying PETSc matrix by copying 105 | /// values from device memory to host memory. 106 | /// 107 | /// @param[in] cuda_context A context for a CUDA device 108 | void copy_matrix_values_to_host( 109 | const CUDA::Context& cuda_context); 110 | 111 | /// Finalize matrix assembly by calling PETSc's MatAssemblyBegin() 112 | /// and MatAssemblyEnd().
113 | /// 114 | /// @param[in] type MAT_FLUSH_ASSEMBLY or MAT_FINAL_ASSEMBLY 115 | void apply(MatAssemblyType type); 116 | 117 | void debug_dump(); 118 | 119 | private: 120 | /// Handle to the corresponding PETSc matrix object 121 | Mat _A; 122 | 123 | /// The diagonal block of the local part of the matrix 124 | std::unique_ptr<CUDASeqMatrix> _diag; 125 | 126 | /// The off-diagonal block of the local part of the matrix. 127 | /// This is only used if the matrix is distributed. 128 | std::unique_ptr<CUDASeqMatrix> _offdiag; 129 | 130 | /// Device-side mapping from columns of the local, off-diagonal 131 | /// block of the matrix to columns of the global matrix. 132 | CUdeviceptr _dcolmap; 133 | CUdeviceptr _dcolmap_sorted; 134 | CUdeviceptr _dcolmap_sorted_indices; 135 | 136 | /// The number of rows in the global matrix 137 | int32_t _num_rows; 138 | 139 | /// The number of columns in the global matrix 140 | int32_t _num_columns; 141 | 142 | /// The first row owned by the current MPI process 143 | int32_t _local_row_start; 144 | 145 | /// The last row owned by the current MPI process 146 | int32_t _local_row_end; 147 | 148 | /// The number of rows owned by the current MPI process 149 | int32_t _num_local_rows; 150 | 151 | /// The number of columns owned by the current MPI process 152 | int32_t _num_local_columns; 153 | 154 | /// The number of columns in the off-diagonal part of the local 155 | /// matrix owned by the current MPI process 156 | int32_t _num_local_offdiag_columns; 157 | }; 158 | 159 | } // namespace dolfinx::la 160 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDASeqMatrix.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cuda.h> 11 | #include <petscmat.h> 12 | 13 | namespace dolfinx::la 14 | { 15 | 16 | /// A wrapper for a matrix in the compressed sparse row (CSR) format 17 | /// that is stored in the device memory of a CUDA device. 18 | 19 | class CUDASeqMatrix 20 | { 21 | public: 22 | /// Create an empty CUDA matrix 23 | CUDASeqMatrix(); 24 | 25 | /// Create a matrix from a PETSc Mat object. Note that the Mat must 26 | /// be of type MATSEQAIJ. 27 | /// 28 | /// @param[in] cuda_context A context for a CUDA device 29 | /// @param[in] A PETSc matrix to copy to the device 30 | /// @param[in] page_lock_values Whether or not to use page-locked 31 | /// memory for the host-side array of 32 | /// non-zero values. 33 | /// @param[in] use_seqaijcusparsegetarray Whether or not to use the 34 | /// function MatSeqAIJCUSPARSEGetArray(), which is only 35 | /// available in a custom-built version of PETSc. If it 36 | /// is set, this will avoid unnecessary copying of data 37 | /// between host and device for matrices of type 38 | /// MATSEQAIJCUSPARSE whenever a CUDA-based assembler 39 | /// is used.
40 | CUDASeqMatrix( 41 | const CUDA::Context& cuda_context, 42 | Mat A, 43 | bool page_lock_values, 44 | bool use_seqaijcusparsegetarray); 45 | 46 | /// Destructor 47 | ~CUDASeqMatrix(); 48 | 49 | /// Copy constructor 50 | /// @param[in] matrix The object to be copied 51 | CUDASeqMatrix(const CUDASeqMatrix& matrix) = delete; 52 | 53 | /// Move constructor 54 | /// @param[in] matrix The object to be moved 55 | CUDASeqMatrix(CUDASeqMatrix&& matrix); 56 | 57 | /// Assignment operator 58 | /// @param[in] matrix Another CUDASeqMatrix object 59 | CUDASeqMatrix& operator=(const CUDASeqMatrix& matrix) = delete; 60 | 61 | /// Move assignment operator 62 | /// @param[in] matrix Another CUDASeqMatrix object 63 | CUDASeqMatrix& operator=(CUDASeqMatrix&& matrix); 64 | 65 | /// Get the underlying PETSc matrix object 66 | Mat mat() { return _A; } 67 | 68 | /// Get the number of matrix rows 69 | int32_t num_rows() const { return _num_rows; } 70 | 71 | /// Get the number of matrix columns 72 | int32_t num_columns() const { return _num_columns; } 73 | 74 | /// Get the global index of the first row 75 | int32_t local_row_start() const { return _local_row_start; } 76 | 77 | /// Get the global index of the last row 78 | int32_t local_row_end() const { return _local_row_end; } 79 | 80 | /// Get the number of local matrix rows 81 | int32_t num_local_rows() const { return _num_local_rows; } 82 | 83 | /// Get the number of local matrix columns 84 | int32_t num_local_columns() const { return _num_local_columns; } 85 | 86 | /// Get a handle to the device-side row pointers 87 | CUdeviceptr row_ptr() const { return _drow_ptr; } 88 | 89 | /// Get the number of local non-zeros 90 | int32_t num_local_nonzeros() const { return _num_local_nonzeros; } 91 | 92 | /// Get a handle to the device-side column indices 93 | CUdeviceptr column_indices() const { return _dcolumn_indices; } 94 | 95 | /// Get a handle to the device-side non-zero values 96 | CUdeviceptr values() const; 97 | 98 | /// Update the values of the underlying PETSc matrix by copying 99 | /// values from device memory to host memory. 100 | /// 101 | /// @param[in] cuda_context A context for a CUDA device 102 | void copy_matrix_values_to_host( 103 | const CUDA::Context& cuda_context); 104 | 105 | /// Finalize matrix assembly by calling PETSc's MatAssemblyBegin() 106 | /// and MatAssemblyEnd(). 
107 | /// 108 | /// @param[in] type MAT_FLUSH_ASSEMBLY or MAT_FINAL_ASSEMBLY 109 | void apply(MatAssemblyType type); 110 | 111 | void debug_dump(); 112 | 113 | private: 114 | /// Handle to the corresponding PETSc matrix object 115 | Mat _A; 116 | 117 | /// Whether or not the host-side array of non-zero values uses 118 | /// page-locked or pinned memory 119 | bool _values_page_locked; 120 | 121 | /// The number of rows in the global matrix 122 | int32_t _num_rows; 123 | 124 | /// The number of columns in the global matrix 125 | int32_t _num_columns; 126 | 127 | /// The first row owned by the current MPI process 128 | int32_t _local_row_start; 129 | 130 | /// The last row owned by the current MPI process 131 | int32_t _local_row_end; 132 | 133 | /// The number of rows owned by the current MPI process 134 | int32_t _num_local_rows; 135 | 136 | /// The number of columns owned by the current MPI process 137 | int32_t _num_local_columns; 138 | 139 | /// Device-side storage for row pointers 140 | CUdeviceptr _drow_ptr; 141 | 142 | /// The number of non-zeros in the global matrix 143 | int32_t _num_local_nonzeros; 144 | 145 | /// Device-side storage for column indices 146 | CUdeviceptr _dcolumn_indices; 147 | 148 | /// Device-side storage for non-zero values 149 | CUdeviceptr _dvalues; 150 | 151 | /// Whether or not the device-side pointer is owned by PETSc and 152 | /// needs to be returned when we are done, or if it was allocated 153 | /// with cuMemAlloc() and needs to be freed with cuMemFree(). 154 | /// 155 | /// For now, PETSc does not provide access to device-side non-zero 156 | /// values, even for matrices that are stored on a CUDA 157 | /// device. Consequently, `_dvalues_petsc_owned` is always false, 158 | /// and there is potentially some unnecessary copying between the 159 | /// host and device. 160 | bool _dvalues_petsc_owned; 161 | }; 162 | 163 | } // namespace dolfinx::la 164 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/CUDAVector.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include <cudolfinx/common/CUDA.h> 10 | #include <cuda.h> 11 | #include <petscvec.h> 12 | 13 | namespace dolfinx::la 14 | { 15 | 16 | /// A wrapper for a dense vector that is stored in the device memory 17 | /// of a CUDA device. 18 | 19 | class CUDAVector 20 | { 21 | public: 22 | /// Create an empty CUDA vector 23 | CUDAVector(); 24 | 25 | /// Create a vector from a PETSc Vec object 26 | /// 27 | /// @param[in] cuda_context A context for a CUDA device 28 | /// @param[in] x PETSc vector to copy to the device 29 | /// @param[in] page_lock_values Whether or not to use page-locked 30 | /// memory for the host-side array of 31 | /// values.
32 | CUDAVector( 33 | const CUDA::Context& cuda_context, 34 | Vec x, 35 | bool page_lock_values = true, 36 | bool include_ghosts = true); 37 | 38 | 39 | /*template 40 | CUDAVector(const CUDA::Context& cuda_context, std::shared_ptr> x) 41 | : CUDAVector(cuda_context, la::petsc::create_vector_wrap(*x)) 42 | { 43 | }*/ 44 | 45 | /// Destructor 46 | ~CUDAVector(); 47 | 48 | /// Copy constructor 49 | /// @param[in] vector The object to be copied 50 | CUDAVector(const CUDAVector& vector) = delete; 51 | 52 | /// Move constructor 53 | /// @param[in] vector The object to be moved 54 | CUDAVector(CUDAVector&& vector); 55 | 56 | /// Assignment operator 57 | /// @param[in] vector Another CUDAVector object 58 | CUDAVector& operator=(const CUDAVector& vector) = delete; 59 | 60 | /// Move assignment operator 61 | /// @param[in] vector Another CUDAVector object 62 | CUDAVector& operator=(CUDAVector&& vector); 63 | 64 | /// Get a handle to the underlying PETSc vector object 65 | const Vec vector() const { return _x; } 66 | 67 | /// Get the number of vector values 68 | int32_t num_values() const { return _num_values; } 69 | 70 | /// Get the number of local vector values 71 | int32_t num_local_values() const { return _num_local_values; } 72 | 73 | /// Get the number of local vector values 74 | int32_t num_local_ghosted_values() const { return _num_local_ghosted_values; } 75 | 76 | bool ghosted() const; 77 | 78 | /// Get a handle to the device-side non-zero values 79 | CUdeviceptr values() const; 80 | 81 | /// Return a handle to the device-side non-zero values 82 | void restore_values() const; 83 | 84 | /// Get a handle to the device-side non-zero values 85 | CUdeviceptr values_write() const; 86 | 87 | /// Return a handle to the device-side non-zero values 88 | void restore_values_write() const; 89 | 90 | /// Update the device-side vector values from the underlying PETSc 91 | /// vector. If the PETSc vector resides in host memory, then the 92 | /// values are copied from host memory to device memory. This does 93 | /// nothing if the PETSc vector is already held in device memory. 94 | /// 95 | /// @param[in] cuda_context A context for a CUDA device 96 | void copy_vector_values_to_device( 97 | const CUDA::Context& cuda_context); 98 | 99 | /// Update the values of the underlying PETSc vector. If the PETSc 100 | /// vector resides in host memory, then the values are copied from 101 | /// device memory to host memory. This does nothing if the PETSc 102 | /// vector is already held in device memory. 103 | /// 104 | /// @param[in] cuda_context A context for a CUDA device 105 | void copy_vector_values_to_host( 106 | const CUDA::Context& cuda_context); 107 | 108 | /// Update the device-side values of ghost nodes from the underlying 109 | /// PETSc vector. If the PETSc vector resides in host memory, then 110 | /// values are copied from host memory to device memory. This does 111 | /// nothing if the PETSc vector is already held in device memory. 112 | /// 113 | /// @param[in] cuda_context A context for a CUDA device 114 | void copy_ghost_values_to_device( 115 | const CUDA::Context& cuda_context); 116 | 117 | /// Update the values of ghost nodes of the underlying PETSc vector. 118 | /// If the PETSc vector resides in host memory, then ghost values 119 | /// are copied from device memory to host memory. This does nothing 120 | /// if the PETSc vector is already held in device memory. 
121 | /// 122 | /// @param[in] cuda_context A context for a CUDA device 123 | void copy_ghost_values_to_host( 124 | const CUDA::Context& cuda_context); 125 | 126 | /// Update vector entries that are owned by this process, but are 127 | /// represented as ghost values on other processes. 128 | void apply_ghosts( 129 | const CUDA::Context& cuda_context); 130 | 131 | /// Update vector entries corresponding to ghost values, meaning 132 | /// that ghost values are gathered from other processes that own 133 | /// them. 134 | bool update_ghosts( 135 | const CUDA::Context& cuda_context); 136 | 137 | private: 138 | /// Handle to the corresponding PETSc vector object 139 | Vec _x; 140 | 141 | /// Handle to the corresponding local PETSc vector object, if the 142 | /// vector is distributed. 143 | Vec _x_local; 144 | 145 | /// Whether or not the host-side array of values uses page-locked 146 | /// (pinned) memory 147 | bool _values_page_locked; 148 | 149 | bool _include_ghosts; ///< Whether ghost values are included in the device-side storage 150 | 151 | /// The number of values in the global vector 152 | int32_t _num_values; 153 | 154 | /// The number of values owned by the current MPI rank 155 | int32_t _num_local_values; 156 | 157 | /// The number of values on the current MPI rank, including ghosts 158 | int32_t _num_local_ghosted_values; 159 | 160 | /// The first value owned by the current MPI rank 161 | int32_t _local_values_start; 162 | 163 | /// The last value owned by the current MPI rank 164 | int32_t _local_values_end; 165 | 166 | /// Device-side storage for non-zero values 167 | mutable CUdeviceptr _dvalues; 168 | 169 | /// Whether or not the device-side pointer is owned by PETSc and 170 | /// needs to be returned when we are done, or if it was allocated 171 | /// with cuMemAlloc() and needs to be freed with cuMemFree(). 172 | bool _dvalues_petsc_owned; 173 | 174 | public: 175 | bool debug; 176 | }; 177 | 178 | } // namespace dolfinx::la 179 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/petsc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | Mat la::petsc::create_cuda_matrix(MPI_Comm comm, const SparsityPattern& sp) 15 | { 16 | PetscErrorCode ierr; 17 | Mat A; 18 | 19 | // Get IndexMaps from sparsity pattern, and block size 20 | std::array maps = {sp.index_map(0), sp.index_map(1)}; 21 | const std::array bs = {sp.block_size(0), sp.block_size(1)}; 22 | dolfinx::common::IndexMap col_map = sp.column_index_map(); 23 | 24 | // Get global and local dimensions 25 | const std::int64_t M = bs[0] * maps[0]->size_global(); 26 | const std::int64_t N = bs[1] * maps[1]->size_global(); 27 | const std::int32_t m = bs[0] * maps[0]->size_local(); 28 | const std::int32_t n = bs[1] * maps[1]->size_local(); 29 | 30 | // Build data to initialise sparsity pattern (modify for block size) 31 | std::vector<PetscInt> _row_ptr; 32 | // Need to ensure correct int type (PetscInt)
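// Worked example of the unrolling performed below (an illustrative sketch
// with made-up numbers, not data from an actual mesh): for block size
// bs = {2, 2} and a single blocked row whose column blocks are {0, 2}, the
// blocked graph data offsets = {0, 2}, edges = {0, 2} expands to the scalar
// CSR arrays
//   _row_ptr = {0, 4, 8}
//   _column_indices = {0, 1, 4, 5, 0, 1, 4, 5},
// i.e. each blocked column j becomes columns bs[1]*j .. bs[1]*j + bs[1] - 1,
// and each blocked row is repeated bs[0] times.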
33 | std::vector<PetscInt> _column_indices; 34 | auto [_edges, _offsets] = sp.graph(); 35 | 36 | // The CUDA assembly kernels aren't currently robust to matrices with variable block size, 37 | // so for now we always unroll to block size 1 38 | _row_ptr.resize(m+1); 39 | _row_ptr[0] = 0; 40 | _column_indices.resize(_edges.size()*bs[0]*bs[1]); 41 | // index indicating where we are in _edges 42 | std::size_t edge_index = 0; 43 | std::size_t unrolled_edge_index = 0; 44 | // Iterate over (blocked) rows 45 | for (std::size_t row = 0; row < maps[0]->size_local(); row++) { 46 | // TODO test with differing block sizes to ensure this is still valid 47 | PetscInt row_nnz = _offsets[row+1] - _offsets[row]; 48 | PetscInt unrolled_row_nnz = row_nnz * bs[1]; 49 | 50 | // row ptr 51 | for (std::size_t unrolled_row = bs[0]*row; unrolled_row < bs[0]*(row+1); unrolled_row++) 52 | _row_ptr[unrolled_row+1] = _row_ptr[unrolled_row] + unrolled_row_nnz; 53 | 54 | for (std::size_t j = 0; j < row_nnz; j++) { 55 | for (std::size_t k = 0; k < bs[1]; k++) 56 | _column_indices[unrolled_edge_index + j*bs[1] + k] = bs[1]*_edges[edge_index+j] + k; 57 | } 58 | // Unroll row block 59 | for (std::size_t l = 1; l < bs[0]; l++) 60 | std::copy_n(std::next(_column_indices.begin(), unrolled_edge_index), unrolled_row_nnz, 61 | std::next(_column_indices.begin(), unrolled_edge_index + l*unrolled_row_nnz)); 62 | 63 | edge_index += row_nnz; 64 | unrolled_edge_index += bs[0] * unrolled_row_nnz; 65 | } 66 | 67 | // convert local column indices to global ones (unrolling blocked indices) 68 | std::vector<PetscInt> global_column_indices(_column_indices.size()); 69 | auto col_local_size = bs[1]*col_map.size_local(); 70 | auto col_ghosts = col_map.ghosts(); 71 | auto col_local_range = bs[1]*col_map.local_range()[0]; 72 | for (std::size_t i = 0; i < _column_indices.size(); i++) { 73 | 74 | if (_column_indices[i] < col_local_size) 75 | global_column_indices[i] = _column_indices[i] + col_local_range; 76 | else { 77 | int diff = _column_indices[i] - col_local_size; 78 | global_column_indices[i] = bs[1] * col_ghosts[diff / bs[1]] + diff % bs[1]; 79 | } 80 | } 81 | MatCreateMPIAIJWithArrays(comm, m, n, M, N, _row_ptr.data(), global_column_indices.data(), nullptr, &A); 82 | // Change matrix type to CUDA 83 | ierr = MatSetType(A, MATMPIAIJCUSPARSE); 84 | if (ierr != 0) 85 | petsc::error(ierr, __FILE__, "MatSetType"); 86 | 87 | // Set block sizes 88 | ierr = MatSetBlockSizes(A, 1, 1); 89 | if (ierr != 0) 90 | petsc::error(ierr, __FILE__, "MatSetBlockSizes"); 91 | 92 | // Create PETSc local-to-global map/index sets 93 | ISLocalToGlobalMapping local_to_global0; 94 | // create unrolled global indices 95 | const std::vector map0 = maps[0]->global_indices(); 96 | std::vector<PetscInt> _map0; 97 | _map0.resize(map0.size() * bs[0]); 98 | for (size_t i = 0; i < map0.size(); i++) 99 | for (size_t j = 0; j < bs[0]; j++) 100 | _map0[i*bs[0] + j] = map0[i]*bs[0] + j; 101 | //const std::vector _map0(map0.begin(), map0.end()); 102 | ierr = ISLocalToGlobalMappingCreate(MPI_COMM_SELF, 1, _map0.size(), 103 | _map0.data(), PETSC_COPY_VALUES, 104 | &local_to_global0); 105 | 106 | if (ierr != 0) 107 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingCreate"); 108 | 109 | // Check for common index maps 110 | if (maps[0] == maps[1] and bs[0] == bs[1]) 111 | { 112 | ierr = MatSetLocalToGlobalMapping(A, local_to_global0, local_to_global0); 113 | if (ierr != 0) 114 | petsc::error(ierr, __FILE__, "MatSetLocalToGlobalMapping"); 115 | } 116 | else 117 | { 118 | ISLocalToGlobalMapping local_to_global1;
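// As with _map0 above, blocked indices are unrolled here; e.g. (illustrative
// numbers only) bs[1] = 2 and blocked global indices map1 = {5, 7} yield the
// unrolled map _map1 = {10, 11, 14, 15}.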
119 | const std::vector map1 = maps[1]->global_indices(); 120 | std::vector<PetscInt> _map1; 121 | _map1.resize(map1.size() * bs[1]); 122 | for (size_t i = 0; i < map1.size(); i++) 123 | for (size_t j = 0; j < bs[1]; j++) 124 | _map1[i*bs[1] + j] = map1[i]*bs[1] + j; 125 | //const std::vector _map1(map1.begin(), map1.end()); 126 | ierr = ISLocalToGlobalMappingCreate(MPI_COMM_SELF, 1, _map1.size(), 127 | _map1.data(), PETSC_COPY_VALUES, 128 | &local_to_global1); 129 | if (ierr != 0) 130 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingCreate"); 131 | ierr = MatSetLocalToGlobalMapping(A, local_to_global0, local_to_global1); 132 | if (ierr != 0) 133 | petsc::error(ierr, __FILE__, "MatSetLocalToGlobalMapping"); 134 | ierr = ISLocalToGlobalMappingDestroy(&local_to_global1); 135 | if (ierr != 0) 136 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingDestroy"); 137 | } 138 | 139 | // Clean up local-to-global 0 140 | ierr = ISLocalToGlobalMappingDestroy(&local_to_global0); 141 | if (ierr != 0) 142 | petsc::error(ierr, __FILE__, "ISLocalToGlobalMappingDestroy"); 143 | 144 | // Set some options on Mat object 145 | ierr = MatSetOption(A, MAT_NEW_NONZERO_ALLOCATION_ERR, PETSC_TRUE); 146 | if (ierr != 0) 147 | petsc::error(ierr, __FILE__, "MatSetOption"); 148 | ierr = MatSetOption(A, MAT_KEEP_NONZERO_PATTERN, PETSC_TRUE); 149 | if (ierr != 0) 150 | petsc::error(ierr, __FILE__, "MatSetOption"); 151 | 152 | return A; 153 | } 154 | 155 | -------------------------------------------------------------------------------- /cpp/cudolfinx/la/petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace dolfinx::la 16 | { 17 | 18 | namespace petsc 19 | { 20 | 21 | Mat create_cuda_matrix(MPI_Comm comm, const SparsityPattern& sp); ///< Create a CUDA (MATMPIAIJCUSPARSE) PETSc matrix from a sparsity pattern 22 | 23 | } // namespace petsc 24 | } // namespace dolfinx::la 25 | -------------------------------------------------------------------------------- /cpp/cudolfinx/mesh/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(HEADERS_mesh 2 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMesh.h 3 | ${CMAKE_CURRENT_SOURCE_DIR}/CUDAMeshEntities.h 4 | ${CMAKE_CURRENT_SOURCE_DIR}/util.h 5 | PARENT_SCOPE 6 | ) 7 | 8 | target_sources( 9 | cudolfinx 10 | PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/util.cpp 11 | ) 12 | 13 | -------------------------------------------------------------------------------- /cpp/cudolfinx/mesh/CUDAMesh.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | namespace dolfinx { 16 | namespace mesh { 17 | 18 | /// A wrapper for mesh data that is stored in the device memory of a 19 | /// CUDA device.
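/// As an illustrative aside (sizes inferred from the constructor below,
/// assuming an affine P1 triangle geometry): a mesh with 4 vertices and
/// 2 triangular cells stores 4*3 doubles of vertex coordinates, 2*3 int32
/// cell vertex indices and 2 uint32 cell permutations in device memory.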
20 | template <typename T> 21 | class CUDAMesh 22 | { 23 | public: 24 | /// Create an empty mesh 25 | CUDAMesh() 26 | : _tdim() 27 | , _num_vertices() 28 | , _num_coordinates_per_vertex() 29 | , _dvertex_coordinates(0) 30 | , _num_cells() 31 | , _num_vertices_per_cell() 32 | , _dvertex_indices_per_cell(0) 33 | , _dcell_permutations(0) 34 | , _mesh_entities() 35 | { 36 | } 37 | //----------------------------------------------------------------------------- 38 | /// Create a mesh 39 | /// 40 | /// @param[in] cuda_context A context for a CUDA device 41 | /// @param[in] mesh Data structures for mesh topology and geometry 42 | CUDAMesh(const CUDA::Context& cuda_context, const dolfinx::mesh::Mesh<T>& mesh) 43 | { 44 | CUresult cuda_err; 45 | const char * cuda_err_description; 46 | 47 | _tdim = mesh.topology()->dim(); 48 | 49 | // Allocate device-side storage for vertex coordinates 50 | auto vertex_coordinates = mesh.geometry().x(); 51 | _num_vertices = vertex_coordinates.size() / 3; 52 | // TODO figure out how to handle this properly 53 | // FEniCSx has a dimension of 3 during assembly, but returns a 54 | // different value for the dim of mesh.geometry 55 | _num_coordinates_per_vertex = 3; 56 | //_num_coordinates_per_vertex = mesh.geometry().dim(); 57 | if (_num_vertices > 0 && _num_coordinates_per_vertex > 0) { 58 | if (_num_coordinates_per_vertex > 3) { 59 | throw std::runtime_error( 60 | "Expected at most 3 coordinates per vertex " 61 | "instead of " + std::to_string(_num_coordinates_per_vertex) + " " 62 | "at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 63 | } 64 | 65 | size_t dvertex_coordinates_size = 66 | _num_vertices * 3 * sizeof(double); 67 | cuda_err = cuMemAlloc( 68 | &_dvertex_coordinates, 69 | dvertex_coordinates_size); 70 | if (cuda_err != CUDA_SUCCESS) { 71 | cuGetErrorString(cuda_err, &cuda_err_description); 72 | throw std::runtime_error( 73 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 74 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 75 | } 76 | 77 | // Copy vertex coordinates to device 78 | cuda_err = cuMemcpyHtoD( 79 | _dvertex_coordinates, 80 | vertex_coordinates.data(), 81 | dvertex_coordinates_size); 82 | if (cuda_err != CUDA_SUCCESS) { 83 | cuMemFree(_dvertex_coordinates); 84 | cuGetErrorString(cuda_err, &cuda_err_description); 85 | throw std::runtime_error( 86 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 87 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 88 | } 89 | } 90 | 91 | // Obtain mesh geometry 92 | auto x_dofmap = 93 | mesh.geometry().dofmap(); 94 | 95 | // Allocate device-side storage for cell vertex indices 96 | _num_cells = x_dofmap.extent(0); 97 | _num_vertices_per_cell = x_dofmap.extent(1); 98 | if (_num_cells > 0 && _num_vertices_per_cell > 0) { 99 | size_t dvertex_indices_per_cell_size = 100 | _num_cells * _num_vertices_per_cell * sizeof(int32_t); 101 | cuda_err = cuMemAlloc( 102 | &_dvertex_indices_per_cell, 103 | dvertex_indices_per_cell_size); 104 | if (cuda_err != CUDA_SUCCESS) { 105 | cuMemFree(_dvertex_coordinates); 106 | cuGetErrorString(cuda_err, &cuda_err_description); 107 | throw std::runtime_error( 108 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 109 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 110 | } 111 | 112 | // Copy cell vertex indices to device 113 | cuda_err = cuMemcpyHtoD( 114 | _dvertex_indices_per_cell, 115 | x_dofmap.data_handle(), 116 | dvertex_indices_per_cell_size); 117 | if (cuda_err !=
CUDA_SUCCESS) { 118 | cuMemFree(_dvertex_indices_per_cell); 119 | cuMemFree(_dvertex_coordinates); 120 | cuGetErrorString(cuda_err, &cuda_err_description); 121 | throw std::runtime_error( 122 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 123 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 124 | } 125 | } 126 | 127 | // Obtain cell permutations 128 | mesh.topology_mutable()->create_entity_permutations(); 129 | auto cell_permutations = mesh.topology()->get_cell_permutation_info(); 130 | 131 | // Allocate device-side storage for cell permutations 132 | if (_num_cells > 0) { 133 | size_t dcell_permutations_size = 134 | _num_cells * sizeof(uint32_t); 135 | cuda_err = cuMemAlloc( 136 | &_dcell_permutations, 137 | dcell_permutations_size); 138 | if (cuda_err != CUDA_SUCCESS) { 139 | cuMemFree(_dvertex_indices_per_cell); 140 | cuMemFree(_dvertex_coordinates); 141 | cuGetErrorString(cuda_err, &cuda_err_description); 142 | throw std::runtime_error( 143 | "cuMemAlloc() failed with " + std::string(cuda_err_description) + 144 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 145 | } 146 | 147 | // Copy cell permutations to device 148 | cuda_err = cuMemcpyHtoD( 149 | _dcell_permutations, 150 | cell_permutations.data(), 151 | dcell_permutations_size); 152 | if (cuda_err != CUDA_SUCCESS) { 153 | cuMemFree(_dcell_permutations); 154 | cuGetErrorString(cuda_err, &cuda_err_description); 155 | throw std::runtime_error( 156 | "cuMemcpyHtoD() failed with " + std::string(cuda_err_description) + 157 | " at " + std::string(__FILE__) + ":" + std::to_string(__LINE__)); 158 | } 159 | } 160 | 161 | for (int dim = 0; dim < _tdim; dim++) { 162 | _mesh_entities.emplace_back( 163 | cuda_context, mesh, dim); 164 | } 165 | } 166 | //----------------------------------------------------------------------------- 167 | /// Destructor 168 | ~CUDAMesh() 169 | { 170 | if (_dcell_permutations) 171 | cuMemFree(_dcell_permutations); 172 | if (_dvertex_indices_per_cell) 173 | cuMemFree(_dvertex_indices_per_cell); 174 | if (_dvertex_coordinates) 175 | cuMemFree(_dvertex_coordinates); 176 | } 177 | //----------------------------------------------------------------------------- 178 | /// Copy constructor 179 | /// @param[in] mesh The object to be copied 180 | CUDAMesh(const CUDAMesh& mesh) = delete; 181 | 182 | /// Move constructor 183 | /// @param[in] mesh The object to be moved 184 | CUDAMesh(CUDAMesh&& mesh) 185 | : _tdim(mesh._tdim) 186 | , _num_vertices(mesh._num_vertices) 187 | , _num_coordinates_per_vertex(mesh._num_coordinates_per_vertex) 188 | , _dvertex_coordinates(mesh._dvertex_coordinates) 189 | , _num_cells(mesh._num_cells) 190 | , _num_vertices_per_cell(mesh._num_vertices_per_cell) 191 | , _dvertex_indices_per_cell(mesh._dvertex_indices_per_cell) 192 | , _dcell_permutations(mesh._dcell_permutations) 193 | , _mesh_entities(std::move(mesh._mesh_entities)) 194 | { 195 | mesh._tdim = 0; 196 | mesh._num_vertices = 0; 197 | mesh._num_coordinates_per_vertex = 0; 198 | mesh._dvertex_coordinates = 0; 199 | mesh._num_cells = 0; 200 | mesh._num_vertices_per_cell = 0; 201 | mesh._dvertex_indices_per_cell = 0; 202 | mesh._dcell_permutations = 0; 203 | } 204 | //----------------------------------------------------------------------------- 205 | /// Assignment operator 206 | /// @param[in] mesh Another CUDAMesh object 207 | CUDAMesh& operator=(const CUDAMesh& mesh) = delete; 208 | 209 | /// Move assignment operator 210 | /// @param[in] mesh Another CUDAMesh object 211 | 
CUDAMesh& operator=(CUDAMesh&& mesh) 212 | { 213 | _tdim = mesh._tdim; 214 | _num_vertices = mesh._num_vertices; 215 | _num_coordinates_per_vertex = mesh._num_coordinates_per_vertex; 216 | _dvertex_coordinates = mesh._dvertex_coordinates; 217 | _num_cells = mesh._num_cells; 218 | _num_vertices_per_cell = mesh._num_vertices_per_cell; 219 | _dvertex_indices_per_cell = mesh._dvertex_indices_per_cell; 220 | _dcell_permutations = mesh._dcell_permutations; 221 | _mesh_entities = std::move(mesh._mesh_entities); 222 | mesh._tdim = 0; 223 | mesh._num_vertices = 0; 224 | mesh._num_coordinates_per_vertex = 0; 225 | mesh._dvertex_coordinates = 0; 226 | mesh._num_cells = 0; 227 | mesh._num_vertices_per_cell = 0; 228 | mesh._dvertex_indices_per_cell = 0; 229 | mesh._dcell_permutations = 0; 230 | return *this; 231 | } 232 | //----------------------------------------------------------------------------- 233 | 234 | 235 | /// Get the topological dimension of the mesh 236 | int32_t tdim() const { return _tdim; } 237 | 238 | /// Get the number of vertices 239 | int32_t num_vertices() const { return _num_vertices; } 240 | 241 | /// Get the number of coordinates per vertex 242 | int32_t num_coordinates_per_vertex() const { 243 | return _num_coordinates_per_vertex; } 244 | 245 | /// Get a handle to the device-side vertex coordinates 246 | CUdeviceptr vertex_coordinates() const { 247 | return _dvertex_coordinates; } 248 | 249 | /// Get the number of cells 250 | int32_t num_cells() const { return _num_cells; } 251 | 252 | /// Get the number of vertices per cell 253 | int32_t num_vertices_per_cell() const { 254 | return _num_vertices_per_cell; } 255 | 256 | /// Get a handle to the device-side cell vertex indices 257 | CUdeviceptr vertex_indices_per_cell() const { 258 | return _dvertex_indices_per_cell; } 259 | 260 | /// Get a handle to the device-side cell permutations 261 | CUdeviceptr cell_permutations() const { 262 | return _dcell_permutations; } 263 | 264 | /// Get the mesh entities of each dimension 265 | const std::vector<CUDAMeshEntities<T>>& mesh_entities() const { 266 | return _mesh_entities; } 267 | 268 | private: 269 | /// The topological dimension of the mesh, or the largest dimension 270 | /// of any of the mesh entities 271 | int32_t _tdim; 272 | 273 | /// The number of vertices in the mesh 274 | int32_t _num_vertices; 275 | 276 | /// The number of coordinates for each vertex 277 | int32_t _num_coordinates_per_vertex; 278 | 279 | /// The coordinates of the mesh vertices 280 | CUdeviceptr _dvertex_coordinates; 281 | 282 | /// The number of cells in the mesh 283 | int32_t _num_cells; 284 | 285 | /// The number of vertices in each cell 286 | int32_t _num_vertices_per_cell; 287 | 288 | /// The vertex indices of each cell 289 | CUdeviceptr _dvertex_indices_per_cell; 290 | 291 | /// Cell permutations 292 | CUdeviceptr _dcell_permutations; 293 | 294 | /// The mesh entities of each dimension 295 | std::vector<CUDAMeshEntities<T>> _mesh_entities; 296 | }; 297 | 298 | } // namespace mesh 299 | } // namespace dolfinx 300 | 301 | -------------------------------------------------------------------------------- /cpp/cudolfinx/mesh/util.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | using namespace dolfinx; 7 | 8 | std::vector<std::int32_t> dolfinx::mesh::ghost_exterior_facet_indices(std::shared_ptr<mesh::Topology> topology) 9 | { 10 | const int tdim = topology->dim(); 11 | auto f_to_c = topology->connectivity(tdim - 1, tdim); 12 | auto f_to_v = topology->connectivity(tdim-1, 0);
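// Sketch of the selection rule implemented below (the rationale is an
// inference, not a comment from the original source): a ghost facet with
// exactly one attached cell is kept only if at least one of its vertices
// is locally owned; a ghost facet whose vertices are all ghosts is skipped.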
13 | if (!f_to_c) { 14 | topology->create_connectivity(tdim-1, tdim); 15 | f_to_c = topology->connectivity(tdim-1, tdim); 16 | } 17 | if (!f_to_v) { 18 | topology->create_connectivity(tdim-1, 0); 19 | f_to_v = topology->connectivity(tdim-1, 0); 20 | } 21 | // Find all ghost facets with only one attached cell 22 | auto facet_map = topology->index_map(tdim - 1); 23 | const int num_local_facets = facet_map->size_local(); 24 | const int num_ghost_facets = facet_map->num_ghosts(); 25 | const int num_local_vertices = topology->index_map(0)->size_local(); 26 | std::vector<std::int32_t> facets; 27 | for (std::int32_t f = num_local_facets; f < num_local_facets+num_ghost_facets; ++f) 28 | { 29 | if (f_to_c->num_links(f) == 1) { 30 | // check to make sure at least one facet vertex is owned 31 | // otherwise this is not needed 32 | auto vertices = f_to_v->links(f); 33 | bool has_owned_vertex = false; 34 | for (int i = 0; i < vertices.size(); i++) { 35 | if (vertices[i] < num_local_vertices) has_owned_vertex = true; 36 | } 37 | if (has_owned_vertex) facets.push_back(f); 38 | } 39 | } 40 | // Remove facets on internal inter-process boundary 41 | std::vector<std::int32_t> ext_facets; 42 | std::ranges::set_difference(facets, topology->interprocess_facets(), 43 | std::back_inserter(ext_facets)); 44 | return ext_facets; 45 | } 46 | 47 | std::vector<std::int32_t> dolfinx::mesh::ghost_entities( 48 | dolfinx::fem::IntegralType integral_type, 49 | std::shared_ptr<mesh::Topology> topology) 50 | { 51 | std::vector<std::int32_t> ghost_entities; 52 | int tdim = topology->dim(); 53 | switch (integral_type) { 54 | case fem::IntegralType::cell: 55 | { 56 | auto cell_index_map = topology->index_map(tdim); 57 | int num_ghost_cells = cell_index_map->num_ghosts(); 58 | int num_owned_cells = cell_index_map->size_local(); 59 | ghost_entities.resize(num_ghost_cells); 60 | std::iota(ghost_entities.begin(), ghost_entities.end(), num_owned_cells); 61 | } 62 | break; 63 | case fem::IntegralType::exterior_facet: 64 | { 65 | auto ghost_exterior_facets = dolfinx::mesh::ghost_exterior_facet_indices(topology); 66 | ghost_entities.reserve(2*ghost_exterior_facets.size()); 67 | auto c_to_f = topology->connectivity(tdim, tdim-1); 68 | auto f_to_c = topology->connectivity(tdim-1, tdim); 69 | for (std::int32_t f : ghost_exterior_facets) { 70 | auto pair = 71 | dolfinx::fem::impl::get_cell_facet_pairs<1>(f, f_to_c->links(f), *c_to_f); 72 | ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end()); 73 | } 74 | } 75 | break; 76 | case fem::IntegralType::interior_facet: 77 | { 78 | auto c_to_f = topology->connectivity(tdim, tdim-1); 79 | auto f_to_c = topology->connectivity(tdim-1, tdim); 80 | auto facet_map = topology->index_map(tdim-1); 81 | int num_local_facets = facet_map->size_local(); 82 | int total_facets = num_local_facets + facet_map->num_ghosts(); 83 | for (int f = num_local_facets; f < total_facets; f++) { 84 | if (f_to_c->num_links(f) == 2) { 85 | auto pairs = 86 | fem::impl::get_cell_facet_pairs<2>(f, f_to_c->links(f), *c_to_f); 87 | ghost_entities.insert(ghost_entities.end(), pairs.begin(), pairs.end()); 88 | } 89 | } 90 | } 91 | default: 92 | break; 93 | } 94 | return ghost_entities; 95 | } 96 | 97 | std::vector<std::int32_t> dolfinx::mesh::active_ghost_entities( 98 | std::span<const std::int32_t> active_local_entities, 99 | fem::IntegralType integral_type, 100 | std::shared_ptr<mesh::Topology> topology) 101 | { 102 | std::vector<std::int32_t> ghost_entities; 103 | MPI_Comm comm = topology->comm(); 104 | int rank = dolfinx::MPI::rank(comm); 105 | int tdim = topology->dim(); 106 | int ent_dim = (integral_type == fem::IntegralType::cell) ?
tdim : tdim-1; 107 | // Step 1: determine the active entities which are ghosted on other processes 108 | std::map<int, std::vector<std::int32_t>> dest_entities; 109 | auto imap = topology->index_map(ent_dim); 110 | int num_local_entities = imap->size_local(); 111 | auto entity_ranks = imap->index_to_dest_ranks(); 112 | int facet_increment = (integral_type == fem::IntegralType::interior_facet) ? 4 : 2; 113 | switch (integral_type) { 114 | case fem::IntegralType::cell: 115 | if (rank == 0) std::cout << "cell" << std::endl; 116 | for (auto cell : active_local_entities) { 117 | if (cell >= entity_ranks.num_nodes()) continue; 118 | for (auto& r : entity_ranks.links(cell)) { 119 | if (dest_entities.find(r) == dest_entities.end()) { 120 | dest_entities[r] = {cell}; 121 | } 122 | else dest_entities[r].push_back(cell); 123 | } 124 | } 125 | break; 126 | case fem::IntegralType::interior_facet: 127 | case fem::IntegralType::exterior_facet: { 128 | auto c_to_f = topology->connectivity(tdim, tdim-1); 129 | if (!c_to_f) { 130 | topology->create_connectivity(tdim, tdim-1); 131 | c_to_f = topology->connectivity(tdim, tdim-1); 132 | } 133 | for (int i = 0; i < active_local_entities.size(); i += facet_increment) { 134 | auto cell = active_local_entities[i]; 135 | auto facet_index = active_local_entities[i+1]; 136 | auto facet = c_to_f->links(cell)[facet_index]; 137 | if (facet >= entity_ranks.num_nodes()) continue; 138 | for (auto& r : entity_ranks.links(facet)) { 139 | if (dest_entities.find(r) == dest_entities.end()) { 140 | dest_entities[r] = {facet}; 141 | } 142 | else dest_entities[r].push_back(facet); 143 | } 144 | } 145 | } 146 | default: 147 | break; 148 | } 149 | 150 | // Step 2: send those entities to the other processes 151 | std::vector<std::int64_t> indices_send_buffer; 152 | // construct list of destination MPI ranks 153 | std::vector<int> dest; 154 | std::vector<int> send_sizes; 155 | for (const auto& pair : dest_entities) { 156 | dest.push_back(pair.first); 157 | std::size_t num_inds = pair.second.size(); 158 | send_sizes.push_back(num_inds); 159 | std::vector<std::int64_t> global_inds(num_inds); 160 | imap->local_to_global(pair.second, global_inds); 161 | for (const auto& i : global_inds) 162 | indices_send_buffer.push_back(i); 163 | } 164 | // get source ranks 165 | std::vector src = dolfinx::MPI::compute_graph_edges_nbx(comm, dest); 166 | // Create neighbor communicator 167 | MPI_Comm neigh_comm; 168 | int ierr = MPI_Dist_graph_create_adjacent( 169 | comm, src.size(), src.data(), MPI_UNWEIGHTED, dest.size(), 170 | dest.data(), MPI_UNWEIGHTED, MPI_INFO_NULL, false, &neigh_comm); 171 | dolfinx::MPI::check_error(comm, ierr); 172 | // Share lengths of indices to be sent to each rank 173 | std::vector<int> recv_sizes(src.size(), 0); 174 | ierr = MPI_Neighbor_alltoall(send_sizes.data(), 1, MPI_INT, 175 | recv_sizes.data(), 1, MPI_INT, neigh_comm); 176 | dolfinx::MPI::check_error(comm, ierr); 177 | // Prepare displacement arrays 178 | std::vector<int> send_disp(dest.size() + 1, 0); 179 | std::vector<int> recv_disp(src.size() + 1, 0); 180 | std::partial_sum(send_sizes.begin(), send_sizes.end(), 181 | std::next(send_disp.begin())); 182 | std::partial_sum(recv_sizes.begin(), recv_sizes.end(), 183 | std::next(recv_disp.begin())); 184 | // next steps - construct recv buffers and perform communication 185 | std::size_t recv_buf_size = recv_disp.back(); 186 | // make sure that the buffer pointers actually get allocated 187 | std::vector<std::int64_t> indices_recv_buffer(recv_buf_size); 188 | ierr = MPI_Neighbor_alltoallv(indices_send_buffer.data(), send_sizes.data(), 189 | send_disp.data(), MPI_INT64_T, 190 | indices_recv_buffer.data(), recv_sizes.data(), 191 |
recv_disp.data(), MPI_INT64_T, neigh_comm); 192 | dolfinx::MPI::check_error(comm, ierr); 193 | // Step 3: Convert from global to local indices and do entity computation 194 | std::vector<std::int32_t> local_recv_indices(indices_recv_buffer.size()); 195 | imap->global_to_local(indices_recv_buffer, local_recv_indices); 196 | 197 | switch (integral_type) { 198 | case fem::IntegralType::cell: 199 | return local_recv_indices; 200 | break; 201 | case fem::IntegralType::exterior_facet: { 202 | // Remove facets on internal inter-process boundary 203 | std::vector<std::int32_t> ext_facets; 204 | std::sort(local_recv_indices.begin(), local_recv_indices.end()); 205 | std::ranges::set_difference(local_recv_indices, topology->interprocess_facets(), 206 | std::back_inserter(ext_facets)); 207 | auto c_to_f = topology->connectivity(tdim, tdim-1); 208 | auto f_to_c = topology->connectivity(tdim-1, tdim); 209 | ghost_entities.reserve(2*ext_facets.size()); 210 | for (auto& facet : ext_facets) { 211 | if (f_to_c->num_links(facet) == 1) { 212 | auto pair = 213 | dolfinx::fem::impl::get_cell_facet_pairs<1>(facet, f_to_c->links(facet), *c_to_f); 214 | ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end()); 215 | } 216 | } 217 | break; 218 | } 219 | case fem::IntegralType::interior_facet: { 220 | auto c_to_f = topology->connectivity(tdim, tdim-1); 221 | auto f_to_c = topology->connectivity(tdim-1, tdim); 222 | for (auto& facet : local_recv_indices) { 223 | if (f_to_c->num_links(facet) == 2) { 224 | auto pair = 225 | dolfinx::fem::impl::get_cell_facet_pairs<2>(facet, f_to_c->links(facet), *c_to_f); 226 | ghost_entities.insert(ghost_entities.end(), pair.begin(), pair.end()); 227 | } 228 | } 229 | } 230 | default: 231 | break; 232 | } 233 | 234 | return ghost_entities; 235 | } 236 | -------------------------------------------------------------------------------- /docker/Dockerfile.end-user: -------------------------------------------------------------------------------- 1 | # Dockerfile describing end-user CUDA-accelerated FEniCS environments 2 | # Modified version of the DOLFINx end user Docker file 3 | # 4 | # Authors: 5 | # Benjamin Pachev 6 | # 7 | 8 | ARG PYVISTA_VERSION=0.44.2 9 | 10 | # Used to set the correct PYTHONPATH for the real and complex install of 11 | # DOLFINx 12 | ARG PYTHON_VERSION=3.12 13 | # Base image for end-user images 14 | ARG BASEIMAGE=benpachev/cudolfinx:dev-env-v0.9.0 15 | ARG CUDOLFINX_TAG=v0.9.0 16 | 17 | FROM ${BASEIMAGE} as cudolfinx 18 | LABEL description="cuDOLFINx (onbuild)" 19 | 20 | ARG PYTHON_VERSION 21 | 22 | WORKDIR /src 23 | 24 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/dolfinx.git 25 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/ffcx.git 26 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/FEniCS/basix.git 27 | RUN git clone --depth 1 --branch 2024.2.0 https://github.com/FEniCS/ufl.git 28 | RUN git clone --depth 1 --branch v0.9.0 https://github.com/bpachev/cuda-dolfinx.git 29 | 30 | RUN cp dolfinx/docker/dolfinx-real-mode /usr/local/bin/dolfinx-real-mode 31 | RUN chmod +x /usr/local/bin/dolfinx-*-mode 32 | 33 | # CMake build type for DOLFINx C++ build. See CMake documentation. 34 | ARG DOLFINX_CMAKE_BUILD_TYPE="Release" 35 | 36 | # Using pip install `.[test]` with --no-dependencies and --no-build-isolation 37 | # does not install necessary packages, hence install build and optional 38 | # dependencies manually here.
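# (For reference, python/build-requirements.txt in this repository lists
# nanobind>=1.8.0, scikit-build-core[pyproject], petsc4py and mpi4py.)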
39 | RUN pip install --no-cache-dir -r dolfinx/python/build-requirements.txt && \ 40 | pip install --no-cache-dir pyamg pytest scipy matplotlib numba # test + optional set 41 | 42 | RUN cd basix && cmake -G Ninja -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} -B build-dir -S ./cpp && \ 43 | cmake --build build-dir && \ 44 | cmake --install build-dir && \ 45 | pip install ./python && \ 46 | cd ../ufl && pip install --no-cache-dir . && \ 47 | cd ../ffcx && pip install --no-cache-dir . && \ 48 | cd ../ && pip install --no-cache-dir ipython 49 | 50 | RUN apt-get -qq update && \ 51 | apt-get install -y libboost-timer-dev libboost-filesystem-dev 52 | 53 | # --no-dependencies is necessary because --target does not contain dependencies, 54 | # e.g. mpi4py - otherwise this leads to an unwanted rebuild. 55 | RUN cd dolfinx && \ 56 | mkdir -p build-real && \ 57 | cd build-real && \ 58 | PETSC_ARCH=linux-gnu-real64-32-cuda cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local/dolfinx-real -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} ../cpp && \ 59 | ninja install && \ 60 | cd ../python && \ 61 | PETSC_ARCH=linux-gnu-real64-32-cuda pip -v install \ 62 | --config-settings=cmake.build-type="${DOLFINX_CMAKE_BUILD_TYPE}" --config-settings=install.strip=false --no-build-isolation --check-build-dependencies \ 63 | --target /usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages --no-dependencies --no-cache-dir '.' 64 | 65 | # Currently cuDOLFINX only supports real mode, as the CUDA version of PETSc does not appear to compile with complex types. 66 | ENV PKG_CONFIG_PATH=/usr/local/dolfinx-real/lib/pkgconfig:$PKG_CONFIG_PATH \ 67 | CMAKE_PREFIX_PATH=/usr/local/dolfinx-real/lib/cmake:$CMAKE_PREFIX_PATH \ 68 | PETSC_ARCH=linux-gnu-real64-32-cuda \ 69 | PYTHONPATH=/usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages:$PYTHONPATH \ 70 | LD_LIBRARY_PATH=/usr/local/dolfinx-real/lib:$LD_LIBRARY_PATH 71 | 72 | RUN cd cuda-dolfinx && \ 73 | mkdir -p build-real && \ 74 | cd build-real && \ 75 | PETSC_ARCH=linux-gnu-real64-32-cuda cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local/dolfinx-real -DCMAKE_BUILD_TYPE=${DOLFINX_CMAKE_BUILD_TYPE} ../cpp && \ 76 | ninja install && \ 77 | cd ../python && \ 78 | PETSC_ARCH=linux-gnu-real64-32-cuda pip -v install \ 79 | --config-settings=cmake.build-type="${DOLFINX_CMAKE_BUILD_TYPE}" --config-settings=install.strip=false --no-build-isolation --check-build-dependencies \ 80 | --target /usr/local/dolfinx-real/lib/python${PYTHON_VERSION}/dist-packages --no-dependencies --no-cache-dir '.'
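# A hypothetical smoke test that could be inserted at this point to check
# that the bindings at least import (left commented out: it is not part of
# the original image definition, and importing may require a visible GPU):
# RUN python3 -c "import cudolfinx"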
81 | 82 | # Prepending /usr/local to paths is needed to make the correct version of MPI be used (not the one that comes with NVHPC) 83 | # Since this container doesn't currently install GPU aware MPI, PETSc needs the gpu aware MPI option turned off 84 | # TODO: fix the base container to install GPU-aware MPI 85 | ENV PETSC_OPTIONS="-use_gpu_aware_mpi 0" \ 86 | LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH \ 87 | PATH=/usr/local/bin:$PATH 88 | -------------------------------------------------------------------------------- /docker/Dockerfile.test-env: -------------------------------------------------------------------------------- 1 | FROM nvcr.io/nvidia/nvhpc:24.9-devel-cuda12.6-ubuntu24.04 as cudolfinx-dev-env 2 | 3 | ARG ADIOS2_VERSION=2.10.2 4 | ARG DOXYGEN_VERSION=1_13_2 5 | ARG GMSH_VERSION=4_13_1 6 | ARG HDF5_VERSION=1.14.6 7 | ARG KAHIP_VERSION=3.18 8 | # NOTE: The NumPy version (https://pypi.org/project/numpy/#history) 9 | # should be pinned to the most recent NumPy release that is supported by 10 | # the most recent Numba release, see 11 | # https://numba.readthedocs.io/en/stable/user/installing.html#version-support-information 12 | ARG NUMPY_VERSION=2.1.3 13 | ARG PETSC_VERSION=3.22.4 14 | ARG SLEPC_VERSION=3.22.2 15 | ARG SPDLOG_VERSION=1.15.1 16 | 17 | ARG MPICH_VERSION=4.2.3 18 | ARG OPENMPI_SERIES=5.0 19 | ARG OPENMPI_PATCH=7 20 | 21 | ######################################## 22 | 23 | LABEL maintainer="Benjamin Pachev " 24 | LABEL description="Modified FEniCS dev environment with CUDA PETSc installed." 25 | 26 | ARG ADIOS2_VERSION 27 | ARG DOXYGEN_VERSION 28 | ARG GMSH_VERSION 29 | ARG HDF5_VERSION 30 | ARG KAHIP_VERSION 31 | ARG PETSC_VERSION 32 | ARG SLEPC_VERSION 33 | ARG SPDLOG_VERSION 34 | ARG NUMPY_VERSION 35 | ARG MPICH_VERSION 36 | ARG OPENMPI_SERIES 37 | ARG OPENMPI_PATCH 38 | 39 | # The following ARGS are used in the dev-env layer. 40 | # They are safe defaults. They can be overridden by the user. 41 | # Compiler optimisation flags for SLEPc and PETSc, all languages. 42 | ARG PETSC_SLEPC_OPTFLAGS="-O2" 43 | # Turn on PETSc and SLEPc debugging. "yes" or "no". 44 | ARG PETSC_SLEPC_DEBUGGING="no" 45 | 46 | # MPI variant. "mpich" or "openmpi". 47 | ARG MPI="openmpi" 48 | 49 | # Number of build threads to use with make 50 | ARG BUILD_NP=4 51 | 52 | WORKDIR /tmp 53 | 54 | # Environment variables 55 | ENV OPENBLAS_NUM_THREADS=1 \ 56 | OPENBLAS_VERBOSE=0 57 | 58 | # Install dependencies available via apt-get. 59 | # - First set of packages are required to build and run FEniCS. 60 | # - Second set of packages are recommended and/or required to build 61 | # documentation or tests. 62 | # - Third set of packages are optional, but required to run gmsh 63 | # pre-built binaries. 
64 | RUN export DEBIAN_FRONTEND=noninteractive && \ 65 | apt-get -qq update && \ 66 | apt-get -yq --with-new-pkgs -o Dpkg::Options::="--force-confold" upgrade && \ 67 | apt-get -y install \ 68 | cmake \ 69 | g++ \ 70 | gfortran \ 71 | libboost-dev \ 72 | liblapack-dev \ 73 | libopenblas-dev \ 74 | libpugixml-dev \ 75 | ninja-build \ 76 | pkg-config \ 77 | python3-dev \ 78 | python3-pip \ 79 | python3-venv && \ 80 | # 81 | apt-get -y install \ 82 | catch2 \ 83 | git \ 84 | graphviz \ 85 | libeigen3-dev \ 86 | valgrind \ 87 | wget && \ 88 | # 89 | apt-get -y install \ 90 | libglu1 \ 91 | libxcursor-dev \ 92 | libxft2 \ 93 | libxinerama1 \ 94 | libfltk1.3-dev \ 95 | libfreetype6-dev \ 96 | libgl1-mesa-dev \ 97 | libocct-foundation-dev \ 98 | libocct-data-exchange-dev && \ 99 | apt-get clean && \ 100 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 101 | 102 | # Install spdlog from source - Ubuntu version is incompatible with CUDA 12. 103 | RUN wget -nc --quiet https://github.com/gabime/spdlog/archive/refs/tags/v${SPDLOG_VERSION}.tar.gz && \ 104 | tar xfz v${SPDLOG_VERSION}.tar.gz && \ 105 | cd spdlog-${SPDLOG_VERSION} && \ 106 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DSPDLOG_BUILD_SHARED=ON -DSPDLOG_BUILD_PIC=ON -B build-dir . && \ 107 | cmake --build build-dir && \ 108 | cmake --install build-dir && \ 109 | rm -rf /tmp/* 110 | 111 | # Install Doxygen 112 | RUN apt-get -qq update && \ 113 | apt-get -y install bison flex && \ 114 | wget -nc --quiet https://github.com/doxygen/doxygen/archive/refs/tags/Release_${DOXYGEN_VERSION}.tar.gz && \ 115 | tar xfz Release_${DOXYGEN_VERSION}.tar.gz && \ 116 | cd doxygen-Release_${DOXYGEN_VERSION} && \ 117 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -B build-dir . && \ 118 | cmake --build build-dir && \ 119 | cmake --install build-dir && \ 120 | apt-get -y purge bison flex && \ 121 | apt-get -y autoremove && \ 122 | apt-get clean && \ 123 | rm -rf /tmp/* 124 | 125 | # Install MPI 126 | RUN if [ "$MPI" = "mpich" ]; then \ 127 | wget https://www.mpich.org/static/downloads/${MPICH_VERSION}/mpich-${MPICH_VERSION}.tar.gz && \ 128 | tar xfz mpich-${MPICH_VERSION}.tar.gz && \ 129 | cd mpich-${MPICH_VERSION} && \ 130 | ./configure && \ 131 | make -j${BUILD_NP} install; \ 132 | else \ 133 | wget https://download.open-mpi.org/release/open-mpi/v${OPENMPI_SERIES}/openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH}.tar.gz && \ 134 | tar xfz openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH}.tar.gz && \ 135 | cd openmpi-${OPENMPI_SERIES}.${OPENMPI_PATCH} && \ 136 | ./configure && \ 137 | make -j${BUILD_NP} install; \ 138 | fi && \ 139 | ldconfig && \ 140 | rm -rf /tmp/* 141 | 142 | ENV VIRTUAL_ENV=/dolfinx-env 143 | ENV PATH=/dolfinx-env/bin:$PATH 144 | RUN python3 -m venv ${VIRTUAL_ENV} 145 | 146 | # Install Python packages (via pip) 147 | RUN pip install --no-cache-dir --upgrade pip setuptools wheel && \ 148 | pip install --no-cache-dir cython numpy==${NUMPY_VERSION} && \ 149 | CFLAGS="-noswitcherror" pip install --no-cache-dir mpi4py 150 | 151 | # Install KaHIP 152 | RUN wget -nc --quiet https://github.com/kahip/kahip/archive/v${KAHIP_VERSION}.tar.gz && \ 153 | tar -xf v${KAHIP_VERSION}.tar.gz && \ 154 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DNONATIVEOPTIMIZATIONS=on -B build-dir -S KaHIP-${KAHIP_VERSION} && \ 155 | cmake --build build-dir && \ 156 | cmake --install build-dir && \ 157 | rm -rf /tmp/* 158 | 159 | # Install HDF5 160 | # Note: HDF5 CMake install has numerous bugs and inconsistencies. Test carefully. 
161 | # HDF5 overrides CMAKE_INSTALL_PREFIX by default, hence it is set 162 | # below to ensure that HDF5 is installed into a path where it can be 163 | # found. 164 | RUN wget -nc --quiet https://github.com/HDFGroup/hdf5/archive/refs/tags/hdf5_${HDF5_VERSION}.tar.gz && \ 165 | tar xfz hdf5_${HDF5_VERSION}.tar.gz && \ 166 | cmake -G Ninja -DCMAKE_INSTALL_PREFIX=/usr/local -DCMAKE_BUILD_TYPE=Release -DHDF5_ENABLE_PARALLEL=on -DHDF5_ENABLE_Z_LIB_SUPPORT=on -B build-dir -S hdf5-hdf5_${HDF5_VERSION} && \ 167 | cmake --build build-dir && \ 168 | cmake --install build-dir && \ 169 | rm -rf /tmp/* 170 | 171 | # Install ADIOS2 (Python interface in /usr/local/lib), same as GMSH 172 | RUN wget -nc --quiet https://github.com/ornladios/ADIOS2/archive/v${ADIOS2_VERSION}.tar.gz -O adios2-v${ADIOS2_VERSION}.tar.gz && \ 173 | mkdir -p adios2-v${ADIOS2_VERSION} && \ 174 | tar -xf adios2-v${ADIOS2_VERSION}.tar.gz -C adios2-v${ADIOS2_VERSION} --strip-components 1 && \ 175 | cmake -G Ninja -DADIOS2_USE_HDF5=on -DCMAKE_INSTALL_PYTHONDIR=/usr/local/lib/ -DADIOS2_USE_Fortran=off -DBUILD_TESTING=off -DADIOS2_BUILD_EXAMPLES=off -DADIOS2_USE_ZeroMQ=off -B build-dir -S ./adios2-v${ADIOS2_VERSION} && \ 176 | cmake --build build-dir && \ 177 | cmake --install build-dir && \ 178 | rm -rf /tmp/* 179 | 180 | # Install GMSH 181 | RUN git clone -b gmsh_${GMSH_VERSION} --single-branch --depth 1 https://gitlab.onelab.info/gmsh/gmsh.git && \ 182 | cmake -G Ninja -DCMAKE_BUILD_TYPE=Release -DENABLE_BUILD_DYNAMIC=1 -DENABLE_OPENMP=1 -B build-dir -S gmsh && \ 183 | cmake --build build-dir && \ 184 | cmake --install build-dir && \ 185 | rm -rf /tmp/* 186 | 187 | # GMSH installs python library in /usr/local/lib, see: https://gitlab.onelab.info/gmsh/gmsh/-/issues/1414 188 | ENV PYTHONPATH=/usr/local/lib:$PYTHONPATH 189 | 190 | # Install PETSc and petsc4py with real and complex types 191 | ENV PETSC_DIR=/usr/local/petsc SLEPC_DIR=/usr/local/slepc 192 | RUN ln -sf /opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/libcuda.so /opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/libcuda.so.1 193 | RUN apt-get -qq update && \ 194 | apt-get -y install bison flex && \ 195 | git clone --depth=1 -b v${PETSC_VERSION} https://gitlab.com/petsc/petsc.git ${PETSC_DIR} && \ 196 | cd ${PETSC_DIR} && \ 197 | # Real64, 32-bit int with CUDA 198 | LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/nvidia/hpc_sdk/Linux_x86_64/24.9/cuda/lib64/stubs/ ./configure \ 199 | PETSC_ARCH=linux-gnu-real64-32-cuda \ 200 | --COPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 201 | --CXXOPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 202 | --FOPTFLAGS="${PETSC_SLEPC_OPTFLAGS}" \ 203 | --with-64-bit-indices=no \ 204 | --with-debugging=${PETSC_SLEPC_DEBUGGING} \ 205 | --with-fortran-bindings=no \ 206 | --with-shared-libraries \ 207 | --download-hypre \ 208 | --download-metis \ 209 | --download-mumps-avoid-mpi-in-place \ 210 | --download-mumps \ 211 | --download-ptscotch \ 212 | --download-scalapack \ 213 | --with-cuda\ 214 | --download-spai \ 215 | --download-suitesparse \ 216 | --with-scalar-type=real \ 217 | --with-precision=double && \ 218 | make PETSC_ARCH=linux-gnu-real64-32-cuda ${MAKEFLAGS} all 219 | 220 | # Install petsc4py 221 | RUN cd ${PETSC_DIR}/src/binding/petsc4py && \ 222 | PETSC_ARCH=linux-gnu-real64-32-cuda CFLAGS="-noswitcherror" pip -v install --no-cache-dir --no-build-isolation . 
&& \ 223 | # Cleanup 224 | apt-get -y purge bison flex && \ 225 | apt-get -y autoremove && \ 226 | apt-get clean && \ 227 | rm -rf \ 228 | ${PETSC_DIR}/**/tests/ \ 229 | ${PETSC_DIR}/**/obj/ \ 230 | ${PETSC_DIR}/**/externalpackages/ \ 231 | ${PETSC_DIR}/CTAGS \ 232 | ${PETSC_DIR}/RDict.log \ 233 | ${PETSC_DIR}/TAGS \ 234 | ${PETSC_DIR}/docs/ \ 235 | ${PETSC_DIR}/share/ \ 236 | ${PETSC_DIR}/src/ \ 237 | ${PETSC_DIR}/systems/ && \ 238 | rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 239 | 240 | WORKDIR /root 241 | 242 | -------------------------------------------------------------------------------- /python/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.18) 2 | 3 | project(cudolfinx_nanobind) 4 | 5 | # Set C++ standard 6 | set(CMAKE_CXX_STANDARD 20) 7 | set(CMAKE_CXX_STANDARD_REQUIRED ON) 8 | set(CMAKE_CXX_EXTENSIONS OFF) 9 | 10 | find_package( 11 | Python 12 | COMPONENTS Interpreter Development 13 | REQUIRED 14 | ) 15 | 16 | # Detect the installed nanobind package and import it into CMake 17 | execute_process( 18 | COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir 19 | OUTPUT_STRIP_TRAILING_WHITESPACE 20 | OUTPUT_VARIABLE NB_DIR 21 | ) 22 | list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}") 23 | find_package(nanobind CONFIG REQUIRED) 24 | 25 | execute_process( 26 | COMMAND 27 | ${Python_EXECUTABLE} -c 28 | "import os, sys, basix; sys.stdout.write(os.path.dirname(basix.__file__))" 29 | OUTPUT_VARIABLE BASIX_PY_DIR 30 | RESULT_VARIABLE BASIX_PY_COMMAND_RESULT 31 | ERROR_VARIABLE BASIX_ERROR_OUT 32 | OUTPUT_STRIP_TRAILING_WHITESPACE 33 | ) 34 | find_package(Basix REQUIRED CONFIG HINTS ${BASIX_PY_DIR}) 35 | 36 | if(Basix_FOUND) 37 | message(STATUS "Found Basix at ${Basix_DIR}") 38 | endif() 39 | 40 | find_package(DOLFINX REQUIRED CONFIG) 41 | 42 | if(DOLFINX_FOUND) 43 | message(STATUS "Found DOLFINx at ${DOLFINX_DIR}") 44 | endif() 45 | 46 | find_package(CUDOLFINX REQUIRED CONFIG) 47 | 48 | if(CUDOLFINX_FOUND) 49 | message(STATUS "Found CUDOLFINx at ${CUDOLFINX_DIR}") 50 | endif() 51 | 52 | find_package(CUDAToolkit REQUIRED) 53 | 54 | # Create the binding library nanobind handles its own calls to 55 | # target_link_libraries 56 | nanobind_add_module( 57 | cpp 58 | NOMINSIZE 59 | cudolfinx/wrappers/cudolfinx.cpp 60 | cudolfinx/wrappers/fem.cpp 61 | cudolfinx/wrappers/petsc.cpp 62 | ) 63 | 64 | # Add strict compiler flags include(CheckCXXCompilerFlag) 65 | # check_cxx_compiler_flag("-Wall -Werror -pedantic" HAVE_PEDANTIC) 66 | 67 | # if(HAVE_PEDANTIC) # target_compile_options(cpp PRIVATE 68 | # -Wall;-Werror;-pedantic) endif() 69 | 70 | # Add DOLFINx libraries 71 | target_link_libraries(cpp PRIVATE dolfinx) 72 | target_link_libraries(cpp PRIVATE cudolfinx) 73 | target_link_libraries(cpp PRIVATE CUDA::cuda_driver CUDA::nvrtc CUDA::cupti) 74 | target_include_directories(cpp SYSTEM PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) 75 | 76 | # Check for petsc4py 77 | execute_process( 78 | COMMAND ${Python_EXECUTABLE} -c 79 | "import petsc4py; print(petsc4py.get_include())" 80 | OUTPUT_VARIABLE PETSC4PY_INCLUDE_DIR 81 | RESULT_VARIABLE PETSC4PY_COMMAND_RESULT 82 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 83 | ) 84 | 85 | if(NOT PETSC4PY_COMMAND_RESULT) 86 | message(STATUS "Found petsc4py include directory at ${PETSC4PY_INCLUDE_DIR}") 87 | target_include_directories(cpp PRIVATE ${PETSC4PY_INCLUDE_DIR}) 88 | else() 89 | message(FATAL_ERROR "petsc4py could not be found.") 90 | endif() 91 | 92 | # Check for mpi4py 93 |
execute_process( 94 | COMMAND "${Python_EXECUTABLE}" -c "import mpi4py; print(mpi4py.get_include())" 95 | OUTPUT_VARIABLE MPI4PY_INCLUDE_DIR 96 | RESULT_VARIABLE MPI4PY_COMMAND_RESULT 97 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE 98 | ) 99 | 100 | if(NOT MPI4PY_COMMAND_RESULT) 101 | message(STATUS "Found mpi4py include directory at ${MPI4PY_INCLUDE_DIR}") 102 | target_include_directories(cpp PRIVATE ${MPI4PY_INCLUDE_DIR}) 103 | else() 104 | message(FATAL_ERROR "mpi4py could not be found.") 105 | endif() 106 | 107 | set_target_properties(cpp PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) 108 | 109 | install(TARGETS cpp DESTINATION cudolfinx) 110 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | # CUDOLFINx Python interface installation 2 | 3 | Below is guidance for building the CUDOLFINx Python interface. 4 | 5 | 1. Build and install the CUDOLFINx C++ library. 6 | 7 | 2. Ensure the Python interface build requirements are installed: 8 | 9 | pip install -r build-requirements.txt 10 | 11 | 3. Build the CUDOLFINx Python interface: 12 | 13 | pip install --check-build-dependencies --no-build-isolation . 14 | 15 | To build in debug and editable mode for development: 16 | 17 | pip -v install --check-build-dependencies --config-settings=build-dir="build" --config-settings=cmake.build-type="Debug" --config-settings=install.strip=false --no-build-isolation -e . 18 | -------------------------------------------------------------------------------- /python/build-requirements.txt: -------------------------------------------------------------------------------- 1 | nanobind>=1.8.0 2 | scikit-build-core[pyproject] 3 | petsc4py 4 | mpi4py 5 | -------------------------------------------------------------------------------- /python/cudolfinx/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | """Main module for CUDOLFINx""" 8 | 9 | from cudolfinx.assemble import CUDAAssembler 10 | from cudolfinx.form import form 11 | from cudolfinx.mesh import ghost_layer_mesh, ghost_layer_meshtags 12 | -------------------------------------------------------------------------------- /python/cudolfinx/bcs.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from cudolfinx import cpp as _cucpp 8 | from dolfinx import cpp as _cpp 9 | from dolfinx.fem.bcs import DirichletBC 10 | import typing 11 | 12 | class CUDADirichletBC: 13 | """Represents a collection of boundary conditions 14 | """ 15 | 16 | def __init__(self, ctx, bcs: typing.List[DirichletBC]): 17 | """Initialize a collection of boundary conditions 18 | """ 19 | 20 | self.bcs = bcs 21 | self._function_spaces = [] 22 | self._bc_lists = [] 23 | self._device_bcs = [] 24 | self._ctx = ctx 25 | 26 | for bc in bcs: 27 | V = bc.function_space 28 | try: 29 | i = self._function_spaces.index(V) 30 | except ValueError: 31 | self._function_spaces.append(V) 32 | self._bc_lists.append([]) 33 | i = -1 # the newly appended function space is the last entry 34 | self._bc_lists[i].append(bc._cpp_object) 35 | 36 | for V, cpp_bcs in zip(self._function_spaces, self._bc_lists): 37 | _cpp_bc_obj = self._make_device_bc(V, cpp_bcs) 38 |
self._device_bcs.append(_cpp_bc_obj) 39 | 40 | def _make_device_bc(self, 41 | V: typing.Union[_cpp.fem.FunctionSpace_float32, _cpp.fem.FunctionSpace_float64], 42 | cpp_bcs: typing.List[typing.Union[_cpp.fem.DirichletBC_float32, _cpp.fem.DirichletBC_float64]] 43 | ): 44 | """Create device bc object wrapping a list of bcs for the same function space""" 45 | 46 | if type(V) is _cpp.fem.FunctionSpace_float32: 47 | return _cucpp.fem.CUDADirichletBC_float32(self._ctx, V, cpp_bcs) 48 | elif type(V) is _cpp.fem.FunctionSpace_float64: 49 | return _cucpp.fem.CUDADirichletBC_float64(self._ctx, V, cpp_bcs) 50 | else: 51 | raise TypeError(f"Invalid type for cpp FunctionSpace object '{type(V)}'") 52 | 53 | def _get_cpp_bcs(self, V: typing.Union[_cpp.fem.FunctionSpace_float32, _cpp.fem.FunctionSpace_float64]): 54 | """Get cpp CUDADirichletBC object 55 | """ 56 | 57 | # Use this to avoid needing hashes (which might not be supported) 58 | # Usually there will be a max of two function spaces associated with a set of bcs 59 | try: 60 | i = self._function_spaces.index(V) 61 | return self._device_bcs[i] 62 | except ValueError: 63 | # return empty collection 64 | return self._make_device_bc(V, []) 65 | 66 | def update(self, bcs: typing.Optional[typing.List[DirichletBC]] = None): 67 | """Update a subset of the boundary conditions. 68 | 69 | Used for cases with time-varying boundary conditions whose device-side values 70 | need to be updated. By default, all boundary conditions are updated. 71 | """ 72 | 73 | if bcs is None: 74 | bcs = self.bcs 75 | _bcs_to_update = [bc._cpp_object for bc in bcs] 76 | 77 | for _cpp_bc, V in zip(self._device_bcs, self._function_spaces): 78 | # filter out anything not contained in the right function space 79 | _cpp_bc.update([_bc for _bc in _bcs_to_update if V.contains(_bc.function_space)]) 80 | 81 | -------------------------------------------------------------------------------- /python/cudolfinx/context.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from petsc4py import PETSc 8 | from cudolfinx import cpp as _cucpp 9 | 10 | _device = None 11 | 12 | def _init_device(): 13 | """Initialize PETSc device 14 | """ 15 | global _device 16 | d = PETSc.Device() 17 | d.create(PETSc.Device.Type.CUDA) 18 | _device = d 19 | 20 | def get_device(): 21 | """Return PETSc device 22 | """ 23 | 24 | global _device 25 | if _device is None: 26 | _init_device() 27 | return _device 28 | 29 | def get_cuda_context(): 30 | """Return the CUDA context, initializing it if needed 31 | """ 32 | global _device 33 | if _device is None: 34 | _init_device() 35 | return _cucpp.fem.CUDAContext() 36 | -------------------------------------------------------------------------------- /python/cudolfinx/form.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import collections 8 | from cudolfinx.context import get_cuda_context 9 | from cudolfinx import cpp as _cucpp, jit 10 | from dolfinx import fem as fe 11 | from dolfinx import cpp as _cpp 12 | import numpy as np 13 | import typing 14 | import ufl 15 | 16 | class CUDAForm: 17 | """CUDA wrapper class for a dolfinx.fem.Form 18 | """ 19 | 20 | def __init__(self, form: fe.Form): 21 | """Initialize the wrapper 22 | """
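# Typical construction goes through cudolfinx.form() defined later in this
# module, rather than through this class directly. Rough usage sketch (the
# CUDAAssembler construction and assemble_matrix call are assumptions based
# on the package __init__, not taken from this file):
#
#   import cudolfinx
#   a = cudolfinx.form(ufl_form)        # wraps fe.form(ufl_form)
#   assembler = cudolfinx.CUDAAssembler()
#   A = assembler.assemble_matrix(a)    # device-side assembly (assumed API)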
23 | 24 | self._ctx = get_cuda_context() 25 | self._cuda_mesh = _create_mesh_on_device(form.mesh, self._ctx) 26 | 27 | self._dolfinx_form = form 28 | self._wrapped_tabulate_tensors = jit.get_wrapped_tabulate_tensors(form) 29 | ufcx_form_addr = form.module.ffi.cast("uintptr_t", form.module.ffi.addressof(form.ufcx_form)) 30 | 31 | cpp_form = form._cpp_object 32 | if type(cpp_form) is _cpp.fem.Form_float32: 33 | form_cls = _cucpp.fem.CUDAForm_float32 34 | elif type(cpp_form) is _cpp.fem.Form_float64: 35 | form_cls = _cucpp.fem.CUDAForm_float64 36 | else: 37 | raise ValueError(f"Cannot instantiate CUDAForm for Form of type {type(cpp_form)}!") 38 | 39 | _tabulate_tensor_names = [] 40 | _tabulate_tensor_sources = [] 41 | for name, source in self._wrapped_tabulate_tensors: 42 | _tabulate_tensor_names.append(name) 43 | _tabulate_tensor_sources.append(source) 44 | 45 | self._cuda_form = form_cls( 46 | self._ctx, 47 | cpp_form, 48 | ufcx_form_addr, 49 | _tabulate_tensor_names, 50 | _tabulate_tensor_sources 51 | ) 52 | 53 | # TODO expose these parameters to the user 54 | self._cuda_form.compile(self._ctx, max_threads_per_block=1024, min_blocks_per_multiprocessor=1) 55 | 56 | def to_device(self): 57 | """Copy host-side coefficients and constants to the device 58 | """ 59 | 60 | self._cuda_form.to_device(self._ctx) 61 | 62 | @property 63 | def cuda_form(self): 64 | """Return the underlying cpp CUDAForm 65 | """ 66 | 67 | return self._cuda_form 68 | 69 | @property 70 | def cuda_mesh(self): 71 | """Return the underlying cpp CUDAMesh""" 72 | 73 | return self._cuda_mesh 74 | 75 | @property 76 | def dolfinx_form(self): 77 | """Return the underlying Dolfinx form 78 | """ 79 | 80 | return self._dolfinx_form 81 | 82 | @property 83 | def function_spaces(self): 84 | """Return a list of FunctionSpaces corresponding to the form 85 | """ 86 | 87 | return self._dolfinx_form.function_spaces 88 | 89 | class BlockCUDAForm: 90 | """Data structure containing multiple CUDA forms to be used in block assembly.""" 91 | 92 | def __init__( 93 | self, forms: typing.Union[list[CUDAForm], list[list[CUDAForm]]], 94 | restrictions: typing.Optional[ 95 | typing.Union[ 96 | list[np.typing.NDArray[np.int32]], 97 | tuple[list[np.typing.NDArray[np.int32]], list[np.typing.NDArray[np.int32]]] 98 | ]] = None 99 | ): 100 | """Initialize the data structure.""" 101 | 102 | self._forms = forms 103 | self._restrictions = restrictions 104 | 105 | if not len(forms): raise ValueError("Must provide at least one form!") 106 | if type(forms[0]) is CUDAForm: self._init_vector() 107 | else: self._init_matrix() 108 | 109 | def _init_vector(self): 110 | """Initialize vector form.""" 111 | 112 | offset = 0 113 | offsets = [offset] 114 | for i, form in enumerate(self._forms): 115 | # note in dolfinx 0.10.0 dofmap is replaced with dofmaps 116 | # which means this portion will require reworking 117 | dofmap = form.function_spaces[0].dofmap 118 | local_size = dofmap.index_map.size_local 119 | if self._restrictions is not None: 120 | restriction_inds = self._restrictions[i] 121 | # ignore ghosts 122 | restriction_inds = restriction_inds[restriction_inds < local_size] 123 | local_size = len(restriction_inds) 124 | else: 125 | restriction_inds = np.arange(local_size, dtype=np.int32) 126 | target_inds = offset + np.arange(local_size, dtype=np.int32) 127 | offset += local_size * dofmap.index_map_bs 128 | offsets.append(offset) 129 | form.cuda_form.set_restriction([restriction_inds], [target_inds]) 130 | 131 | self._offsets = offsets 132 | comm = 
self._forms[0].dolfinx_form.mesh.comm 133 | self._global_size = comm.allreduce(offsets[-1]) 134 | 135 | 136 | def _init_matrix(self): 137 | """Initialize matrix form.""" 138 | 139 | raise NotImplementedError("Block matrix assembly is not yet implemented!") 140 | 141 | @property 142 | def forms(self): 143 | """Return the list of forms.""" 144 | 145 | return self._forms 146 | 147 | @property 148 | def dolfinx_forms(self): 149 | """Return list of underlying dolfinx forms.""" 150 | 151 | return [f.dolfinx_form for f in self._forms] 152 | 153 | @property 154 | def offsets(self): 155 | """Return list of offsets.""" 156 | 157 | return self._offsets 158 | 159 | @property 160 | def local_size(self): 161 | """Return size of local vector.""" 162 | 163 | return self._offsets[-1] 164 | 165 | @property 166 | def global_size(self): 167 | """Return size of global vector.""" 168 | 169 | return self._global_size 170 | 171 | def form( 172 | form: typing.Union[ufl.Form, typing.Iterable[ufl.Form]], 173 | restriction: typing.Optional[typing.Iterable[np.typing.NDArray[np.int32]]] = None, 174 | **kwargs): 175 | """Create a CUDAForm from a ufl form.""" 176 | 177 | def _create_form(form): 178 | """Recursively convert ufl.Forms to CUDAForm.""" 179 | 180 | if isinstance(form, ufl.Form): 181 | dolfinx_form = fe.form(form, **kwargs) 182 | return CUDAForm(dolfinx_form) 183 | elif isinstance(form, collections.abc.Iterable): 184 | return [_create_form(sub_form) for sub_form in form] 185 | else: 186 | raise TypeError(f"Expected form to be a ufl.Form or an iterable, got type '{type(form)}'!") 187 | 188 | cuda_form = _create_form(form) 189 | # TODO: properly handle restriction for a single form 190 | if isinstance(form, collections.abc.Iterable): 191 | return BlockCUDAForm(cuda_form, restriction) 192 | else: return cuda_form 193 | 194 | def _create_mesh_on_device(cpp_mesh: typing.Union[_cpp.mesh.Mesh_float32, _cpp.mesh.Mesh_float64], ctx: _cucpp.fem.CUDAContext): 195 | """Create device-side mesh data 196 | """ 197 | 198 | if type(cpp_mesh) is _cpp.mesh.Mesh_float32: 199 | return _cucpp.fem.CUDAMesh_float32(ctx, cpp_mesh) 200 | elif type(cpp_mesh) is _cpp.mesh.Mesh_float64: 201 | return _cucpp.fem.CUDAMesh_float64(ctx, cpp_mesh) 202 | else: 203 | raise ValueError(f"Cannot instantiate CUDAMesh for Mesh of type {type(cpp_mesh)}!") 204 | 205 | -------------------------------------------------------------------------------- /python/cudolfinx/jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | """Routines for manipulating generated FFCX code 8 | """ 9 | 10 | from dolfinx import fem, cpp 11 | import numpy as np 12 | import pathlib 13 | from typing import Any 14 | 15 | def get_tabulate_tensor_sources(form: fem.Form): 16 | """Given a compiled fem.Form, extract the C source code of the tabulate tensors 17 | """ 18 | 19 | module_file = pathlib.Path(form.module.__file__) 20 | source_filename = module_file.name.split(".")[0] + ".c" 21 | source_file = module_file.parent.joinpath(source_filename) 22 | if not source_file.is_file(): 23 | raise IOError(f"Could not find generated ffcx source file '{source_file}'!") 24 | 25 | tabulate_tensors = [] 26 | parsing_tabulate = False 27 | parsing_header = False 28 | bracket_count = 0 29 | with open(source_file) as fp: 30 | for line in fp: 31 | if "tabulate_tensor_integral" in line and
line.strip().startswith("void"): 32 | parsing_tabulate = True 33 | parsing_header = True 34 | tabulate_id = line.strip().split()[1].split("_")[-1].split("(")[0] 35 | tabulate_body = [] 36 | elif parsing_header: 37 | if line.startswith("{"): 38 | parsing_header = False 39 | bracket_count += 1 40 | elif parsing_tabulate: 41 | if line.startswith("{"): bracket_count += 1 42 | elif line.startswith("}"): bracket_count -= 1 43 | if not bracket_count: 44 | tabulate_tensors.append((tabulate_id, "".join(tabulate_body))) 45 | parsing_tabulate = False 46 | else: 47 | tabulate_body.append(line) 48 | elif "form_integrals_form" in line: 49 | if "{" in line: 50 | arr = line.split("{")[-1].split("}")[0] 51 | ordered_integral_ids = [ 52 | part.strip().split("_")[-1] for part in arr.split(",") 53 | ] 54 | 55 | id_order = {integral_id: i for i, integral_id in enumerate(ordered_integral_ids)} 56 | return sorted(tabulate_tensors, key=lambda tabulate: id_order[tabulate[0]]) 57 | 58 | cuda_tabulate_tensor_header = """ 59 | #define alignas(x) 60 | #define restrict __restrict__ 61 | 62 | typedef unsigned char uint8_t; 63 | typedef unsigned int uint32_t; 64 | typedef double ufc_scalar_t; 65 | 66 | extern "C" __global__ 67 | void tabulate_tensor_{factory_name}({scalar_type}* restrict A, 68 | const {scalar_type}* restrict w, 69 | const {scalar_type}* restrict c, 70 | const {geom_type}* restrict coordinate_dofs, 71 | const int* restrict entity_local_index, 72 | const uint8_t* restrict quadrature_permutation 73 | ) 74 | """ 75 | 76 | def _convert_dtype_to_str(dtype: Any): 77 | """Convert numpy dtype to named C type 78 | """ 79 | 80 | if dtype == np.float32: 81 | return "float" 82 | elif dtype == np.float64: 83 | return "double" 84 | else: 85 | raise TypeError(f"Unsupported dtype: '{dtype}'") 86 | 87 | def get_wrapped_tabulate_tensors(form: fem.Form, backend="cuda"): 88 | """Given a fem.Form, wrap the tabulate tensors for use on a GPU 89 | """ 90 | 91 | if backend != "cuda": 92 | raise NotImplementedError(f"Backend '{backend}' not yet supported.") 93 | 94 | # for now assume same type for form and mesh 95 | # this is typically the default 96 | geom_type = scalar_type = _convert_dtype_to_str(form.dtype) 97 | 98 | res = [] 99 | sources = get_tabulate_tensor_sources(form) 100 | for id, body in sources: 101 | factory_name = "integral_" + id 102 | name = "tabulate_tensor_" + factory_name 103 | header = cuda_tabulate_tensor_header.format( 104 | scalar_type=scalar_type, 105 | geom_type=geom_type, 106 | factory_name=factory_name 107 | ) 108 | wrapped_source = header + "{\n" + body + "}\n" 109 | res.append((name, wrapped_source)) 110 | 111 | return res 112 | 113 | -------------------------------------------------------------------------------- /python/cudolfinx/la.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | from cudolfinx import cpp as _cucpp 8 | 9 | class CUDAVector: 10 | """Vector on device 11 | """ 12 | 13 | def __init__(self, ctx, vec): 14 | """Initialize the vector 15 | """ 16 | 17 | self._petsc_vec = vec 18 | self._ctx = ctx 19 | self._cpp_object = _cucpp.fem.CUDAVector(ctx, self._petsc_vec) 20 | 21 | @property 22 | def vector(self): 23 | """Return underlying PETSc vector 24 | """ 25 | 26 | return self._petsc_vec 27 | 28 | def to_host(self): 29 | """Copy device-side values to host 30 | """ 31 | 32 | self._cpp_object.to_host(self._ctx) 33 | 
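# Hypothetical usage sketch (names are illustrative, not part of this module):
#   vec = CUDAVector(ctx, petsc_vec)   # wrap an existing PETSc Vec on the device
#   ...device-side assembly fills the vector...
#   vec.to_host()                      # copy the device values back to the host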
34 | def __del__(self): 35 | """Delete the vector and free up GPU resources 36 | """ 37 | 38 | # Ensure that the cpp CUDAVector is destroyed BEFORE the PETSc vector 39 | del self._cpp_object 40 | 41 | class CUDAMatrix: 42 | """Matrix on device 43 | """ 44 | 45 | def __init__(self, ctx, petsc_mat): 46 | """Initialize the matrix 47 | """ 48 | 49 | self._petsc_mat = petsc_mat 50 | self._ctx = ctx 51 | self._cpp_object = _cucpp.fem.CUDAMatrix(ctx, petsc_mat) 52 | 53 | @property 54 | def mat(self): 55 | """Return underlying PETSc matrix 56 | """ 57 | 58 | return self._petsc_mat 59 | 60 | def assemble(self): 61 | """Call assemble on the underlying PETSc matrix. 62 | 63 | If the PETSc matrix is not a CUDA matrix, then matrix 64 | values will be explicitly copied to the host. 65 | """ 66 | 67 | self._cpp_object.to_host(self._ctx) 68 | 69 | def __del__(self): 70 | """Delete the matrix and free up GPU resources 71 | """ 72 | 73 | # make sure we delete the CUDAMatrix before the PETSc matrix 74 | del self._cpp_object 75 | 76 | 77 | -------------------------------------------------------------------------------- /python/cudolfinx/mesh.py: -------------------------------------------------------------------------------- 1 | from cudolfinx import cpp as _cucpp 2 | from dolfinx import mesh 3 | 4 | def ghost_layer_mesh(domain: mesh.Mesh): 5 | """Add a ghost layer of cells to the given mesh 6 | """ 7 | _ghost_mesh = _cucpp.fem.ghost_layer_mesh(domain._cpp_object, domain._geometry._cpp_object.cmap) 8 | return mesh.Mesh( 9 | _ghost_mesh, 10 | domain._ufl_domain) 11 | 12 | def ghost_layer_meshtags(meshtags: mesh.MeshTags, ghosted_mesh: mesh.Mesh): 13 | """Transfer meshtags to ghost layer mesh.""" 14 | 15 | _cpp_meshtags = _cucpp.fem.ghost_layer_meshtags(meshtags._cpp_object, ghosted_mesh.topology._cpp_object) 16 | return mesh.MeshTags(_cpp_meshtags) 17 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/caster_petsc.h: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D.
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #pragma once 8 | 9 | #ifdef HAS_PETSC 10 | 11 | #include <nanobind/nanobind.h> 12 | #include <petsc4py/petsc4py.h> 13 | #include <petscmat.h> 14 | #include <petscvec.h> 15 | 16 | // nanobind casters for PETSc/petsc4py objects 17 | 18 | namespace nb = nanobind; 19 | 20 | // Import petsc4py on demand 21 | #define VERIFY_PETSC4PY_FROMPY(func) \ 22 | if (!func) \ 23 | { \ 24 | if (import_petsc4py() != 0) \ 25 | return false; \ 26 | } 27 | 28 | #define VERIFY_PETSC4PY_FROMCPP(func) \ 29 | if (!func) \ 30 | { \ 31 | if (import_petsc4py() != 0) \ 32 | return {}; \ 33 | } 34 | 35 | // Macro for casting between PETSc and petsc4py objects 36 | #define PETSC_CASTER_MACRO(TYPE, P4PYTYPE, NAME) \ 37 | template <> \ 38 | class type_caster<_p_##TYPE> \ 39 | { \ 40 | public: \ 41 | NB_TYPE_CASTER(TYPE, const_name(#NAME)) \ 42 | bool from_python(handle src, uint8_t, cleanup_list*) noexcept \ 43 | { \ 44 | VERIFY_PETSC4PY_FROMPY(PyPetsc##P4PYTYPE##_Get); \ 45 | if (PyObject_TypeCheck(src.ptr(), &PyPetsc##P4PYTYPE##_Type) != 0) \ 46 | { \ 47 | value = PyPetsc##P4PYTYPE##_Get(src.ptr()); \ 48 | return true; \ 49 | } \ 50 | else \ 51 | return false; \ 52 | } \ 53 | \ 54 | static handle from_cpp(TYPE src, rv_policy policy, \ 55 | cleanup_list* /*cleanup*/) noexcept \ 56 | { \ 57 | VERIFY_PETSC4PY_FROMCPP(PyPetsc##P4PYTYPE##_New); \ 58 | if (policy == rv_policy::take_ownership) \ 59 | { \ 60 | PyObject* obj = PyPetsc##P4PYTYPE##_New(src); \ 61 | PetscObjectDereference((PetscObject)src); \ 62 | return nb::handle(obj); \ 63 | } \ 64 | else if (policy == rv_policy::automatic_reference \ 65 | or policy == rv_policy::reference) \ 66 | { \ 67 | PyObject* obj = PyPetsc##P4PYTYPE##_New(src); \ 68 | return nb::handle(obj); \ 69 | } \ 70 | else \ 71 | { \ 72 | return {}; \ 73 | } \ 74 | } \ 75 | \ 76 | operator TYPE() { return value; } \ 77 | } 78 | 79 | namespace nanobind::detail 80 | { 81 | PETSC_CASTER_MACRO(Mat, Mat, mat); 82 | PETSC_CASTER_MACRO(Vec, Vec, vec); 83 | } // namespace nanobind::detail 84 | #endif 85 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/cudolfinx.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D. Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include <nanobind/nanobind.h> 8 | 9 | namespace nb = nanobind; 10 | 11 | namespace cudolfinx_wrappers 12 | { 13 | void fem(nb::module_& m); 14 | void petsc(nb::module_& m_fem); 15 | } // namespace cudolfinx_wrappers 16 | 17 | NB_MODULE(cpp, m) 18 | { 19 | // Create module for C++ wrappers 20 | m.doc() = "DOLFINx CUDA Python interface"; 21 | m.attr("__version__") = CUDOLFINX_VERSION; 22 | 23 | #ifdef NDEBUG 24 | nanobind::set_leak_warnings(false); 25 | #endif 26 | // Create fem submodule [fem] 27 | nb::module_ fem = m.def_submodule("fem", "FEM module"); 28 | cudolfinx_wrappers::fem(fem); 29 | cudolfinx_wrappers::petsc(fem); 30 | } 31 | -------------------------------------------------------------------------------- /python/cudolfinx/wrappers/petsc.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (C) 2024 Benjamin Pachev, James D.
Trotter 2 | // 3 | // This file is part of cuDOLFINX 4 | // 5 | // SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | #include "caster_petsc.h" 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | namespace 23 | { 24 | 25 | void petsc_fem_module(nb::module_& m) 26 | { 27 | m.def("create_cuda_matrix", dolfinx::fem::petsc::create_cuda_matrix, 28 | nb::rv_policy::take_ownership, nb::arg("a"), 29 | "Create a PETSc CUDA Mat for a bilinear form."); 30 | } 31 | 32 | } // namespace 33 | 34 | namespace cudolfinx_wrappers 35 | { 36 | void petsc(nb::module_& m_fem) 37 | { 38 | nb::module_ petsc_fem_mod 39 | = m_fem.def_submodule("petsc", "PETSc-specific finite element module"); 40 | petsc_fem_module(petsc_fem_mod); 41 | } 42 | } // namespace cudolfinx_wrappers 43 | 44 | -------------------------------------------------------------------------------- /python/examples/poisson.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import argparse as ap 8 | from mpi4py import MPI 9 | from petsc4py import PETSc 10 | import cudolfinx as cufem 11 | from dolfinx import fem as fe, mesh 12 | from dolfinx.fem import petsc as fe_petsc 13 | import numpy as np 14 | import ufl 15 | import time 16 | from ufl import dx, ds, grad, inner 17 | 18 | def create_mesh(res: int = 10, dim: int = 3): 19 | """Create a uniform simplicial mesh on the unit cube (dim=3) or unit square (dim=2). 20 | 21 | Parameters 22 | ---------- 23 | res - Number of subdivisions along each dimension 24 | dim - Geometric dimension of mesh 25 | 26 | Returns 27 | ---------- 28 | mesh - The mesh object. 29 | """ 30 | 31 | if dim == 3: 32 | return mesh.create_box( 33 | comm = MPI.COMM_WORLD, 34 | points = ((0,0,0), (1, 1, 1)), 35 | n = (res, res, res), 36 | cell_type = mesh.CellType.tetrahedron 37 | ) 38 | elif dim == 2: 39 | return mesh.create_unit_square(MPI.COMM_WORLD, res, res) 40 | 41 | def main(res, cuda=True, degree=1, dim=3): 42 | """Assembles a stiffness matrix for the Poisson problem with the given resolution.
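Parameters
----------
res - Number of subdivisions along each dimension
cuda - If True (default), assemble on the GPU; otherwise assemble on the host
degree - Polynomial degree of the Lagrange space
dim - Geometric dimension of the mesh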
43 | """ 44 | 45 | domain = create_mesh(res, dim=dim) 46 | comm = domain.comm 47 | if cuda and comm.size > 1: 48 | if comm.rank == 0: 49 | print("Using ghost layer mesh for CUDA Assembly") 50 | domain = cufem.ghost_layer_mesh(domain) 51 | 52 | V = fe.functionspace(domain, ("Lagrange", degree)) 53 | u = ufl.TrialFunction(V) 54 | v = ufl.TestFunction(V) 55 | x = ufl.SpatialCoordinate(domain) 56 | if dim == 3: 57 | f = 10*ufl.exp(-((x[0]-.05)**2 + (x[1]-.05)**2 + (x[2]-.05)**2) / .02) 58 | elif dim == 2: 59 | f = 10*ufl.exp(-((x[0]-.05)**2 + (x[1]-.05)**2) / .02) 60 | g = ufl.sin(5*x[0])*ufl.sin(5*x[1]) 61 | a = inner(grad(u), grad(v)) * dx 62 | L = inner(f, v) * dx + inner(g, v) * ds 63 | 64 | facets = mesh.locate_entities_boundary( 65 | domain, 66 | dim=(domain.topology.dim - 1), 67 | marker=lambda x: np.isclose(x[0], 0.0) | np.isclose(x[0], 2.0), 68 | ) 69 | 70 | dofs = fe.locate_dofs_topological(V=V, entity_dim=domain.topology.dim-1, entities=facets) 71 | bc = fe.dirichletbc(value=PETSc.ScalarType(0), dofs=dofs, V=V) 72 | 73 | if cuda: 74 | a = cufem.form(a) 75 | L = cufem.form(L) 76 | asm = cufem.CUDAAssembler() 77 | A = asm.create_matrix(a) 78 | b = asm.create_vector(L) 79 | device_bcs = asm.pack_bcs([bc]) 80 | else: 81 | a = fe.form(a, jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]}) 82 | L = fe.form(L, jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]}) 83 | A = fe_petsc.create_matrix(a) 84 | b = fe_petsc.create_vector(L) 85 | start = time.time() 86 | if cuda: 87 | asm.assemble_matrix(a, A, bcs=device_bcs) 88 | A.assemble() 89 | else: 90 | fe_petsc.assemble_matrix(A, a, bcs=[bc]) 91 | A.assemble() 92 | elapsed = time.time()-start 93 | 94 | timing = comm.gather(elapsed, root=0) 95 | if comm.rank == 0: 96 | timing = np.asarray(timing) 97 | timing = np.max(timing) 98 | # show max over all MPI processes, as that's the rate-limiter 99 | print(f"Res={res}, Num cells", domain.topology.index_map(domain.topology.dim).size_global) 100 | print(f"Assembly timing: {timing}, Dofs: {V.dofmap.index_map.size_global}") 101 | 102 | if __name__ == "__main__": 103 | parser = ap.ArgumentParser() 104 | parser.add_argument("--res", default=10, type=int, help="Number of subdivisions in each dimension.") 105 | parser.add_argument("--degree", default=1, type=int, help="Polynomial degree.") 106 | parser.add_argument("--dim", default=3, type=int, help="Geometric dimension.") 107 | parser.add_argument("--no-cuda", default=False, action="store_true", help="Disable GPU acceleration.") 108 | args = parser.parse_args() 109 | 110 | main(res=args.res, cuda = not args.no_cuda, degree=args.degree, dim=args.dim) 111 | -------------------------------------------------------------------------------- /python/examples/poisson_sum_factorization.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: LGPL-3.0-or-later 6 | 7 | import argparse as ap 8 | from mpi4py import MPI 9 | from petsc4py import PETSc 10 | try: 11 | import cudolfinx as cufem 12 | except ImportError: 13 | print("Must have cudolfinx to test CUDA assembly.") 14 | 15 | from dolfinx import fem as fe, mesh 16 | from dolfinx.fem import petsc as fe_petsc 17 | import numpy as np 18 | import ufl 19 | import time 20 | from ufl import dx, ds, grad, inner 21 | import basix 22 | 23 | def create_mesh(res: int = 10): 24 | """Create a uniform tetrahedral mesh on the unit cube. 
25 | 26 | Parameters 27 | ---------- 28 | res - Number of subdivisions along each dimension 29 | 30 | Returns 31 | ---------- 32 | mesh - The mesh object. 33 | """ 34 | 35 | return mesh.create_box( 36 | comm = MPI.COMM_WORLD, 37 | points = ((0,0,0), (1, 1, 1)), 38 | n = (res, res, res), 39 | cell_type = mesh.CellType.hexahedron, 40 | ghost_mode = mesh.GhostMode.none, 41 | dtype = np.float64 42 | ) 43 | 44 | def main(res, cuda=True, sum_factorization=True, degree=1): 45 | """Assembles a stiffness matrix for the Poisson problem with the given resolution. 46 | """ 47 | 48 | domain = create_mesh(res) 49 | # Tensor product element 50 | family = basix.ElementFamily.P 51 | variant = basix.LagrangeVariant.gll_warped 52 | cell_type = domain.basix_cell() 53 | 54 | basix_element = basix.create_tp_element( 55 | family, cell_type, degree, variant 56 | ) # doesn't work with tp element, why? 57 | element = basix.ufl._BasixElement(basix_element) # basix ufl element 58 | V = fe.functionspace(domain, element) 59 | u = ufl.TrialFunction(V) 60 | v = ufl.TestFunction(V) 61 | x = ufl.SpatialCoordinate(domain) 62 | f = 10*ufl.exp(-((x[0]-.5)**2 + (x[1]-.5)**2 + (x[2]-.5)**2) / .02) 63 | g = ufl.sin(5*x[0])*ufl.sin(5*x[1]) 64 | a = inner(grad(u), grad(v)) * dx 65 | L = inner(f, v) * dx + inner(g, v) * ds 66 | 67 | facets = mesh.locate_entities_boundary( 68 | domain, 69 | dim=(domain.topology.dim - 1), 70 | marker=lambda x: np.isclose(x[0], 0.0) | np.isclose(x[0], 2.0), 71 | ) 72 | 73 | dofs = fe.locate_dofs_topological(V=V, entity_dim=domain.topology.dim-1, entities=facets) 74 | bc = fe.dirichletbc(value=PETSc.ScalarType(0), dofs=dofs, V=V) 75 | 76 | form_compiler_options = {"sum_factorization": sum_factorization} 77 | 78 | if cuda: 79 | a = cufem.form(a, form_compiler_options=form_compiler_options) 80 | asm = cufem.CUDAAssembler() 81 | A = asm.create_matrix(a) 82 | device_bcs = asm.pack_bcs([bc]) 83 | else: 84 | a = fe.form( 85 | a, 86 | form_compiler_options=form_compiler_options, 87 | jit_options = {"cffi_extra_compile_args":["-O3", "-mcpu=neoverse-v2"]} 88 | ) 89 | A = fe_petsc.create_matrix(a) 90 | 91 | start = time.time() 92 | if cuda: 93 | asm.assemble_matrix(a, A, bcs=device_bcs) 94 | else: 95 | fe_petsc.assemble_matrix(A, a, bcs=[bc]) 96 | A.assemble() 97 | elapsed = time.time()-start 98 | 99 | timing = MPI.COMM_WORLD.gather(elapsed, root=0) 100 | if MPI.COMM_WORLD.rank == 0: 101 | timing = np.asarray(timing) 102 | timing = np.max(timing) 103 | # show max over all MPI processes, as that's the rate-limiter 104 | print(f"Res={res}, Num cells", domain.topology.index_map(domain.topology.dim).size_global) 105 | print(f"Assembly timing: {timing}, Dofs: {V.dofmap.index_map.size_global}") 106 | 107 | if __name__ == "__main__": 108 | parser = ap.ArgumentParser() 109 | parser.add_argument("--res", default=10, type=int, help="Number of subdivisions in each dimension.") 110 | parser.add_argument("--degree", default=1, type=int, help="Polynomial degree.") 111 | parser.add_argument("--no-sum-factorization", default=False, action="store_true", help="Disable sum factorization") 112 | parser.add_argument("--no-cuda", default=False, action="store_true", help="Disable GPU acceleration.") 113 | args = parser.parse_args() 114 | 115 | main( 116 | res = args.res, 117 | cuda = not args.no_cuda, 118 | sum_factorization = not args.no_sum_factorization, 119 | degree = args.degree 120 | ) 121 | -------------------------------------------------------------------------------- /python/pyproject.toml: 
-------------------------------------------------------------------------------- 1 | # The CUDOLFINx Python interface must be built without build isolation (PEP517) 2 | # due to its runtime and build time dependency on system built petsc4py and 3 | # mpi4py. 4 | # pip install -r build-requirements.txt 5 | [build-system] 6 | requires = [ 7 | "scikit-build-core[pyproject]>=0.5", 8 | "nanobind>=1.8.0", 9 | "petsc4py", 10 | "mpi4py", 11 | ] 12 | build-backend = "scikit_build_core.build" 13 | 14 | [project] 15 | name = "fenics-cudolfinx" 16 | version = "0.9.0" 17 | description = "CUDA DOLFINx Python interface" 18 | readme = "../README.md" 19 | requires-python = ">=3.9.0" 20 | license = { file = "../COPYING.LESSER" } 21 | authors = [ 22 | { email = "benjaminpachev@gmail.com" }, 23 | { name = "Benjamin Pachev" }, 24 | ] 25 | dependencies = [ 26 | "numpy>=1.21", 27 | "cffi", 28 | "petsc4py", 29 | "mpi4py", 30 | "fenics-basix>=0.9.0,<0.10.0", 31 | "fenics-dolfinx>=0.9.0,<0.10.0", 32 | "fenics-ffcx>=0.9.0,<0.10.0", 33 | "fenics-ufl>=2024.2.0,<2024.3.0", 34 | ] 35 | 36 | [project.optional-dependencies] 37 | docs = ["markdown", "pyyaml", "sphinx", "sphinx_rtd_theme"] 38 | lint = ["ruff"] 39 | optional = ["numba"] 40 | test = ["pytest", "sympy", "scipy", "matplotlib", "fenics-dolfinx[optional]"] 41 | ci = [ 42 | "mypy", 43 | "pytest-xdist", 44 | "types-setuptools", 45 | "fenics-dolfinx[build]", 46 | "fenics-dolfinx[docs]", 47 | "fenics-dolfinx[lint]", 48 | "fenics-dolfinx[optional]", 49 | "fenics-dolfinx[test]", 50 | ] 51 | 52 | [tool.scikit-build] 53 | wheel.packages = ["cudolfinx"] 54 | sdist.exclude = ["*.cpp"] 55 | cmake.build-type = "Release" 56 | wheel.license-files = ["../COPYING*"] 57 | 58 | [tool.pytest] 59 | junit_family = "xunit2" 60 | 61 | [tool.pytest.ini_options] 62 | markers = ["skip_in_parallel: marks tests that should be run in serial only."] 63 | 64 | [tool.mypy] 65 | # Suggested at https://blog.wolt.com/engineering/2021/09/30/professional-grade-mypy-configuration/ 66 | # Goal would be to make all of the below True long-term 67 | disallow_untyped_defs = false 68 | disallow_any_unimported = false 69 | no_implicit_optional = false 70 | check_untyped_defs = false 71 | warn_return_any = false 72 | warn_unused_ignores = false 73 | show_error_codes = true 74 | ignore_missing_imports = true 75 | 76 | 77 | [tool.ruff] 78 | line-length = 100 79 | indent-width = 4 80 | 81 | [tool.ruff.lint] 82 | select = [ 83 | "E", # pycodestyle 84 | "W", # pycodestyle 85 | "F", # pyflakes 86 | "I", # isort - use standalone isort 87 | "RUF", # Ruff-specific rules 88 | "UP", # pyupgrade 89 | "ICN", # flake8-import-conventions 90 | "NPY", # numpy-specific rules 91 | "FLY", # use f-string not static joins 92 | ] 93 | ignore = ["UP007", "RUF012"] 94 | allowed-confusables = ["σ"] 95 | 96 | [tool.ruff.lint.isort] 97 | known-first-party = ["basix", "dolfinx", "ffcx", "ufl", "cudolfinx"] 98 | known-third-party = ["gmsh", "numba", "numpy", "pytest", "pyvista"] 99 | section-order = [ 100 | "future", 101 | "standard-library", 102 | "mpi", 103 | "third-party", 104 | "first-party", 105 | "local-folder", 106 | ] 107 | 108 | [tool.ruff.lint.isort.sections] 109 | "mpi" = ["mpi4py", "petsc4py"] 110 | -------------------------------------------------------------------------------- /python/test/test_cuda_assembly.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Benjamin Pachev 2 | # 3 | # This file is part of cuDOLFINX 4 | # 5 | # SPDX-License-Identifier: 
LGPL-3.0-or-later 6 | 7 | import petsc4py 8 | from petsc4py import PETSc 9 | from mpi4py import MPI 10 | from dolfinx import fem as fe, mesh 11 | from dolfinx.fem import petsc 12 | import ufl 13 | import numpy as np 14 | import cudolfinx as cufem 15 | from cudolfinx.form import BlockCUDAForm 16 | from basix.ufl import element, mixed_element 17 | 18 | """ 19 | @author Benjamin Pachev 20 | @copyright 2024 21 | 22 | A set of simple variational forms to test the correctness of CUDA-accelerated assembly. 23 | """ 24 | 25 | 26 | def make_mixed_form(): 27 | """Test compilation of a mixed form. 28 | """ 29 | 30 | domain = mesh.create_unit_square(MPI.COMM_WORLD, 10, 10, mesh.CellType.triangle) 31 | el = element("P", domain.basix_cell(), 1) 32 | 33 | V = fe.functionspace(domain, el) 34 | u = ufl.TrialFunction(V) 35 | p = ufl.TestFunction(V) 36 | A = ufl.dot(ufl.grad(u), ufl.grad(p)) * ufl.dx 37 | F = fe.form(A) 38 | mat = fe.assemble_matrix(F) 39 | 40 | def make_test_domain(): 41 | """Make a test domain 42 | """ 43 | 44 | n = 19 45 | m = 27 46 | return mesh.create_unit_square(MPI.COMM_WORLD, n, m, mesh.CellType.triangle) 47 | 48 | def make_ufl(domain=None): 49 | """Create the UFL needed for making the forms 50 | """ 51 | 52 | if domain is None: 53 | domain = make_test_domain() 54 | 55 | V = fe.functionspace(domain, ("P", 1)) 56 | V_dg = fe.functionspace(domain, ("DG", 1)) 57 | u = fe.Function(V) 58 | p = ufl.TestFunction(V) 59 | p_dg = ufl.TestFunction(V_dg) 60 | n = ufl.FacetNormal(domain) 61 | u.interpolate(lambda x: x[0]**2 + x[1]) 62 | u_dg = fe.Function(V_dg) 63 | u_dg.interpolate(lambda x: x[0]**2 + x[1]) 64 | kappa = fe.Function(V) 65 | kappa.interpolate(lambda x: np.sin(x[0])*np.cos(x[1])) 66 | 67 | cell_residual = (ufl.exp(u)*p*kappa + ufl.dot(ufl.grad(u), ufl.grad(p))) * ufl.dx 68 | exterior_facet_residual = u*kappa*p * ufl.dot(ufl.grad(u), n) * ufl.ds 69 | interior_facet_residual = ufl.avg(p_dg) * ufl.avg(kappa) * ufl.avg(u_dg**2) * ufl.dS 70 | 71 | cell_jac = ufl.derivative(cell_residual, u) 72 | exterior_jac = ufl.derivative(exterior_facet_residual, u) 73 | interior_jac = ufl.derivative(interior_facet_residual, u_dg) 74 | 75 | f = fe.Function(V) 76 | f.interpolate(lambda x: x[0] +x[1]) 77 | dofs = fe.locate_dofs_geometrical(V, lambda x: np.isclose(x[0], 0)) 78 | bc = fe.dirichletbc(f, dofs) 79 | 80 | return { 81 | "coeff": kappa, 82 | "bcs": [bc], 83 | "vector": [cell_residual, exterior_facet_residual, interior_facet_residual], 84 | "matrix": [cell_jac, exterior_jac, interior_jac]} 85 | 86 | def test_assembly(): 87 | """Test correctness of assembly 88 | """ 89 | 90 | ufl_forms = make_ufl() 91 | 92 | for i, form in enumerate(ufl_forms["vector"]): 93 | fenics_form = fe.form(form) 94 | vec = petsc.create_vector(fenics_form) 95 | petsc.assemble_vector(vec, fenics_form) 96 | 97 | for i, form in enumerate(ufl_forms["matrix"]): 98 | fenics_form = fe.form(form) 99 | mat = petsc.create_matrix(fenics_form) 100 | mat.zeroEntries() 101 | petsc.assemble_matrix(mat, fenics_form) 102 | mat.assemble() 103 | 104 | def compare_mats(matcsr, matpetsc): 105 | """Compare a native FEniCS MatrixCSR to a PETSc matrix 106 | """ 107 | 108 | indptr, indices, data = matpetsc.getValuesCSR() 109 | bad = np.where(~np.isclose(matcsr.data, data))[0] 110 | assert np.allclose(matcsr.data, data) 111 | 112 | def compare_vecs(vecfenics, vecpetsc): 113 | assert np.allclose(vecfenics.array, vecpetsc.array) 114 | 115 | def test_cuda_assembly(): 116 | """Check assembly on GPU 117 | """ 118 | 119 | 120 | ufl_forms = make_ufl() 
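# Assemble each residual and Jacobian with both the host (dolfinx) and device (cudolfinx) code paths, then compare the results entrywise below.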
121 | asm = cufem.CUDAAssembler() 122 | 123 | for i, form in enumerate(ufl_forms['vector']): 124 | if i == 0: continue 125 | f = fe.form(form) 126 | vec1 = fe.assemble_vector(f) 127 | vec2 = asm.assemble_vector(cufem.form(form)) 128 | compare_vecs(vec1, vec2.vector) 129 | 130 | for i, form in enumerate(ufl_forms['matrix']): 131 | f = fe.form(form) 132 | Mat1 = fe.assemble_matrix(f, bcs=ufl_forms['bcs']) 133 | Mat2 = asm.assemble_matrix(cufem.form(form), bcs=ufl_forms['bcs']) 134 | Mat2.assemble() 135 | # now we need to compare the two 136 | compare_mats(Mat1, Mat2.mat) 137 | 138 | def test_reassembly(): 139 | """Ensure correct assembly when coefficients are updated 140 | """ 141 | 142 | ufl_forms = make_ufl() 143 | coeff = ufl_forms["coeff"] 144 | cuda_vec_form = cufem.form(ufl_forms["vector"][0]) 145 | vec_form = cuda_vec_form.dolfinx_form 146 | #mat_form = fe.form(ufl_forms["matrix"][0]) 147 | asm = cufem.CUDAAssembler() 148 | vec_cuda = asm.assemble_vector(cuda_vec_form) 149 | vec_fe = fe.assemble_vector(vec_form) 150 | compare_vecs(vec_fe, vec_cuda.vector) 151 | 152 | for d in [2,3]: 153 | coeff.interpolate(lambda x: x[0]**d + x[1]**d) 154 | vec_fe.array[:] = 0 155 | cuda_vec_form.to_device() 156 | fe.assemble_vector(vec_fe.array, vec_form) 157 | asm.assemble_vector(cuda_vec_form, vec_cuda) 158 | 159 | compare_vecs(vec_fe, vec_cuda.vector) 160 | 161 | def test_lifting(): 162 | """Ensure lifting and bc setting work correctly 163 | """ 164 | 165 | ufl_forms = make_ufl() 166 | asm = cufem.CUDAAssembler() 167 | for vec_form, mat_form in zip(ufl_forms['vector'][1:2], ufl_forms['matrix'][1:2]): 168 | L = fe.form(vec_form) 169 | vec_cuda = asm.assemble_vector(cufem.form(vec_form)) 170 | vec_fe = fe.assemble_vector(L) 171 | cuda_a = cufem.form(mat_form) 172 | a = cuda_a.dolfinx_form 173 | compare_vecs(vec_fe, vec_cuda.vector) 174 | fe.set_bc(vec_fe.array, ufl_forms['bcs']) 175 | asm.set_bc(vec_cuda, ufl_forms['bcs'], L.function_spaces[0]) 176 | compare_vecs(vec_fe, vec_cuda.vector) 177 | fe.apply_lifting(vec_fe.array, [a], [ufl_forms['bcs']]) 178 | asm.apply_lifting(vec_cuda, [cuda_a], [ufl_forms['bcs']]) 179 | compare_vecs(vec_fe, vec_cuda.vector) 180 | 181 | def test_block_assembly(): 182 | """Test that basic block assembly works properly.""" 183 | 184 | domain = make_test_domain() 185 | V1 = fe.functionspace(domain, ("P", 1)) 186 | V2 = fe.functionspace(domain, ("P", 1)) 187 | p1, p2 = ufl.TestFunction(V1), ufl.TestFunction(V2) 188 | 189 | u1, u2 = fe.Function(V1), fe.Function(V2) 190 | u1.interpolate(lambda x: x[0]**2 + x[1]**3) 191 | u2.interpolate(lambda x: 1 + x[0] + x[1]**2) 192 | b1 = ufl.dot(ufl.grad(u1), ufl.grad(p1)) * ufl.dx 193 | b2 = ufl.dot(ufl.grad(u2), ufl.grad(p2)) * ufl.dx 194 | 195 | asm = cufem.CUDAAssembler() 196 | cuda_L = cufem.form([b1,b2]) 197 | 198 | vec_cuda = asm.create_vector_block(cuda_L) 199 | asm.assemble_vector_block(cuda_L, vec_cuda) 200 | 201 | vec_fe = fe.petsc.create_vector_block(cuda_L.dolfinx_forms) 202 | # TODO - update this when switching to DOLFINx v0.10.0 203 | fe.petsc.assemble_vector_block(vec_fe, cuda_L.dolfinx_forms, [[None], [None]]) 204 | compare_vecs(vec_fe, vec_cuda.vector) 205 | 206 | -------------------------------------------------------------------------------- /python/test/test_multigpu_assembly.py: -------------------------------------------------------------------------------- 1 | from test_cuda_assembly import make_test_domain, make_ufl 2 | from mpi4py import MPI 3 | import cudolfinx as cufem 4 | from dolfinx import fem as fe 5 | 
from dolfinx.fem import petsc as fe_petsc 6 | import numpy as np 7 | from petsc4py import PETSc 8 | import json 9 | 10 | def compute_universal_dofmap(mesh, V, res=1000): 11 | """Map the global array of dofs to unique geometric information 12 | 13 | This is needed to compute maps between DG dofs on meshes with different partitioning schemes 14 | """ 15 | 16 | num_local_dofs = V.dofmap.index_map.size_local 17 | 18 | c_to_dofs = V.dofmap.map() 19 | dofs_to_cells = np.zeros(num_local_dofs, dtype=int) 20 | for i, cell in enumerate(c_to_dofs): 21 | for dof in cell: 22 | if dof >= num_local_dofs: continue 23 | dofs_to_cells[dof] = i 24 | dof_coords = V.tabulate_dof_coordinates()[:num_local_dofs] 25 | cell_coords = mesh.geometry.x[mesh.geometry.dofmap] 26 | dof_cell_coords = cell_coords[dofs_to_cells] 27 | dof_coords = mesh.comm.gather(dof_coords, root=0) 28 | dof_cell_coords = mesh.comm.gather(dof_cell_coords, root=0) 29 | if mesh.comm.rank == 0: 30 | dof_coords = (res*np.concatenate(dof_coords)).astype(int) 31 | dof_cell_coords = (res*np.concatenate(dof_cell_coords)).astype(int) 32 | i = 0 33 | keys_to_dofs = {} 34 | keys = [] 35 | for d_coords, d_cell_coords in zip(dof_coords, dof_cell_coords): 36 | k = (tuple(d_coords.tolist()), tuple(sorted([tuple(arr.tolist()) for arr in d_cell_coords]))) 37 | keys_to_dofs[k] = i 38 | keys.append(k) 39 | i += 1 40 | 41 | return keys, keys_to_dofs 42 | 43 | def compare_parallel_matrices(mat1, mat2): 44 | """Compare two distributed PETSc matrices 45 | """ 46 | 47 | _, _, data1 = mat1.getValuesCSR() 48 | _, _, data2 = mat2.getValuesCSR() 49 | sum1 = MPI.COMM_WORLD.gather(data1.sum(), root=0) 50 | sum2 = MPI.COMM_WORLD.gather(data2.sum(), root=0) 51 | if MPI.COMM_WORLD.rank == 0: 52 | sum1, sum2 = sum(sum1), sum(sum2) 53 | print(sum1, sum2, np.allclose(sum1, sum2)) 54 | return np.allclose(sum1, sum2) 55 | 56 | def compare_parallel_vectors(vec1, vec2): 57 | """Compare two distributed PETSc vectors 58 | """ 59 | 60 | sum1 = MPI.COMM_WORLD.gather(vec1.array[:].sum(), root=0) 61 | sum2 = MPI.COMM_WORLD.gather(vec2.array[:].sum(), root=0) 62 | if MPI.COMM_WORLD.rank == 0: 63 | sum1, sum2 = sum(sum1), sum(sum2) 64 | print(sum1, sum2, np.allclose(sum1, sum2)) 65 | return np.allclose(sum1, sum2) 66 | 67 | def test_multigpu_assembly(): 68 | """Check assembly operations across multiple GPUs 69 | """ 70 | 71 | domain = make_test_domain() 72 | regular_ufl = make_ufl() 73 | ghosted_domain = cufem.ghost_layer_mesh(domain) 74 | ghosted_ufl = make_ufl(ghosted_domain) 75 | asm = cufem.CUDAAssembler() 76 | for form1, form2 in zip(regular_ufl['matrix'], ghosted_ufl['matrix']): 77 | form1 = fe.form(form1) 78 | form2 = cufem.form(form2) 79 | regular_mat = fe_petsc.create_matrix(form1) 80 | regular_mat.zeroEntries() 81 | fe_petsc.assemble_matrix(regular_mat, form1, bcs=regular_ufl['bcs']) 82 | regular_mat.assemble() 83 | cuda_mat = asm.assemble_matrix(form2, bcs=ghosted_ufl['bcs']) 84 | cuda_mat.assemble() 85 | compare_parallel_matrices(regular_mat, cuda_mat.mat) 86 | 87 | for form1, form2 in zip(regular_ufl['vector'], ghosted_ufl['vector']): 88 | form1 = fe.form(form1) 89 | form2 = cufem.form(form2) 90 | regular_vec = fe_petsc.create_vector(form1) 91 | with regular_vec.localForm() as loc: 92 | loc.set(0) 93 | fe_petsc.assemble_vector(regular_vec, form1) 94 | regular_vec.ghostUpdate(addv=PETSc.InsertMode.ADD, mode=PETSc.ScatterMode.REVERSE) 95 | cuda_vec = asm.assemble_vector(form2) 96 | good = compare_parallel_vectors(regular_vec, cuda_vec.vector) 97 | 98 | if __name__ == "__main__": 99 | 100
| test_multigpu_assembly() 101 | -------------------------------------------------------------------------------- /spack/packages/cuda-dolfinx/package.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2024 Lawrence Livermore National Security, LLC and other 2 | # Spack Project Developers. See the top-level COPYRIGHT file for details. 3 | # 4 | # SPDX-License-Identifier: (Apache-2.0 OR MIT) 5 | 6 | from spack.package import * 7 | 8 | 9 | class CudaDolfinx(CMakePackage): 10 | """CUDA accelerated extension of DOLFINx from the FEniCS project.""" 11 | 12 | homepage = "https://github.com/bpachev/cuda-dolfinx" 13 | git = "https://github.com/bpachev/cuda-dolfinx.git" 14 | url = "https://github.com/bpachev/cuda-dolfinx/archive/refs/tags/v0.9.0.zip" 15 | 16 | maintainers("bpachev") 17 | license("LGPL-3.0-or-later", checked_by="bpachev") 18 | 19 | version("main", branch="main") 20 | version("0.9.0", sha256="5c93155e58eee139985e9e9341cf7d8b29f8c9cbc51ccdf05134cdfb70ae105d") 21 | 22 | depends_on("cxx", type="build") 23 | 24 | depends_on("fenics-dolfinx@0.9", when="@0.9:") 25 | depends_on("py-fenics-dolfinx@0.9", when="@0.9:") 26 | depends_on("petsc+shared+mpi+cuda") 27 | 28 | root_cmakelists_dir = "cpp" 29 | 30 | def cmake_args(self): 31 | return [self.define("CUDOLFINX_SKIP_BUILD_TESTS", True)] 32 | -------------------------------------------------------------------------------- /spack/packages/py-cuda-dolfinx/package.py: -------------------------------------------------------------------------------- 1 | # Copyright 2013-2024 Lawrence Livermore National Security, LLC and other 2 | # Spack Project Developers. See the top-level COPYRIGHT file for details. 3 | # 4 | # SPDX-License-Identifier: (Apache-2.0 OR MIT) 5 | 6 | from spack.package import * 7 | 8 | 9 | class PyCudaDolfinx(PythonPackage): 10 | """Python interface for CUDA acceleration of DOLFINx in the FEniCS project.""" 11 | 12 | homepage = "https://github.com/bpachev/cuda-dolfinx" 13 | url = "https://github.com/bpachev/cuda-dolfinx/archive/refs/tags/v0.9.0.zip" 14 | git = "https://github.com/bpachev/cuda-dolfinx.git" 15 | 16 | maintainers("bpachev") 17 | 18 | license("LGPL-3.0-or-later") 19 | version("main", branch="main") 20 | version("0.9.0", sha256="5c93155e58eee139985e9e9341cf7d8b29f8c9cbc51ccdf05134cdfb70ae105d") 21 | 22 | depends_on("cxx", type="build") 23 | depends_on("cmake@3.21:", when="@0.9:", type="build") 24 | depends_on("cuda-dolfinx@main", when="@main") 25 | depends_on("cuda-dolfinx@0.9.0", when="@0.9.0") 26 | depends_on("pkgconfig", type="build") 27 | depends_on("py-nanobind@2:", when="@0.9:", type="build") 28 | depends_on("py-scikit-build-core+pyproject@0.5:", when="@0.9:", type="build") 29 | 30 | build_directory = "python" 31 | 32 | -------------------------------------------------------------------------------- /spack/repo.yaml: -------------------------------------------------------------------------------- 1 | repo: 2 | namespace: 'cudolfinx' 3 | --------------------------------------------------------------------------------