├── compilers.yaml
├── forms
│   ├── Mass.ufl
│   ├── Laplacian.ufl
│   ├── N1curl.ufl
│   └── Elasticity.ufl
├── ffcx
│   ├── CMakeLists.txt
│   ├── geometry.hpp
│   ├── main.cpp
│   └── compile.py
├── .gitignore
├── installation.md
├── info
│   ├── count_flops.py
│   └── kernel_info.py
├── .github
│   └── workflows
│       └── python-app.yml
├── compilers-a64fx.yaml
├── utils.py
├── README.md
└── run.py

--------------------------------------------------------------------------------
/compilers.yaml:
--------------------------------------------------------------------------------
gcc-10:
  version:
    - 10
  cpp:
    - g++-10
  cc:
    - gcc-10
  flags:
    - -Ofast -march=native -mprefer-vector-width=256

--------------------------------------------------------------------------------
/forms/Mass.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = FiniteElement("Lagrange", $cell, $degree)
mesh = Mesh(VectorElement("Lagrange", $cell, 1))

V = FunctionSpace(mesh, element)
u = TrialFunction(V)
v = TestFunction(V)

a = inner(u, v)*dx

un = Coefficient(V)
L = action(a, un)

--------------------------------------------------------------------------------
/ffcx/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.12)

set(PROJECT_NAME benchmark)
project(${PROJECT_NAME})

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall")

add_executable(${PROJECT_NAME} main.cpp)

find_package(MPI 3 REQUIRED)
target_link_libraries(${PROJECT_NAME} PUBLIC MPI::MPI_CXX)

message(STATUS "${CMAKE_CXX_FLAGS}")

--------------------------------------------------------------------------------
/forms/Laplacian.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = FiniteElement("Lagrange", $cell, $degree)
mesh = Mesh(VectorElement("Lagrange", $cell, 1))

V = FunctionSpace(mesh, element)
u = TrialFunction(V)
v = TestFunction(V)

W = FunctionSpace(mesh, FiniteElement("Lagrange", $cell, 1))
k = Coefficient(W)

a = k*inner(grad(u), grad(v))*dx

un = Coefficient(V)
L = action(a, un)

--------------------------------------------------------------------------------
/forms/N1curl.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = FiniteElement("N1curl", tetrahedron, $degree)
coord_element = VectorElement("Lagrange", tetrahedron, 1)
mesh = Mesh(coord_element)

V = FunctionSpace(mesh, element)
W = FunctionSpace(mesh, FiniteElement("Lagrange", tetrahedron, 1))

u = TrialFunction(V)
v = TestFunction(V)
k = Coefficient(W)
a = k*inner(curl(u), curl(v))*dx

un = Coefficient(V)
L = action(a, un)

--------------------------------------------------------------------------------
/forms/Elasticity.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = VectorElement("Lagrange", $cell, $degree)
mesh = Mesh(VectorElement("Lagrange", $cell, 1))

V = FunctionSpace(mesh, element)

u = TrialFunction(V)
v = TestFunction(V)

W = FunctionSpace(mesh, FiniteElement("Lagrange", $cell, 1))
k = Coefficient(W)

def eps(v):
    return sym(grad(v))

a = k*inner(eps(u), eps(v))*dx

un = Coefficient(V)
L = action(a, un)
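Note that the `$cell` and `$degree` tokens in the `.ufl` files above are `string.Template` placeholders, not UFL; `utils.run` and `info/kernel_info.py` substitute them before the form is handed to FFCx. A minimal sketch of that substitution step (degree 2 on hexahedra is just an example choice; `vdegree` is supplied even though these forms do not use it, mirroring `utils.run`):

```python
from string import Template

# Read the templated form and substitute concrete values for the
# $cell/$degree placeholders, exactly as utils.run does.
with open("forms/Mass.ufl") as f:
    src = Template(f.read())

concrete = src.substitute({"degree": "2", "vdegree": "3", "cell": "hexahedron"})

# The substituted source is plain UFL/Python and is written out as the
# problem module that ffcx/compile.py later imports.
with open("ffcx/problem.py", "w") as f:
    f.write(concrete)
```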
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Prerequisites
*.d

# C files
*.c
*.h

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

build*/
.vscode
*.png
*.txt
*/__pycache__/*
tsfc_kernel.cpp
__pycache__/*


# generated code
problem.hpp
problem.py
problem.ufl
data.hpp
output/*
problem*.py

--------------------------------------------------------------------------------
/ffcx/geometry.hpp:
--------------------------------------------------------------------------------
#include <array>
#include <vector>

template <typename T>
std::vector<T> create_geometry(int num_batches, int batch_size, int geom_size)
{
  // Reference hexahedron vertex coordinates, reused for every cell.
  std::array<double, 24> coords = {0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0,
                                   1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0};
  std::vector<T> geometry(num_batches * geom_size);
#ifdef USE_VECTOR_EXTENSIONS
  // Batched geometry type: broadcast each coordinate to every lane of the batch.
  for (int c = 0; c < num_batches; c++)
    for (int i = 0; i < geom_size; i++)
      for (int j = 0; j < batch_size; j++)
        geometry[c * geom_size + i][j] = coords[i];
#else
  for (int c = 0; c < num_batches; c++)
    for (int i = 0; i < geom_size; i++)
      geometry[c * geom_size + i] = coords[i];
#endif
  return geometry;
}

--------------------------------------------------------------------------------
/installation.md:
--------------------------------------------------------------------------------
# Installation

## Installing ffcx and dependencies

Requires BLAS and NumPy:

```bash
python3 -m venv env/ffcx
source env/ffcx/bin/activate

python3 -m pip install git+https://github.com/FEniCS/ufl.git
python3 -m pip install git+https://github.com/FEniCS/basix.git
python3 -m pip install git+https://github.com/FEniCS/ffcx.git
python3 -m pip install pyyaml
```

## Installing ffcx and dependencies with spack

```bash
spack env create ffcx
spack env activate ffcx
spack add mpich py-fenics-ffcx@main
spack install

spack load python
python3 -m ensurepip
python3 -m pip install pyyaml
```

## Roofline with Intel Advisor

### Intel Advisor commands

```bash
advisor --collect=survey --project-dir=./advi --search-dir src:r=. -- ./build/benchmark
advisor -collect tripcounts -flop -stacks --project-dir=./advi --search-dir src:r=. -- ./build/benchmark
advisor --report=roofline --with-stack --project-dir=./advi --report-output=./advi/out/roofline.html
```

### Intel Advisor with MPI

```bash
mpirun -gtool "advisor --collect=survey --project-dir=./advi_results" -n 6 ./build/benchmark
mpirun -gtool "advisor -collect tripcounts -flop -stacks --search-dir src:r=. --project-dir=./advi:1-6" -n 6 ./build/benchmark
advisor --report=roofline --with-stack --project-dir=./advi --report-output=./advi/out/roofline.html
```
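As a quick smoke test of an installation, the helpers in `utils.py` (shown further below) can be driven directly from Python instead of going through `run.py`. A minimal sketch, assuming the default `compilers.yaml` from the repository root (so `gcc-10` must be on the `PATH`), a working `mpirun`, and a small problem size so it finishes quickly:

```python
import utils

# Parse the compiler configuration and run one small Mass benchmark with it.
compilers = utils.parse_compiler_configuration("compilers.yaml")
compiler = compilers["gcc-10"]
utils.set_compiler(compiler)  # exports CC/CXX for the CMake build

# Flags are quoted the same way run.py quotes them before passing to CMake.
flag = '"' + compiler["flags"][0] + '"'
results = utils.run("Mass", degree=1, nrepeats=1, flag=flag, action=True,
                    scalar_type="double", global_size=100000, batch_size=None,
                    mpi_size=1, cell_type="tetrahedron")
print(results)  # one "num_cells, time" string per repeat
```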
--------------------------------------------------------------------------------
/info/count_flops.py:
--------------------------------------------------------------------------------
from typing import Optional

import numpy
import ufl

import ffcx.codegeneration.C.cnodes
import ffcx.options
from ffcx.analysis import analyze_ufl_objects
from ffcx.codegeneration.backend import FFCXBackend
from ffcx.codegeneration.integrals import IntegralGenerator
from ffcx.ir.representation import compute_ir


def count_flops(form: ufl.Form, options: Optional[dict] = None):
    """Return the flop count and the temporary-array size (in entries) of the first kernel of the form."""
    options = ffcx.options.get_options(options)
    assert isinstance(form, ufl.Form)
    analysis = analyze_ufl_objects([form], options)
    ir = compute_ir(analysis, {}, "flops", options, False)

    flops = []
    _bytes = []

    for integral_ir in ir.integrals:
        # Create FFCx C backend
        backend = FFCXBackend(integral_ir, options)
        # Configure kernel generator
        ig = IntegralGenerator(integral_ir, backend)
        # Generate code ast for the tabulate_tensor body
        ast = ig.generate()
        _sum = 0

        # Accumulate the sizes of all temporary arrays declared in the kernel
        # body, including those declared one scope down.
        for statement in ast.statements:
            if isinstance(statement, ffcx.codegeneration.C.cnodes.ArrayDecl):
                _sum += numpy.prod(statement.sizes)
            if isinstance(statement, ffcx.codegeneration.C.cnodes.Scope):
                for sub_statement in statement.body.statements:
                    if isinstance(sub_statement, ffcx.codegeneration.C.cnodes.ArrayDecl):
                        _sum += numpy.prod(sub_statement.sizes)
        flops.append(ast.flops())
        _bytes.append(_sum)

    return flops[0], _bytes[0]
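`count_flops` operates on a plain UFL form, so it can be used outside of `info/kernel_info.py` as well. A minimal sketch, assuming a substituted `problem2.py` module has already been written next to this script in the way `kernel_info.py` does it:

```python
from importlib import import_module

from count_flops import count_flops

# problem2.py is a degree-2 copy of one of the templates in forms/,
# produced by the string.Template substitution shown earlier.
module = import_module("problem2")

# First value: flop count of the generated kernel body.
# Second value: total size (in array entries) of its temporary arrays;
# kernel_info.py later multiplies this by 8 to get bytes.
flops, tmp_entries = count_flops(module.L)
print(f"kernel flops: {flops}, temporary array entries: {tmp_entries}")
```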
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest
    env:
      CC: gcc-10
      CXX: g++-10

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.9
      uses: actions/setup-python@v3
      with:
        python-version: "3.9"
    - name: Install dependencies
      run: |
        sudo apt-get install -y libopenblas-dev liblapack-dev graphviz libgraphviz-dev ninja-build
        sudo apt-get -y install mpich
        python -m pip install --upgrade pip
        pip install flake8 pytest pyyaml scikit-build
        python3 -m pip install git+https://github.com/FEniCS/ufl.git
        python3 -m pip install git+https://github.com/FEniCS/basix.git
        python3 -m pip install git+https://github.com/FEniCS/ffcx.git
        python3 -m pip install pyyaml
    - name: Run local assembly
      run: |
        python run.py --form_compiler=ffcx --scalar_type=double --action --global_size=10000000 --degree 5 --problem=Mass --nrepeats 5 --cell_type=hexahedron
        python run.py --form_compiler=ffcx --scalar_type=double --action --global_size=10000000 --degree 5 --problem=Mass --nrepeats 5 --cell_type=tetrahedron
        cat output/Mass.txt

--------------------------------------------------------------------------------
/compilers-a64fx.yaml:
--------------------------------------------------------------------------------

# gcc-11-202103:
#   version:
#     - 11.0.1
#   cpp:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11-20210321/bin/g++
#   cc:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11-20210321/bin/gcc
#   flags:
#     - -Ofast -march=native -msve-vector-bits=2048
#     - -Ofast -march=native -msve-vector-bits=512
#     - -Ofast -march=native -msve-vector-bits=128
#     - -Ofast -march=native -fno-tree-vectorize

# gcc-11:
#   version:
#     - 11.1.0
#   cpp:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11.1.0/bin/g++
#   cc:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11.1.0/bin/gcc
#   flags:
#     - -Ofast -march=native -msve-vector-bits=2048
#     - -Ofast -march=native -msve-vector-bits=512
#     - -Ofast -march=native -msve-vector-bits=128
#     - -Ofast -march=native -fno-tree-vectorize
#     - -O3 -march=native

clang-11:
  version:
    - 11.1.0
  cpp:
    - /lustre/projects/bristol/modules-a64fx/llvm/11.0/bin/clang++
  cc:
    - /lustre/projects/bristol/modules-a64fx/llvm/11.0/bin/clang
  flags:
    - -Ofast -mprefer-vector-width=2048
    - -Ofast -fno-slp-vectorize

# cray:
#   version:
#     - 2.7.0
#   cpp:
#     - /opt/cray/pe/craype/2.7.0/bin/CC
#   cc:
#     - /opt/cray/pe/craype/2.7.0/bin/cc
#   flags:
#     - -O3 -hvector3 -hfp4
#     - -O3
#     - -O2

# gcc-8:
#   version:
#     - 8.1.0
#   cpp:
#     - /opt/gcc/8.1.0/bin/g++
#   cc:
#     - /opt/gcc/8.1.0/bin/gcc
#   flags:
#     - -Ofast -march=native -msve-vector-bits=2048
#     - -Ofast -march=native -fno-tree-vectorize
#     - -O3 -march=native
--------------------------------------------------------------------------------
/ffcx/main.cpp:
--------------------------------------------------------------------------------
#include "problem.hpp"
#include "geometry.hpp"
#include <mpi.h>
#include <algorithm>
#include <array>
#include <functional>
#include <iostream>
#include <iterator>
#include <vector>

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  {
    MPI_Comm comm = MPI_COMM_WORLD;
    int mpi_rank;
    MPI_Comm_rank(comm, &mpi_rank);

    // Const data from kernel
    constexpr int num_dofs = dim;
    constexpr int local_size = kernel_rank == 1 ? dim : dim * dim;
    constexpr int stride = num_dofs * num_coefficients;
    constexpr int num_cells = global_size / num_dofs;
    constexpr int num_batches = num_cells / batch_size;
    constexpr int geom_size = num_nodes * 3;

    // Allocate and initialize data
    std::vector<scalar_type> A(num_batches * local_size);

    // Constants for cross element vectorization
    scalar_type one = {1.};
    scalar_type zero = {0.};

    // Create geometry and coefficients
    std::vector<geom_type> geometry = create_geometry<geom_type>(num_batches, batch_size, geom_size);
    std::vector<scalar_type> coefficients(num_batches * stride);
    auto set_ = [one](auto &e)
    { e = one; };
    std::for_each(coefficients.begin(), coefficients.end(), set_);

    std::array<scalar_type, local_size> Ae;

    double start = MPI_Wtime();
    for (int batch = 0; batch < num_batches; batch++)
    {
      std::fill(Ae.begin(), Ae.end(), zero);
      scalar_type *coeffs = coefficients.data() + batch * stride;
      geom_type *geo = geometry.data() + batch * geom_size;
      kernel(Ae.data(), coeffs, nullptr, geo, 0, 0);
      std::vector<scalar_type>::iterator result = std::next(A.begin(), batch * local_size);
      std::transform(Ae.begin(), Ae.end(), result, result, std::plus<scalar_type>());
    }
    double end = MPI_Wtime();
    double local_time = end - start;

    double max_time = 0;
    MPI_Allreduce(&local_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, comm);

    if (mpi_rank == 0)
      std::cout << num_cells << ", " << max_time;
  }
  MPI_Finalize();

  return 0;
}
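When a batch size is requested, the generated `problem.hpp` (see `ffcx/compile.py` below) defines `scalar_type` and `geom_type` as GCC/Clang vector-extension types, so each call to `kernel` above assembles `batch_size` cells in lock-step. A standalone sketch of how such a type behaves, assuming the GCC branch of the generated typedef; the values are purely illustrative:

```cpp
#include <cstdio>

// 4-wide batch of doubles with element-wise arithmetic, mirroring the
// typedef that compile.py emits for GCC when batch_size = 4.
typedef double double4 __attribute__((vector_size(4 * sizeof(double))));

int main()
{
  double4 a = {1.0, 2.0, 3.0, 4.0};   // one value per cell in the batch
  double4 b = {10.0, 10.0, 10.0, 10.0};
  double4 c = a * b + a;              // the same expression is applied to every lane
  for (int i = 0; i < 4; ++i)
    std::printf("cell %d: %f\n", i, c[i]);
  return 0;
}
```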
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import os
import platform
import sys
from string import Template
from subprocess import Popen, PIPE

import yaml

_build_cmd = ("cd {form_compiler} && rm -rf build && mkdir build && cd build "
              "&& cmake -DCMAKE_C_FLAGS={flag} -DCMAKE_CXX_FLAGS={flag} .. && make")


def set_compiler(compiler):
    os.environ["CXX"] = compiler["cpp"][0]
    os.environ["CC"] = compiler["cc"][0]

    try:
        with Popen([compiler["cpp"][0], "-dumpversion"], stdout=PIPE) as p:
            compiler_version = p.stdout.read().decode("ascii").strip()
    except Exception:
        compiler_version = compiler["version"][0]
    return compiler_version


def parse_compiler_configuration(file):
    # Read compiler configuration file
    with open(file, "r") as stream:
        try:
            compilers = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            raise
    return compilers


def machine_name():
    # Set architecture from platform
    try:
        with open("/sys/devices/cpu/caps/pmu_name", "r") as pmu:
            machine = pmu.readlines()[0].strip()
    except Exception:
        machine = platform.processor()
    return machine


def create_output(problem):
    header = "machine,problem,compiler,version,flags,degree,fcomp,scalar,batch_size,rank,cell_type,ncells,time"
    path = "output/"
    out_file = path + str(problem) + ".txt"

    if not os.path.exists(out_file):
        if not os.path.isdir(path):
            os.mkdir(path)
        with open(out_file, "a") as f:
            f.write(header)
    return out_file


def run(problem: str, degree: int, nrepeats: int, flag: str, action: bool,
        scalar_type: str, global_size: int, batch_size: int, mpi_size: int,
        cell_type: str):

    try:
        import ffcx
        import ffcx.codegeneration
    except ImportError:
        print("ffcx is not available")

    # Substitute the cell and degree placeholders in the templated form.
    with open("forms/" + problem + ".ufl", 'r') as f:
        src = Template(f.read())
        d = {'degree': str(degree), 'vdegree': str(degree + 1), "cell": cell_type}
        result = src.substitute(d)

    with open("ffcx/problem.py", "w") as f2:
        f2.writelines(result)

    sys.path.insert(1, 'ffcx/')
    from compile import generate_code
    generate_code(action, scalar_type, global_size, batch_size)

    run_cmd = f"mpirun -n {mpi_size} ./ffcx/build/benchmark"
    build = _build_cmd.format(form_compiler="ffcx", flag=flag)

    if os.system(build) != 0:
        raise RuntimeError("build failed")
    result = [Popen(run_cmd.split(), stdout=PIPE).stdout.read().decode("ascii").strip()
              for _ in range(nrepeats)]
    print(result)

    return result
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Local Finite Element Operator Benchmarks

Depends on:

- FFCx: The FEniCSx Form Compiler: <https://github.com/FEniCS/ffcx.git>
- For sum-factorization: <https://github.com/FEniCS/ffcx/tree/igor/tensor>

## Usage

```bash
python3 run.py --help
usage: run.py [-h] [--form_compiler {ffcx,ffc,tsfc}] [--scalar_type {double,float,_Float16,double _Complex,float _Complex}]
              [--problem {Laplacian,Mass,Elasticity,N1curl,Stokes}] [--conf CONF] [--degree DEGREE [DEGREE ...]] [--nrepeats NREPEATS]
              [--batch_size {None,1,2,4,8,16,32,64}] [--global_size GLOBAL_SIZE] [--action] [--mpi_size MPI_SIZE] [--cell_type {tetrahedron,hexahedron}]

Run local assembly benchmark.

optional arguments:
  -h, --help            show this help message and exit
  --form_compiler {ffcx,ffc,tsfc}
                        Form Compiler to use (default: ffcx)
  --scalar_type {double,float,_Float16,double _Complex,float _Complex}
                        Scalar type to use (default: double)
  --problem {Laplacian,Mass,Elasticity,N1curl,Stokes}
                        Problem to run (default: Laplacian)
  --conf CONF           Configuration file describing the compilers and flags. (default: compilers.yaml)
  --degree DEGREE [DEGREE ...]
                        Polynomial degree to evaluate the operators. (default: range(1, 4))
  --nrepeats NREPEATS   Number of times to run each experiment. (default: 3)
  --batch_size {None,1,2,4,8,16,32,64}
                        Cross-element vectorization batch size. (default: None)
  --global_size GLOBAL_SIZE
                        Global number of dofs (assuming shared dofs are duplicated). (default: 1000000)
  --action              Specify whether to run the problems with a matrix-free approach. (default: False)
  --mpi_size MPI_SIZE   The number of MPI processes to use. (default: 1)
  --cell_type {tetrahedron,hexahedron}
                        Cell type to use (default: tetrahedron)
```

## Compiler Configuration File

Example of compiler configuration file (compilers.yaml):

```yaml
gcc-11:
  version:
    - 11.1.0
  cpp:
    - /usr/bin/g++-11
  cc:
    - /usr/bin/gcc-11
  flags:
    - -Ofast -march=native -mprefer-vector-width=256

clang:
  version:
    - 12.0.1
  cpp:
    - /usr/bin/clang++
  cc:
    - /usr/bin/clang
  flags:
    - -Ofast -march=native -mprefer-vector-width=256
```

## Examples

### Matrix-free weighted Laplacian, degrees 1-8

```bash
python3 run.py --problem Laplacian --degree 1 2 3 4 5 6 7 8 --form_compiler=ffcx --action --global_size 10000000
```

### Single precision Mass operator on 76 cores

```bash
python3 run.py --problem Mass --scalar_type float --degree 1 2 3 4 5 6 7 8 --form_compiler=ffcx --action --mpi_size 76 --global_size 10000000
```

## Output data description

Results are written to `output/{Problem}.txt` as comma-separated rows:

```bash
table = [machine,problem,compiler,version,flags,degree,form_compiler,scalar_type,batch_size,form_rank,cell_type,num_cells,time]
```
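Since each result file is plain comma-separated text with the header above, post-processing needs nothing more than a CSV reader. A minimal sketch, assuming pandas is installed (it is not a dependency of the benchmark itself) and that `output/Mass.txt` exists from a previous run; column names follow the header written by `utils.create_output`:

```python
import pandas as pd

# Each row is one repeat of one (compiler, flags, degree) combination.
df = pd.read_csv("output/Mass.txt", skipinitialspace=True)

# Throughput in cells assembled per second, averaged over the repeats.
df["cells_per_s"] = df["ncells"] / df["time"]
print(df.groupby(["compiler", "flags", "degree"])["cells_per_s"].mean())
```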
--------------------------------------------------------------------------------
/info/kernel_info.py:
--------------------------------------------------------------------------------
from importlib import import_module
from string import Template

import numpy
from matplotlib import pyplot as plt

from count_flops import count_flops
from ffcx.element_interface import create_element

problem_ = "Laplacian"
degrees = numpy.arange(1, 16)
cell_type = "hexahedron"

# Substitute the cell/degree placeholders and write one problem module per degree.
for degree in degrees:
    with open("../forms/" + problem_ + ".ufl", 'r') as f:
        src = Template(f.read())
        d = {'degree': str(degree), 'vdegree': str(degree + 1), "cell": cell_type}
        result = src.substitute(d)

    with open(f"problem{degree}.py", "w") as f2:
        f2.writelines(result)


flops_per_dof = numpy.zeros_like(degrees)
bytes_per_dof = numpy.zeros_like(degrees, dtype=float)
cache_per_dof = numpy.zeros_like(degrees, dtype=float)
dofs_list = numpy.zeros_like(degrees)
for i, degree in enumerate(degrees):
    module = import_module(f"problem{degree}")
    form = module.L
    element = create_element(module.element)
    dofs = element.dim
    dofs_list[i] = dofs

    cell = element.cell()
    geometry_size = cell.num_vertices() * 3

    coeff_size = 0
    coefficients = form.coefficients()
    for coeff in coefficients:
        el = create_element(coeff.ufl_element())
        coeff_size += el.dim

    flops_per_dof[i], cache_per_dof[i] = count_flops(form)
    bytes_per_dof[i] = ((coeff_size + 2 * dofs + geometry_size) * 8) / dofs
    cache_per_dof[i] = (cache_per_dof[i] * 8) / dofs
    # flops_per_dof[i] = flops_per_dof[i]/dofs

print(repr(flops_per_dof))
print(cache_per_dof)

FLOPS = numpy.full_like(degrees, 1e100, dtype=float)
BW = numpy.full_like(degrees, 300e9, dtype=float)
BW1 = numpy.full_like(degrees, 1.5e4 * 1e9, dtype=float)
BW2 = numpy.full_like(degrees, 0.8e4 * 1e9, dtype=float)
BW3 = numpy.full_like(degrees, 1.5e3 * 1e9, dtype=float)


cache_rate = numpy.zeros_like(BW)
level = numpy.zeros_like(BW)

# Pick the cache-level bandwidth according to the estimated working-set size.
for i, num_dofs in enumerate(dofs_list):
    size = num_dofs * (cache_per_dof[i] + bytes_per_dof[i])
    ai_cache = numpy.divide(flops_per_dof[i], size / num_dofs)
    if size < 32e3:
        cache_rate[i] = ai_cache * BW1[i]
        level[i] = 1
    elif size < 1280e3:
        cache_rate[i] = ai_cache * BW2[i]
        level[i] = 2
    elif size < 1800e3:
        cache_rate[i] = ai_cache * BW3[i]
        level[i] = 3


max_throughput_cache = cache_rate / flops_per_dof
ai_std = flops_per_dof / bytes_per_dof
max_flops = numpy.minimum(FLOPS, ai_std * BW)
max_throughput = max_flops / flops_per_dof

markers = ["o", "v", "s", "X"]
plt.plot(degrees, max_throughput, label="standard model",
         marker=markers[1], linestyle='dashed')

max_throughput_cache = numpy.minimum(max_throughput, max_throughput_cache)

plt.plot(degrees, max_throughput_cache, label="cache-aware model",
         marker=markers[1], linestyle='dashed')


plt.ylabel("max throughput (dofs/s)")
plt.legend()
plt.xlabel(r"polynomial degree $P$")
plt.grid(True, which="both")
plt.yscale("log")
plt.show()


print(level)
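The bound plotted by `kernel_info.py` is the usual roofline estimate: the attainable flop rate is the minimum of the machine's peak flop rate and the arithmetic intensity times the memory bandwidth, and dividing by the work per dof converts it into dofs/s. A worked example with illustrative numbers (none of these values are measured; they only show the arithmetic):

```python
# Illustrative roofline bound for one polynomial degree.
peak_flops = 1.5e12      # peak flop rate of the machine (flop/s), illustrative
bandwidth = 300e9        # main-memory bandwidth (byte/s), as used for BW above
flops_per_dof = 2000.0   # kernel flops per dof, e.g. from count_flops
bytes_per_dof = 160.0    # coefficient + output + geometry traffic per dof

arithmetic_intensity = flops_per_dof / bytes_per_dof            # flop/byte
attainable_flops = min(peak_flops, arithmetic_intensity * bandwidth)
max_dofs_per_s = attainable_flops / flops_per_dof
print(f"AI = {arithmetic_intensity:.1f} flop/byte, bound = {max_dofs_per_s:.2e} dofs/s")
```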
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import argparse

import utils

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Run local assembly benchmark.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--form_compiler', dest='form_compiler', type=str,
                        default="ffcx", choices=['ffcx', 'ffc', 'tsfc'],
                        help="Form Compiler to use")

    parser.add_argument('--scalar_type', dest='scalar_type', type=str,
                        default="double", choices=['double', 'float', '_Float16', 'double _Complex', 'float _Complex'],
                        help="Scalar type to use")

    parser.add_argument('--problem', dest='problem', type=str,
                        default="Laplacian", choices=['Laplacian', 'Mass', 'Elasticity', 'N1curl', 'Stokes'],
                        help="Problem to run")

    parser.add_argument('--conf', dest='conf', type=str, default="compilers.yaml",
                        help="Configuration file describing the compilers and flags.")

    parser.add_argument('--degree', dest='degree', default=range(1, 4), nargs='+',
                        help='Polynomial degree to evaluate the operators.')

    parser.add_argument('--nrepeats', dest='nrepeats', type=int, default=3,
                        help='Number of times to run each experiment.')

    parser.add_argument('--batch_size', dest='batch_size', type=int, default=None,
                        choices=[None, 1, 2, 4, 8, 16, 32, 64],
                        help='Cross-element vectorization batch size.')

    parser.add_argument('--global_size', dest='global_size', type=int, default=1000000,
                        help='Global number of dofs (assuming shared dofs are duplicated).')

    parser.add_argument('--action', dest='action', action='store_true',
                        help='Specify whether to run the problems with a matrix-free approach.')

    parser.add_argument('--mpi_size', dest='mpi_size', type=int, default=1,
                        help='The number of MPI processes to use.')

    parser.add_argument('--cell_type', dest='cell_type', type=str,
                        default="tetrahedron", choices=['tetrahedron', 'hexahedron'],
                        help="Cell type to use")

    args = parser.parse_args()
    form_compiler = args.form_compiler
    problem = args.problem
    conf_file = args.conf
    degrees = [int(d) for d in args.degree]
    nrepeats = args.nrepeats
    action = args.action
    batch_size = args.batch_size
    global_size = args.global_size
    scalar_type = args.scalar_type
    mpi_size = args.mpi_size
    cell_type = args.cell_type

    machine = utils.machine_name()
    out_file = utils.create_output(problem)
    compilers = utils.parse_compiler_configuration(conf_file)

    # Set rank to 1 for matrix-free (action), 2 otherwise
    rank = 1 if action else 2

    for c_name in compilers:
        compiler = compilers[c_name]
        compiler_version = utils.set_compiler(compiler)
        flags = compiler["flags"]
        for flag in flags:
            # Quote the flag string so it survives being passed through the shell to CMake.
            flag = '"' + str(flag) + '"'
            for degree in degrees:
                # Column order must match the header written by utils.create_output.
                text = f"\n{machine}, {problem}, {c_name}, {compiler_version}, {flag}, {degree}, {form_compiler}, {scalar_type}, {batch_size}, {rank}, {cell_type}, "
                results = utils.run(problem, degree, nrepeats, flag, action,
                                    scalar_type, global_size, batch_size,
                                    mpi_size, cell_type)
                for result in results:
                    row = text + result
                    with open(out_file, "a") as file:
                        file.write(row)
--------------------------------------------------------------------------------
/ffcx/compile.py:
--------------------------------------------------------------------------------
import typing
from importlib import reload

import basix
import ufl

import problem
from ffcx.analysis import analyze_ufl_objects
from ffcx.codegeneration.backend import FFCXBackend
from ffcx.codegeneration.C.format_lines import format_indented_lines
from ffcx.codegeneration.integrals import IntegralGenerator
from ffcx.element_interface import create_element
from ffcx.ir.representation import compute_ir
from ffcx.options import get_options


_arguments = """({scalar_type}* restrict A,
            const {scalar_type}* restrict w,
            const {scalar_type}* restrict c,
            const {geom_type}* restrict coordinate_dofs,
            const int* restrict entity_local_index,
            const uint8_t* restrict quadrature_permutation)\n"""


_headers = """
#include <cmath>
#include <cstdint>
#define restrict __restrict__

constexpr int dim = {dim};
constexpr int global_size = {global_size};
constexpr int kernel_rank = {rank};
constexpr int num_nodes = {num_nodes};
constexpr int batch_size = {batch_size};
constexpr int num_coefficients = {num_coefficients};


using scalar_type = {scalar_type};
using geom_type = {geom_type};
using namespace std;
"""


_headers_batched = """
#include <cmath>
#include <cstdint>

#define restrict __restrict__
#define USE_VECTOR_EXTENSIONS


constexpr int dim = {dim};
constexpr int global_size = {global_size};
constexpr int kernel_rank = {rank};
constexpr int num_nodes = {num_nodes};
constexpr int num_coefficients = {num_coefficients};

#if defined(__clang__)
typedef {scalar_type} {scalar_type}{batch_size} __attribute__((ext_vector_type({batch_size})));
#elif defined(__GNUC__) || defined(__GNUG__)
typedef {scalar_type} {scalar_type}{batch_size} __attribute__((vector_size({batch_size} * sizeof({scalar_type}))));
#else
#error "Compiler not supported"
#endif

#if defined(__clang__)
typedef {geom_type} {geom_type}{batch_size} __attribute__((ext_vector_type({batch_size})));
#elif defined(__GNUC__) || defined(__GNUG__)
typedef {geom_type} {geom_type}{batch_size} __attribute__((vector_size({batch_size} * sizeof({geom_type}))));
#else
#error "Compiler not supported"
#endif

using scalar_type = {scalar_type}{batch_size};
using geom_type = {geom_type}{batch_size};

constexpr int batch_size = {batch_size};
using namespace std;
"""


def compute_integral_body(ir, backend):
    # Configure kernel generator
    ig = IntegralGenerator(ir, backend)
    # Generate code ast for the tabulate_tensor body
    parts = ig.generate()
    # Format code as string
    body = format_indented_lines(parts.cs_format(ir.precision), 1)
    return body


def compile_form(form: ufl.Form, name: str,
                 parameters: typing.Optional[typing.Dict] = None,
                 visualise: bool = False):

    if parameters is None:
        parameters = get_options()

    # Stage 1: analysis
    analysis = analyze_ufl_objects([form], parameters)

    # Stage 2: intermediate representation
    ir = compute_ir(analysis, {}, " ", parameters, visualise)

    if len(ir.integrals) > 1:
        raise RuntimeError(
            "This function is meant to compile one integral type at a time.")

    # Stage 3: code generation
    integral_ir = ir.integrals[0]
    backend = FFCXBackend(integral_ir, parameters)

    scalar_type = parameters["scalar_type"]
    geom_type = scalar_type.replace(' _Complex', '')
    batch_size = parameters["batch_size"]

    if batch_size and batch_size > 1:
        geom_type += str(batch_size)
        scalar_type += str(batch_size)

    settings = {"scalar_type": scalar_type, "geom_type": geom_type}
    arguments = _arguments.format(**settings)
    signature = "inline void " + name + arguments
    body = compute_integral_body(integral_ir, backend)
    code = signature + " {\n" + body + "\n}\n"

    return code


def generate_code(action, scalar_type, global_size, batch_size):
    reload(problem)

    batch_size = batch_size if batch_size else 1
    parameters = get_options()
    parameters["scalar_type"] = scalar_type
    parameters["batch_size"] = batch_size

    if action:
        code = compile_form(problem.L, "kernel", parameters)
        num_coefficients = analyze_ufl_objects(
            [problem.L], parameters).form_data[0].num_coefficients
        rank = 1
    else:
        code = compile_form(problem.a, "kernel", parameters)
        num_coefficients = analyze_ufl_objects(
            [problem.a], parameters).form_data[0].num_coefficients
        rank = 2

    element = create_element(problem.element)
    num_nodes = element.cell().num_vertices()
    geom_type = scalar_type.replace(' _Complex', '')

    if batch_size > 1:
        headers = _headers_batched.format(dim=element.dim, global_size=global_size,
                                          scalar_type=scalar_type, rank=rank, geom_type=geom_type,
                                          batch_size=batch_size, num_nodes=num_nodes,
                                          num_coefficients=num_coefficients)
    else:
        headers = _headers.format(dim=element.dim, global_size=global_size,
                                  scalar_type=scalar_type, rank=rank, geom_type=geom_type,
                                  batch_size=batch_size, num_nodes=num_nodes,
                                  num_coefficients=num_coefficients)

    with open("ffcx/problem.hpp", "w") as file:
        file.write(headers)
        file.write(code)
--------------------------------------------------------------------------------