├── compilers.yaml
├── forms
│   ├── Mass.ufl
│   ├── Laplacian.ufl
│   ├── N1curl.ufl
│   └── Elasticity.ufl
├── ffcx
│   ├── CMakeLists.txt
│   ├── geometry.hpp
│   ├── main.cpp
│   └── compile.py
├── .gitignore
├── installation.md
├── info
│   ├── count_flops.py
│   └── kernel_info.py
├── .github
│   └── workflows
│       └── python-app.yml
├── compilers-a64fx.yaml
├── utils.py
├── README.md
└── run.py

--------------------------------------------------------------------------------
/compilers.yaml:
--------------------------------------------------------------------------------
gcc-10:
  version:
    - 10
  cpp:
    - g++-10
  cc:
    - gcc-10
  flags:
    - -Ofast -march=native -mprefer-vector-width=256

--------------------------------------------------------------------------------
/forms/Mass.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = FiniteElement("Lagrange", $cell, $degree)
mesh = Mesh(VectorElement("Lagrange", $cell, 1))

V = FunctionSpace(mesh, element)
u = TrialFunction(V)
v = TestFunction(V)

a = inner(u, v)*dx

un = Coefficient(V)
L = action(a, un)

--------------------------------------------------------------------------------
/ffcx/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 3.12)

set(PROJECT_NAME benchmark)
project(${PROJECT_NAME})

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wall")

add_executable(${PROJECT_NAME} main.cpp)

find_package(MPI 3 REQUIRED)
target_link_libraries(${PROJECT_NAME} PUBLIC MPI::MPI_CXX)

message(STATUS "${CMAKE_CXX_FLAGS}")

--------------------------------------------------------------------------------
/forms/Laplacian.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = FiniteElement("Lagrange", $cell, $degree)
mesh = Mesh(VectorElement("Lagrange", $cell, 1))

V = FunctionSpace(mesh, element)
u = TrialFunction(V)
v = TestFunction(V)

W = FunctionSpace(mesh, FiniteElement("Lagrange", $cell, 1))
k = Coefficient(W)

a = k*inner(grad(u), grad(v))*dx

un = Coefficient(V)
L = action(a, un)

--------------------------------------------------------------------------------
/forms/N1curl.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = FiniteElement("N1curl", tetrahedron, $degree)
coord_element = VectorElement("Lagrange", tetrahedron, 1)
mesh = Mesh(coord_element)

V = FunctionSpace(mesh, element)
W = FunctionSpace(mesh, FiniteElement("Lagrange", tetrahedron, 1))

u = TrialFunction(V)
v = TestFunction(V)
k = Coefficient(W)
a = k*inner(curl(u), curl(v))*dx

un = Coefficient(V)
L = action(a, un)

--------------------------------------------------------------------------------
/forms/Elasticity.ufl:
--------------------------------------------------------------------------------
from ufl import *

element = VectorElement("Lagrange", $cell, $degree)
mesh = Mesh(VectorElement("Lagrange", $cell, 1))

V = FunctionSpace(mesh, element)

u = TrialFunction(V)
v = TestFunction(V)

W = FunctionSpace(mesh, FiniteElement("Lagrange", $cell, 1))
k = Coefficient(W)

def eps(v):
    return sym(grad(v))

a = k*inner(eps(u), eps(v))*dx

un = Coefficient(V)
L = action(a, un)
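Note that the `$cell` and `$degree` tokens in the `.ufl` files above are `string.Template` placeholders, not UFL; `utils.run` and `info/kernel_info.py` substitute them before the form is handed to FFCx. A minimal sketch of that substitution step (degree 2 on hexahedra is just an example choice; `vdegree` is supplied even though these forms do not use it, mirroring `utils.run`):

```python
from string import Template

# Read the templated form and substitute concrete values for the
# $cell/$degree placeholders, exactly as utils.run does.
with open("forms/Mass.ufl") as f:
    src = Template(f.read())

concrete = src.substitute({"degree": "2", "vdegree": "3", "cell": "hexahedron"})

# The substituted source is plain UFL/Python and is written out as the
# problem module that ffcx/compile.py later imports.
with open("ffcx/problem.py", "w") as f:
    f.write(concrete)
```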
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Prerequisites
*.d

# C files
*.c
*.h

# Compiled Object files
*.slo
*.lo
*.o
*.obj

# Precompiled Headers
*.gch
*.pch

# Compiled Dynamic libraries
*.so
*.dylib
*.dll

# Fortran module files
*.mod
*.smod

# Compiled Static libraries
*.lai
*.la
*.a
*.lib

# Executables
*.exe
*.out
*.app

build*/
.vscode
*.png
*.txt
*/__pycache__/*
tsfc_kernel.cpp
__pycache__/*


# generated code
problem.hpp
problem.py
problem.ufl
data.hpp
output/*
problem*.py

--------------------------------------------------------------------------------
/ffcx/geometry.hpp:
--------------------------------------------------------------------------------
#include <array>
#include <vector>

template <typename T>
std::vector<T> create_geometry(int num_batches, int batch_size, int geom_size)
{
  // Reference hexahedron vertex coordinates, reused for every cell.
  std::array<double, 24> coords = {0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 1.0,
                                   1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0};
  std::vector<T> geometry(num_batches * geom_size);
#ifdef USE_VECTOR_EXTENSIONS
  // Batched geometry type: broadcast each coordinate to every lane of the batch.
  for (int c = 0; c < num_batches; c++)
    for (int i = 0; i < geom_size; i++)
      for (int j = 0; j < batch_size; j++)
        geometry[c * geom_size + i][j] = coords[i];
#else
  for (int c = 0; c < num_batches; c++)
    for (int i = 0; i < geom_size; i++)
      geometry[c * geom_size + i] = coords[i];
#endif
  return geometry;
}

--------------------------------------------------------------------------------
/installation.md:
--------------------------------------------------------------------------------
# Installation

## Installing ffcx and dependencies

Requires BLAS and NumPy:

```bash
python3 -m venv env/ffcx
source env/ffcx/bin/activate

python3 -m pip install git+https://github.com/FEniCS/ufl.git
python3 -m pip install git+https://github.com/FEniCS/basix.git
python3 -m pip install git+https://github.com/FEniCS/ffcx.git
python3 -m pip install pyyaml
```

## Installing ffcx and dependencies with spack

```bash
spack env create ffcx
spack env activate ffcx
spack add mpich py-fenics-ffcx@main
spack install

spack load python
python3 -m ensurepip
python3 -m pip install pyyaml
```

## Roofline with Intel Advisor

### Intel Advisor commands

```bash
advisor --collect=survey --project-dir=./advi --search-dir src:r=. -- ./build/benchmark
advisor -collect tripcounts -flop -stacks --project-dir=./advi --search-dir src:r=. -- ./build/benchmark
advisor --report=roofline --with-stack --project-dir=./advi --report-output=./advi/out/roofline.html
```

### Intel Advisor with MPI

```bash
mpirun -gtool "advisor --collect=survey --project-dir=./advi_results" -n 6 ./build/benchmark
mpirun -gtool "advisor -collect tripcounts -flop -stacks --search-dir src:r=. --project-dir=./advi:1-6" -n 6 ./build/benchmark
advisor --report=roofline --with-stack --project-dir=./advi --report-output=./advi/out/roofline.html
```
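As a quick smoke test of an installation, the helpers in `utils.py` (shown further below) can be driven directly from Python instead of going through `run.py`. A minimal sketch, assuming the default `compilers.yaml` from the repository root (so `gcc-10` must be on the `PATH`), a working `mpirun`, and a small problem size so it finishes quickly:

```python
import utils

# Parse the compiler configuration and run one small Mass benchmark with it.
compilers = utils.parse_compiler_configuration("compilers.yaml")
compiler = compilers["gcc-10"]
utils.set_compiler(compiler)  # exports CC/CXX for the CMake build

# Flags are quoted the same way run.py quotes them before passing to CMake.
flag = '"' + compiler["flags"][0] + '"'
results = utils.run("Mass", degree=1, nrepeats=1, flag=flag, action=True,
                    scalar_type="double", global_size=100000, batch_size=None,
                    mpi_size=1, cell_type="tetrahedron")
print(results)  # one "num_cells, time" string per repeat
```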
--------------------------------------------------------------------------------
/info/count_flops.py:
--------------------------------------------------------------------------------
from typing import Optional

import numpy
import ufl

import ffcx.codegeneration.C.cnodes
import ffcx.options
from ffcx.analysis import analyze_ufl_objects
from ffcx.codegeneration.backend import FFCXBackend
from ffcx.codegeneration.integrals import IntegralGenerator
from ffcx.ir.representation import compute_ir


def count_flops(form: ufl.Form, options: Optional[dict] = None):
    """Return the flop count and the temporary-array size (in entries) of the first kernel of the form."""
    options = ffcx.options.get_options(options)
    assert isinstance(form, ufl.Form)
    analysis = analyze_ufl_objects([form], options)
    ir = compute_ir(analysis, {}, "flops", options, False)

    flops = []
    _bytes = []

    for integral_ir in ir.integrals:
        # Create FFCx C backend
        backend = FFCXBackend(integral_ir, options)
        # Configure kernel generator
        ig = IntegralGenerator(integral_ir, backend)
        # Generate code ast for the tabulate_tensor body
        ast = ig.generate()
        _sum = 0

        # Accumulate the sizes of all temporary arrays declared in the kernel
        # body, including those declared one scope down.
        for statement in ast.statements:
            if isinstance(statement, ffcx.codegeneration.C.cnodes.ArrayDecl):
                _sum += numpy.prod(statement.sizes)
            if isinstance(statement, ffcx.codegeneration.C.cnodes.Scope):
                for sub_statement in statement.body.statements:
                    if isinstance(sub_statement, ffcx.codegeneration.C.cnodes.ArrayDecl):
                        _sum += numpy.prod(sub_statement.sizes)
        flops.append(ast.flops())
        _bytes.append(_sum)

    return flops[0], _bytes[0]
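`count_flops` operates on a plain UFL form, so it can be used outside of `info/kernel_info.py` as well. A minimal sketch, assuming a substituted `problem2.py` module has already been written next to this script in the way `kernel_info.py` does it:

```python
from importlib import import_module

from count_flops import count_flops

# problem2.py is a degree-2 copy of one of the templates in forms/,
# produced by the string.Template substitution shown earlier.
module = import_module("problem2")

# First value: flop count of the generated kernel body.
# Second value: total size (in array entries) of its temporary arrays;
# kernel_info.py later multiplies this by 8 to get bytes.
flops, tmp_entries = count_flops(module.L)
print(f"kernel flops: {flops}, temporary array entries: {tmp_entries}")
```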
--------------------------------------------------------------------------------
/.github/workflows/python-app.yml:
--------------------------------------------------------------------------------
# This workflow will install Python dependencies, run tests and lint with a single version of Python
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: Python application

on:
  push:
    branches: [ "main" ]
  pull_request:
    branches: [ "main" ]

permissions:
  contents: read

jobs:
  build:

    runs-on: ubuntu-latest
    env:
      CC: gcc-10
      CXX: g++-10

    steps:
    - uses: actions/checkout@v3
    - name: Set up Python 3.9
      uses: actions/setup-python@v3
      with:
        python-version: "3.9"
    - name: Install dependencies
      run: |
        sudo apt-get install -y libopenblas-dev liblapack-dev graphviz libgraphviz-dev ninja-build
        sudo apt-get -y install mpich
        python -m pip install --upgrade pip
        pip install flake8 pytest pyyaml scikit-build
        python3 -m pip install git+https://github.com/FEniCS/ufl.git
        python3 -m pip install git+https://github.com/FEniCS/basix.git
        python3 -m pip install git+https://github.com/FEniCS/ffcx.git
        python3 -m pip install pyyaml
    - name: Run local assembly
      run: |
        python run.py --form_compiler=ffcx --scalar_type=double --action --global_size=10000000 --degree 5 --problem=Mass --nrepeats 5 --cell_type=hexahedron
        python run.py --form_compiler=ffcx --scalar_type=double --action --global_size=10000000 --degree 5 --problem=Mass --nrepeats 5 --cell_type=tetrahedron
        cat output/Mass.txt

--------------------------------------------------------------------------------
/compilers-a64fx.yaml:
--------------------------------------------------------------------------------

# gcc-11-202103:
#   version:
#     - 11.0.1
#   cpp:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11-20210321/bin/g++
#   cc:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11-20210321/bin/gcc
#   flags:
#     - -Ofast -march=native -msve-vector-bits=2048
#     - -Ofast -march=native -msve-vector-bits=512
#     - -Ofast -march=native -msve-vector-bits=128
#     - -Ofast -march=native -fno-tree-vectorize

# gcc-11:
#   version:
#     - 11.1.0
#   cpp:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11.1.0/bin/g++
#   cc:
#     - /snx11273/projects/bristol/modules-a64fx/gcc/11.1.0/bin/gcc
#   flags:
#     - -Ofast -march=native -msve-vector-bits=2048
#     - -Ofast -march=native -msve-vector-bits=512
#     - -Ofast -march=native -msve-vector-bits=128
#     - -Ofast -march=native -fno-tree-vectorize
#     - -O3 -march=native

clang-11:
  version:
    - 11.1.0
  cpp:
    - /lustre/projects/bristol/modules-a64fx/llvm/11.0/bin/clang++
  cc:
    - /lustre/projects/bristol/modules-a64fx/llvm/11.0/bin/clang
  flags:
    - -Ofast -mprefer-vector-width=2048
    - -Ofast -fno-slp-vectorize

# cray:
#   version:
#     - 2.7.0
#   cpp:
#     - /opt/cray/pe/craype/2.7.0/bin/CC
#   cc:
#     - /opt/cray/pe/craype/2.7.0/bin/cc
#   flags:
#     - -O3 -hvector3 -hfp4
#     - -O3
#     - -O2

# gcc-8:
#   version:
#     - 8.1.0
#   cpp:
#     - /opt/gcc/8.1.0/bin/g++
#   cc:
#     - /opt/gcc/8.1.0/bin/gcc
#   flags:
#     - -Ofast -march=native -msve-vector-bits=2048
#     - -Ofast -march=native -fno-tree-vectorize
#     - -O3 -march=native
--------------------------------------------------------------------------------
/ffcx/main.cpp:
--------------------------------------------------------------------------------
#include "problem.hpp"
#include "geometry.hpp"
#include <mpi.h>
#include <algorithm>
#include <array>
#include <functional>
#include <iostream>
#include <iterator>
#include <vector>

int main(int argc, char *argv[])
{
  MPI_Init(&argc, &argv);
  {
    MPI_Comm comm = MPI_COMM_WORLD;
    int mpi_rank;
    MPI_Comm_rank(comm, &mpi_rank);

    // Const data from kernel
    constexpr int num_dofs = dim;
    constexpr int local_size = kernel_rank == 1 ? dim : dim * dim;
    constexpr int stride = num_dofs * num_coefficients;
    constexpr int num_cells = global_size / num_dofs;
    constexpr int num_batches = num_cells / batch_size;
    constexpr int geom_size = num_nodes * 3;

    // Allocate and initialize data
    std::vector<scalar_type> A(num_batches * local_size);

    // Constants for cross element vectorization
    scalar_type one = {1.};
    scalar_type zero = {0.};

    // Create geometry and coefficients
    std::vector<geom_type> geometry = create_geometry<geom_type>(num_batches, batch_size, geom_size);
    std::vector<scalar_type> coefficients(num_batches * stride);
    auto set_ = [one](auto &e)
    { e = one; };
    std::for_each(coefficients.begin(), coefficients.end(), set_);

    std::array<scalar_type, local_size> Ae;

    double start = MPI_Wtime();
    for (int batch = 0; batch < num_batches; batch++)
    {
      std::fill(Ae.begin(), Ae.end(), zero);
      scalar_type *coeffs = coefficients.data() + batch * stride;
      geom_type *geo = geometry.data() + batch * geom_size;
      kernel(Ae.data(), coeffs, nullptr, geo, 0, 0);
      std::vector<scalar_type>::iterator result = std::next(A.begin(), batch * local_size);
      std::transform(Ae.begin(), Ae.end(), result, result, std::plus<scalar_type>());
    }
    double end = MPI_Wtime();
    double local_time = end - start;

    double max_time = 0;
    MPI_Allreduce(&local_time, &max_time, 1, MPI_DOUBLE, MPI_MAX, comm);

    if (mpi_rank == 0)
      std::cout << num_cells << ", " << max_time;
  }
  MPI_Finalize();

  return 0;
}
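When a batch size is requested, the generated `problem.hpp` (see `ffcx/compile.py` below) defines `scalar_type` and `geom_type` as GCC/Clang vector-extension types, so each call to `kernel` above assembles `batch_size` cells in lock-step. A standalone sketch of how such a type behaves, assuming the GCC branch of the generated typedef; the values are purely illustrative:

```cpp
#include <cstdio>

// 4-wide batch of doubles with element-wise arithmetic, mirroring the
// typedef that compile.py emits for GCC when batch_size = 4.
typedef double double4 __attribute__((vector_size(4 * sizeof(double))));

int main()
{
  double4 a = {1.0, 2.0, 3.0, 4.0};   // one value per cell in the batch
  double4 b = {10.0, 10.0, 10.0, 10.0};
  double4 c = a * b + a;              // the same expression is applied to every lane
  for (int i = 0; i < 4; ++i)
    std::printf("cell %d: %f\n", i, c[i]);
  return 0;
}
```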
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
import os
import platform
import sys
from string import Template
from subprocess import Popen, PIPE

import yaml

_build_cmd = ("cd {form_compiler} && rm -rf build && mkdir build && cd build "
              "&& cmake -DCMAKE_C_FLAGS={flag} -DCMAKE_CXX_FLAGS={flag} .. && make")


def set_compiler(compiler):
    os.environ["CXX"] = compiler["cpp"][0]
    os.environ["CC"] = compiler["cc"][0]

    try:
        with Popen([compiler["cpp"][0], "-dumpversion"], stdout=PIPE) as p:
            compiler_version = p.stdout.read().decode("ascii").strip()
    except Exception:
        compiler_version = compiler["version"][0]
    return compiler_version


def parse_compiler_configuration(file):
    # Read compiler configuration file
    with open(file, "r") as stream:
        try:
            compilers = yaml.safe_load(stream)
        except yaml.YAMLError as exc:
            print(exc)
            raise
    return compilers


def machine_name():
    # Set architecture from platform
    try:
        with open("/sys/devices/cpu/caps/pmu_name", "r") as pmu:
            machine = pmu.readlines()[0].strip()
    except Exception:
        machine = platform.processor()
    return machine


def create_output(problem):
    header = "machine,problem,compiler,version,flags,degree,fcomp,scalar,batch_size,rank,cell_type,ncells,time"
    path = "output/"
    out_file = path + str(problem) + ".txt"

    if not os.path.exists(out_file):
        if not os.path.isdir(path):
            os.mkdir(path)
        with open(out_file, "a") as f:
            f.write(header)
    return out_file


def run(problem: str, degree: int, nrepeats: int, flag: str, action: bool,
        scalar_type: str, global_size: int, batch_size: int, mpi_size: int,
        cell_type: str):

    try:
        import ffcx
        import ffcx.codegeneration
    except ImportError:
        print("ffcx is not available")

    # Substitute the cell and degree placeholders in the templated form.
    with open("forms/" + problem + ".ufl", 'r') as f:
        src = Template(f.read())
        d = {'degree': str(degree), 'vdegree': str(degree + 1), "cell": cell_type}
        result = src.substitute(d)

    with open("ffcx/problem.py", "w") as f2:
        f2.writelines(result)

    sys.path.insert(1, 'ffcx/')
    from compile import generate_code
    generate_code(action, scalar_type, global_size, batch_size)

    run_cmd = f"mpirun -n {mpi_size} ./ffcx/build/benchmark"
    build = _build_cmd.format(form_compiler="ffcx", flag=flag)

    if os.system(build) != 0:
        raise RuntimeError("build failed")
    result = [Popen(run_cmd.split(), stdout=PIPE).stdout.read().decode("ascii").strip()
              for _ in range(nrepeats)]
    print(result)

    return result
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Local Finite Element Operator Benchmarks

Depends on:

- FFCx: The FEniCSx Form Compiler: <https://github.com/FEniCS/ffcx.git>
- For sum-factorization: <https://github.com/FEniCS/ffcx/tree/igor/tensor>

## Usage

```bash
python3 run.py --help
usage: run.py [-h] [--form_compiler {ffcx,ffc,tsfc}] [--scalar_type {double,float,_Float16,double _Complex,float _Complex}]
              [--problem {Laplacian,Mass,Elasticity,N1curl,Stokes}] [--conf CONF] [--degree DEGREE [DEGREE ...]] [--nrepeats NREPEATS]
              [--batch_size {None,1,2,4,8,16,32,64}] [--global_size GLOBAL_SIZE] [--action] [--mpi_size MPI_SIZE] [--cell_type {tetrahedron,hexahedron}]

Run local assembly benchmark.

optional arguments:
  -h, --help            show this help message and exit
  --form_compiler {ffcx,ffc,tsfc}
                        Form Compiler to use (default: ffcx)
  --scalar_type {double,float,_Float16,double _Complex,float _Complex}
                        Scalar type to use (default: double)
  --problem {Laplacian,Mass,Elasticity,N1curl,Stokes}
                        Problem to run (default: Laplacian)
  --conf CONF           Configuration file describing the compilers and flags. (default: compilers.yaml)
  --degree DEGREE [DEGREE ...]
                        Polynomial degree to evaluate the operators. (default: range(1, 4))
  --nrepeats NREPEATS   Number of times to run each experiment. (default: 3)
  --batch_size {None,1,2,4,8,16,32,64}
                        Cross-element vectorization batch size. (default: None)
  --global_size GLOBAL_SIZE
                        Global number of dofs (assuming shared dofs are duplicated). (default: 1000000)
  --action              Specify whether to run the problems with a matrix-free approach. (default: False)
  --mpi_size MPI_SIZE   The number of MPI processes to use. (default: 1)
  --cell_type {tetrahedron,hexahedron}
                        Cell type to use (default: tetrahedron)
```

## Compiler Configuration File

Example of compiler configuration file (compilers.yaml):

```yaml
gcc-11:
  version:
    - 11.1.0
  cpp:
    - /usr/bin/g++-11
  cc:
    - /usr/bin/gcc-11
  flags:
    - -Ofast -march=native -mprefer-vector-width=256

clang:
  version:
    - 12.0.1
  cpp:
    - /usr/bin/clang++
  cc:
    - /usr/bin/clang
  flags:
    - -Ofast -march=native -mprefer-vector-width=256
```

## Examples

### Matrix-free weighted Laplacian, degrees 1-8

```bash
python3 run.py --problem Laplacian --degree 1 2 3 4 5 6 7 8 --form_compiler=ffcx --action --global_size 10000000
```

### Single precision Mass operator on 76 cores

```bash
python3 run.py --problem Mass --scalar_type float --degree 1 2 3 4 5 6 7 8 --form_compiler=ffcx --action --mpi_size 76 --global_size 10000000
```

## Output data description

Results are written to `output/{Problem}.txt` as comma-separated rows:

```bash
table = [machine,problem,compiler,version,flags,degree,form_compiler,scalar_type,batch_size,form_rank,cell_type,num_cells,time]
```
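Since each result file is plain comma-separated text with the header above, post-processing needs nothing more than a CSV reader. A minimal sketch, assuming pandas is installed (it is not a dependency of the benchmark itself) and that `output/Mass.txt` exists from a previous run; column names follow the header written by `utils.create_output`:

```python
import pandas as pd

# Each row is one repeat of one (compiler, flags, degree) combination.
df = pd.read_csv("output/Mass.txt", skipinitialspace=True)

# Throughput in cells assembled per second, averaged over the repeats.
df["cells_per_s"] = df["ncells"] / df["time"]
print(df.groupby(["compiler", "flags", "degree"])["cells_per_s"].mean())
```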
--------------------------------------------------------------------------------
/info/kernel_info.py:
--------------------------------------------------------------------------------
from importlib import import_module
from string import Template

import numpy
from matplotlib import pyplot as plt

from count_flops import count_flops
from ffcx.element_interface import create_element

problem_ = "Laplacian"
degrees = numpy.arange(1, 16)
cell_type = "hexahedron"

# Substitute the cell/degree placeholders and write one problem module per degree.
for degree in degrees:
    with open("../forms/" + problem_ + ".ufl", 'r') as f:
        src = Template(f.read())
        d = {'degree': str(degree), 'vdegree': str(degree + 1), "cell": cell_type}
        result = src.substitute(d)

    with open(f"problem{degree}.py", "w") as f2:
        f2.writelines(result)


flops_per_dof = numpy.zeros_like(degrees)
bytes_per_dof = numpy.zeros_like(degrees, dtype=float)
cache_per_dof = numpy.zeros_like(degrees, dtype=float)
dofs_list = numpy.zeros_like(degrees)
for i, degree in enumerate(degrees):
    module = import_module(f"problem{degree}")
    form = module.L
    element = create_element(module.element)
    dofs = element.dim
    dofs_list[i] = dofs

    cell = element.cell()
    geometry_size = cell.num_vertices() * 3

    coeff_size = 0
    coefficients = form.coefficients()
    for coeff in coefficients:
        el = create_element(coeff.ufl_element())
        coeff_size += el.dim

    flops_per_dof[i], cache_per_dof[i] = count_flops(form)
    bytes_per_dof[i] = ((coeff_size + 2 * dofs + geometry_size) * 8) / dofs
    cache_per_dof[i] = (cache_per_dof[i] * 8) / dofs
    # flops_per_dof[i] = flops_per_dof[i]/dofs

print(repr(flops_per_dof))
print(cache_per_dof)

FLOPS = numpy.full_like(degrees, 1e100, dtype=float)
BW = numpy.full_like(degrees, 300e9, dtype=float)
BW1 = numpy.full_like(degrees, 1.5e4 * 1e9, dtype=float)
BW2 = numpy.full_like(degrees, 0.8e4 * 1e9, dtype=float)
BW3 = numpy.full_like(degrees, 1.5e3 * 1e9, dtype=float)


cache_rate = numpy.zeros_like(BW)
level = numpy.zeros_like(BW)

# Pick the cache-level bandwidth according to the estimated working-set size.
for i, num_dofs in enumerate(dofs_list):
    size = num_dofs * (cache_per_dof[i] + bytes_per_dof[i])
    ai_cache = numpy.divide(flops_per_dof[i], size / num_dofs)
    if size < 32e3:
        cache_rate[i] = ai_cache * BW1[i]
        level[i] = 1
    elif size < 1280e3:
        cache_rate[i] = ai_cache * BW2[i]
        level[i] = 2
    elif size < 1800e3:
        cache_rate[i] = ai_cache * BW3[i]
        level[i] = 3


max_throughput_cache = cache_rate / flops_per_dof
ai_std = flops_per_dof / bytes_per_dof
max_flops = numpy.minimum(FLOPS, ai_std * BW)
max_throughput = max_flops / flops_per_dof

markers = ["o", "v", "s", "X"]
plt.plot(degrees, max_throughput, label="standard model",
         marker=markers[1], linestyle='dashed')

max_throughput_cache = numpy.minimum(max_throughput, max_throughput_cache)

plt.plot(degrees, max_throughput_cache, label="cache-aware model",
         marker=markers[1], linestyle='dashed')


plt.ylabel("max throughput (dofs/s)")
plt.legend()
plt.xlabel(r"polynomial degree $P$")
plt.grid(True, which="both")
plt.yscale("log")
plt.show()


print(level)
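The bound plotted by `kernel_info.py` is the usual roofline estimate: the attainable flop rate is the minimum of the machine's peak flop rate and the arithmetic intensity times the memory bandwidth, and dividing by the work per dof converts it into dofs/s. A worked example with illustrative numbers (none of these values are measured; they only show the arithmetic):

```python
# Illustrative roofline bound for one polynomial degree.
peak_flops = 1.5e12      # peak flop rate of the machine (flop/s), illustrative
bandwidth = 300e9        # main-memory bandwidth (byte/s), as used for BW above
flops_per_dof = 2000.0   # kernel flops per dof, e.g. from count_flops
bytes_per_dof = 160.0    # coefficient + output + geometry traffic per dof

arithmetic_intensity = flops_per_dof / bytes_per_dof            # flop/byte
attainable_flops = min(peak_flops, arithmetic_intensity * bandwidth)
max_dofs_per_s = attainable_flops / flops_per_dof
print(f"AI = {arithmetic_intensity:.1f} flop/byte, bound = {max_dofs_per_s:.2e} dofs/s")
```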
--------------------------------------------------------------------------------
/run.py:
--------------------------------------------------------------------------------
import argparse

import utils

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Run local assembly benchmark.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--form_compiler', dest='form_compiler', type=str,
                        default="ffcx", choices=['ffcx', 'ffc', 'tsfc'],
                        help="Form Compiler to use")

    parser.add_argument('--scalar_type', dest='scalar_type', type=str,
                        default="double", choices=['double', 'float', '_Float16', 'double _Complex', 'float _Complex'],
                        help="Scalar type to use")

    parser.add_argument('--problem', dest='problem', type=str,
                        default="Laplacian", choices=['Laplacian', 'Mass', 'Elasticity', 'N1curl', 'Stokes'],
                        help="Problem to run")

    parser.add_argument('--conf', dest='conf', type=str, default="compilers.yaml",
                        help="Configuration file describing the compilers and flags.")

    parser.add_argument('--degree', dest='degree', default=range(1, 4), nargs='+',
                        help='Polynomial degree to evaluate the operators.')

    parser.add_argument('--nrepeats', dest='nrepeats', type=int, default=3,
                        help='Number of times to run each experiment.')

    parser.add_argument('--batch_size', dest='batch_size', type=int, default=None,
                        choices=[None, 1, 2, 4, 8, 16, 32, 64],
                        help='Cross-element vectorization batch size.')

    parser.add_argument('--global_size', dest='global_size', type=int, default=1000000,
                        help='Global number of dofs (assuming shared dofs are duplicated).')

    parser.add_argument('--action', dest='action', action='store_true',
                        help='Specify whether to run the problems with a matrix-free approach.')

    parser.add_argument('--mpi_size', dest='mpi_size', type=int, default=1,
                        help='The number of MPI processes to use.')

    parser.add_argument('--cell_type', dest='cell_type', type=str,
                        default="tetrahedron", choices=['tetrahedron', 'hexahedron'],
                        help="Cell type to use")

    args = parser.parse_args()
    form_compiler = args.form_compiler
    problem = args.problem
    conf_file = args.conf
    degrees = [int(d) for d in args.degree]
    nrepeats = args.nrepeats
    action = args.action
    batch_size = args.batch_size
    global_size = args.global_size
    scalar_type = args.scalar_type
    mpi_size = args.mpi_size
    cell_type = args.cell_type

    machine = utils.machine_name()
    out_file = utils.create_output(problem)
    compilers = utils.parse_compiler_configuration(conf_file)

    # Set rank to 1 for matrix-free (action), 2 otherwise
    rank = 1 if action else 2

    for c_name in compilers:
        compiler = compilers[c_name]
        compiler_version = utils.set_compiler(compiler)
        flags = compiler["flags"]
        for flag in flags:
            # Quote the flag string so it survives being passed through the shell to CMake.
            flag = '"' + str(flag) + '"'
            for degree in degrees:
                # Column order must match the header written by utils.create_output.
                text = f"\n{machine}, {problem}, {c_name}, {compiler_version}, {flag}, {degree}, {form_compiler}, {scalar_type}, {batch_size}, {rank}, {cell_type}, "
                results = utils.run(problem, degree, nrepeats, flag, action,
                                    scalar_type, global_size, batch_size,
                                    mpi_size, cell_type)
                for result in results:
                    row = text + result
                    with open(out_file, "a") as file:
                        file.write(row)
--------------------------------------------------------------------------------
/ffcx/compile.py:
--------------------------------------------------------------------------------
import typing
from importlib import reload

import basix
import ufl

import problem
from ffcx.analysis import analyze_ufl_objects
from ffcx.codegeneration.backend import FFCXBackend
from ffcx.codegeneration.C.format_lines import format_indented_lines
from ffcx.codegeneration.integrals import IntegralGenerator
from ffcx.element_interface import create_element
from ffcx.ir.representation import compute_ir
from ffcx.options import get_options


_arguments = """({scalar_type}* restrict A,
            const {scalar_type}* restrict w,
            const {scalar_type}* restrict c,
            const {geom_type}* restrict coordinate_dofs,
            const int* restrict entity_local_index,
            const uint8_t* restrict quadrature_permutation)\n"""


_headers = """
#include <cmath>
#include <cstdint>
#define restrict __restrict__

constexpr int dim = {dim};
constexpr int global_size = {global_size};
constexpr int kernel_rank = {rank};
constexpr int num_nodes = {num_nodes};
constexpr int batch_size = {batch_size};
constexpr int num_coefficients = {num_coefficients};


using scalar_type = {scalar_type};
using geom_type = {geom_type};
using namespace std;
"""


_headers_batched = """
#include <cmath>
#include <cstdint>

#define restrict __restrict__
#define USE_VECTOR_EXTENSIONS


constexpr int dim = {dim};
constexpr int global_size = {global_size};
constexpr int kernel_rank = {rank};
constexpr int num_nodes = {num_nodes};
constexpr int num_coefficients = {num_coefficients};

#if defined(__clang__)
typedef {scalar_type} {scalar_type}{batch_size} __attribute__((ext_vector_type({batch_size})));
#elif defined(__GNUC__) || defined(__GNUG__)
typedef {scalar_type} {scalar_type}{batch_size} __attribute__((vector_size({batch_size} * sizeof({scalar_type}))));
#else
#error "Compiler not supported"
#endif

#if defined(__clang__)
typedef {geom_type} {geom_type}{batch_size} __attribute__((ext_vector_type({batch_size})));
#elif defined(__GNUC__) || defined(__GNUG__)
typedef {geom_type} {geom_type}{batch_size} __attribute__((vector_size({batch_size} * sizeof({geom_type}))));
#else
#error "Compiler not supported"
#endif

using scalar_type = {scalar_type}{batch_size};
using geom_type = {geom_type}{batch_size};

constexpr int batch_size = {batch_size};
using namespace std;
"""


def compute_integral_body(ir, backend):
    # Configure kernel generator
    ig = IntegralGenerator(ir, backend)
    # Generate code ast for the tabulate_tensor body
    parts = ig.generate()
    # Format code as string
    body = format_indented_lines(parts.cs_format(ir.precision), 1)
    return body


def compile_form(form: ufl.Form, name: str,
                 parameters: typing.Optional[typing.Dict] = None,
                 visualise: bool = False):

    if parameters is None:
        parameters = get_options()

    # Stage 1: analysis
    analysis = analyze_ufl_objects([form], parameters)

    # Stage 2: intermediate representation
    ir = compute_ir(analysis, {}, " ", parameters, visualise)

    if len(ir.integrals) > 1:
        raise RuntimeError(
            "This function is meant to compile one integral type at a time.")

    # Stage 3: code generation
    integral_ir = ir.integrals[0]
    backend = FFCXBackend(integral_ir, parameters)

    scalar_type = parameters["scalar_type"]
    geom_type = scalar_type.replace(' _Complex', '')
    batch_size = parameters["batch_size"]

    if batch_size and batch_size > 1:
        geom_type += str(batch_size)
        scalar_type += str(batch_size)

    settings = {"scalar_type": scalar_type, "geom_type": geom_type}
    arguments = _arguments.format(**settings)
    signature = "inline void " + name + arguments
    body = compute_integral_body(integral_ir, backend)
    code = signature + " {\n" + body + "\n}\n"

    return code


def generate_code(action, scalar_type, global_size, batch_size):
    reload(problem)

    batch_size = batch_size if batch_size else 1
    parameters = get_options()
    parameters["scalar_type"] = scalar_type
    parameters["batch_size"] = batch_size

    if action:
        code = compile_form(problem.L, "kernel", parameters)
        num_coefficients = analyze_ufl_objects(
            [problem.L], parameters).form_data[0].num_coefficients
        rank = 1
    else:
        code = compile_form(problem.a, "kernel", parameters)
        num_coefficients = analyze_ufl_objects(
            [problem.a], parameters).form_data[0].num_coefficients
        rank = 2

    element = create_element(problem.element)
    num_nodes = element.cell().num_vertices()
    geom_type = scalar_type.replace(' _Complex', '')

    if batch_size > 1:
        headers = _headers_batched.format(dim=element.dim, global_size=global_size,
                                          scalar_type=scalar_type, rank=rank, geom_type=geom_type,
                                          batch_size=batch_size, num_nodes=num_nodes,
                                          num_coefficients=num_coefficients)
    else:
        headers = _headers.format(dim=element.dim, global_size=global_size,
                                  scalar_type=scalar_type, rank=rank, geom_type=geom_type,
                                  batch_size=batch_size, num_nodes=num_nodes,
                                  num_coefficients=num_coefficients)

    with open("ffcx/problem.hpp", "w") as file:
        file.write(headers)
        file.write(code)
--------------------------------------------------------------------------------