├── resources ├── .gitignore ├── LinalgOnTensorDFG.png ├── AffineAnalysisIntro.png ├── LoweringDialectDiagram.png ├── AffineLoweringDialectDiagram.png ├── LinalgOnTensorDFG.drawio ├── LoweringDialectDiagram.drawio ├── AffineLoweringDialectDiagram.drawio └── AffineAnalysisIntro.drawio ├── .gitmodules ├── .gitignore ├── demo3-lowering-mlir ├── clean.sh ├── demo3-0-linalg-on-tensor.mlir ├── demo3-1-linalg-on-memref.mlir ├── demo3-2-loops.mlir ├── lower.sh ├── demo3-3-branches.mlir ├── demo3-5.ll └── demo3-4-llvm.mlir ├── demo4-optimizing-mlir-with-affine-analysis ├── clean.sh ├── demo4-0-linalg-on-tensor.mlir ├── demo4-3-affine-fused.mlir ├── demo4-1-linalg-on-memref.mlir ├── demo4-4-affine-fused-tiled.mlir ├── demo4-2-affine.mlir ├── demo4-5-loops.mlir └── optimize.sh ├── LICENSE ├── demo1-mlir-motivation ├── tensor.h ├── demo1.cpp └── linalg_lib.h ├── demo2-entering-mlir └── demo2.mlir └── README.md /resources/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore temporary files created by draw.io 2 | *.bkp 3 | 4 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "llvm-project"] 2 | path = llvm-project 3 | url = https://github.com/llvm/llvm-project.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore the build directory for LLVM 2 | /build* 3 | 4 | # Ignore cache file generated by editor. 5 | /.cache* 6 | 7 | -------------------------------------------------------------------------------- /resources/LinalgOnTensorDFG.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexandreSinger/mlir-beginner-friendly-tutorial/HEAD/resources/LinalgOnTensorDFG.png -------------------------------------------------------------------------------- /resources/AffineAnalysisIntro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexandreSinger/mlir-beginner-friendly-tutorial/HEAD/resources/AffineAnalysisIntro.png -------------------------------------------------------------------------------- /resources/LoweringDialectDiagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexandreSinger/mlir-beginner-friendly-tutorial/HEAD/resources/LoweringDialectDiagram.png -------------------------------------------------------------------------------- /resources/AffineLoweringDialectDiagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AlexandreSinger/mlir-beginner-friendly-tutorial/HEAD/resources/AffineLoweringDialectDiagram.png -------------------------------------------------------------------------------- /demo3-lowering-mlir/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | 7 | # Remove the MLIR files generated by the lowering script. 8 | rm -f $SCRIPT_DIR/*.mlir 9 | 10 | # Remove the LLVMIR files generated by the lowering script. 
11 | rm -f $SCRIPT_DIR/*.ll 12 | 13 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | 7 | # Remove the MLIR files generated by the lowering script. 8 | rm -f $SCRIPT_DIR/*.mlir 9 | 10 | # Remove the LLVMIR files generated by the lowering script. 11 | rm -f $SCRIPT_DIR/*.ll 12 | 13 | -------------------------------------------------------------------------------- /demo3-lowering-mlir/demo3-0-linalg-on-tensor.mlir: -------------------------------------------------------------------------------- 1 | #map = affine_map<(d0, d1) -> (d0, d1)> 2 | module { 3 | func.func @main() -> tensor<256x1024xf32> { 4 | %0 = tensor.empty() : tensor<256x512xf32> 5 | %1 = tensor.empty() : tensor<512x1024xf32> 6 | %cst = arith.constant 0.000000e+00 : f32 7 | %splat = tensor.splat %cst : tensor<256x1024xf32> 8 | %2 = linalg.matmul ins(%0, %1 : tensor<256x512xf32>, tensor<512x1024xf32>) outs(%splat : tensor<256x1024xf32>) -> tensor<256x1024xf32> 9 | %3 = tensor.empty() : tensor<256x1024xf32> 10 | %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<256x1024xf32>) outs(%3 : tensor<256x1024xf32>) { 11 | ^bb0(%in: f32, %out: f32): 12 | %cst_0 = arith.constant 0.000000e+00 : f32 13 | %5 = arith.cmpf ugt, %in, %cst_0 : f32 14 | %6 = arith.select %5, %in, %cst_0 : f32 15 | linalg.yield %6 : f32 16 | } -> tensor<256x1024xf32> 17 | return %4 : tensor<256x1024xf32> 18 | } 19 | } 20 | 21 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/demo4-0-linalg-on-tensor.mlir: -------------------------------------------------------------------------------- 1 | #map = affine_map<(d0, d1) -> (d0, d1)> 2 | module { 3 | func.func @main() -> tensor<256x1024xf32> { 4 | %0 = tensor.empty() : tensor<256x512xf32> 5 | %1 = tensor.empty() : tensor<512x1024xf32> 6 | %cst = arith.constant 0.000000e+00 : f32 7 | %splat = tensor.splat %cst : tensor<256x1024xf32> 8 | %2 = linalg.matmul ins(%0, %1 : tensor<256x512xf32>, tensor<512x1024xf32>) outs(%splat : tensor<256x1024xf32>) -> tensor<256x1024xf32> 9 | %3 = tensor.empty() : tensor<256x1024xf32> 10 | %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<256x1024xf32>) outs(%3 : tensor<256x1024xf32>) { 11 | ^bb0(%in: f32, %out: f32): 12 | %cst_0 = arith.constant 0.000000e+00 : f32 13 | %5 = arith.cmpf ugt, %in, %cst_0 : f32 14 | %6 = arith.select %5, %in, %cst_0 : f32 15 | linalg.yield %6 : f32 16 | } -> tensor<256x1024xf32> 17 | return %4 : tensor<256x1024xf32> 18 | } 19 | } 20 | 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 AlexandreSinger 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, 
subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /demo3-lowering-mlir/demo3-1-linalg-on-memref.mlir: -------------------------------------------------------------------------------- 1 | #map = affine_map<(d0, d1) -> (d0, d1)> 2 | module { 3 | func.func @main() -> memref<256x1024xf32> { 4 | %cst = arith.constant 0.000000e+00 : f32 5 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 6 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 7 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 8 | linalg.map outs(%alloc_1 : memref<256x1024xf32>) 9 | () { 10 | linalg.yield %cst : f32 11 | } 12 | linalg.matmul ins(%alloc, %alloc_0 : memref<256x512xf32>, memref<512x1024xf32>) outs(%alloc_1 : memref<256x1024xf32>) 13 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 14 | linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc_1 : memref<256x1024xf32>) outs(%alloc_2 : memref<256x1024xf32>) { 15 | ^bb0(%in: f32, %out: f32): 16 | %0 = arith.cmpf ugt, %in, %cst : f32 17 | %1 = arith.select %0, %in, %cst : f32 18 | linalg.yield %1 : f32 19 | } 20 | return %alloc_2 : memref<256x1024xf32> 21 | } 22 | } 23 | 24 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/demo4-3-affine-fused.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main() -> memref<256x1024xf32> { 3 | %alloc = memref.alloc() : memref<1x1xf32> 4 | %cst = arith.constant 0.000000e+00 : f32 5 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 6 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 7 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 8 | affine.for %arg0 = 0 to 256 { 9 | affine.for %arg1 = 0 to 1024 { 10 | affine.store %cst, %alloc[0, 0] : memref<1x1xf32> 11 | affine.for %arg2 = 0 to 512 { 12 | %3 = affine.load %alloc_0[%arg0, %arg2] : memref<256x512xf32> 13 | %4 = affine.load %alloc_1[%arg2, %arg1] : memref<512x1024xf32> 14 | %5 = affine.load %alloc[0, 0] : memref<1x1xf32> 15 | %6 = arith.mulf %3, %4 : f32 16 | %7 = arith.addf %5, %6 : f32 17 | affine.store %7, %alloc[0, 0] : memref<1x1xf32> 18 | } 19 | %0 = affine.load %alloc[0, 0] : memref<1x1xf32> 20 | %1 = arith.cmpf ugt, %0, %cst : f32 21 | %2 = arith.select %1, %0, %cst : f32 22 | affine.store %2, %alloc_2[%arg0, %arg1] : memref<256x1024xf32> 23 | } 24 | } 25 | return %alloc_2 : memref<256x1024xf32> 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/demo4-1-linalg-on-memref.mlir: 
-------------------------------------------------------------------------------- 1 | #map = affine_map<(d0, d1) -> (d0, d1)> 2 | module { 3 | func.func @main() -> memref<256x1024xf32> { 4 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 5 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 6 | %cst = arith.constant 0.000000e+00 : f32 7 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 8 | linalg.map outs(%alloc_1 : memref<256x1024xf32>) 9 | () { 10 | linalg.yield %cst : f32 11 | } 12 | linalg.matmul ins(%alloc, %alloc_0 : memref<256x512xf32>, memref<512x1024xf32>) outs(%alloc_1 : memref<256x1024xf32>) 13 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 14 | linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc_1 : memref<256x1024xf32>) outs(%alloc_2 : memref<256x1024xf32>) { 15 | ^bb0(%in: f32, %out: f32): 16 | %cst_3 = arith.constant 0.000000e+00 : f32 17 | %0 = arith.cmpf ugt, %in, %cst_3 : f32 18 | %1 = arith.select %0, %in, %cst_3 : f32 19 | linalg.yield %1 : f32 20 | } 21 | %cast = memref.cast %alloc_2 : memref<256x1024xf32> to memref<256x1024xf32, strided<[?, ?], offset: ?>> 22 | return %alloc_2 : memref<256x1024xf32> 23 | } 24 | } 25 | 26 | -------------------------------------------------------------------------------- /demo1-mlir-motivation/tensor.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @author Alex Singer 4 | * @date March 2025 5 | * @brief A fake tensor class made for demo1 of the beginner-friendly tutorial 6 | * to MLIR. 7 | * 8 | * This is a basic multi-dimensional tensor class used to abstract the data 9 | * allocations required to perform linalg operations. This is often controlled 10 | * by the linalg library and is very careful with how memory is managed to 11 | * achieve the best performance. 12 | */ 13 | 14 | #pragma once 15 | 16 | #include 17 | 18 | /** 19 | * @brief Basic 2D static-shaped tensor class. 20 | * 21 | * This can trivially be extended to dynamic-shaped, but for this demo I wanted 22 | * to keep things simple. 23 | * 24 | * NOTE: This type of class is necessary to define for a couple of reasons. 25 | * First, C++ (and many programming languages) do not provide a continuous 26 | * 2D array of values. It is trivial to make one by linearizing the 27 | * offset indices; however this is usually abstracted from the user. 28 | * Second, high-performance libraries may use special classes to represent 29 | * tensors to store more information on the data being stored in order to 30 | * make their kernels even higher performance. 
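 * For example, the get() method below linearizes the 2D index (x, y) into the flat backing vector as (x * H) + y, i.e. a row-major layout in which each of the W rows stores its H elements contiguously.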
31 | */ 32 | template 33 | struct Tensor { 34 | Tensor() : data(W * H) {} 35 | 36 | T& get(size_t x, size_t y) { 37 | return data[(x * H) + y]; 38 | } 39 | 40 | std::vector data; 41 | }; 42 | 43 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/demo4-4-affine-fused-tiled.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main() -> memref<256x1024xf32> { 3 | %cst = arith.constant 0.000000e+00 : f32 4 | %alloc = memref.alloc() : memref<1x1xf32> 5 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 6 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 7 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 8 | affine.for %arg0 = 0 to 128 { 9 | affine.for %arg1 = 0 to 512 { 10 | affine.for %arg2 = 0 to 2 { 11 | affine.for %arg3 = 0 to 2 { 12 | affine.store %cst, %alloc[0, 0] : memref<1x1xf32> 13 | affine.for %arg4 = 0 to 512 { 14 | %3 = affine.load %alloc_0[%arg2 + %arg0 * 2, %arg4] : memref<256x512xf32> 15 | %4 = affine.load %alloc_1[%arg4, %arg3 + %arg1 * 2] : memref<512x1024xf32> 16 | %5 = affine.load %alloc[0, 0] : memref<1x1xf32> 17 | %6 = arith.mulf %3, %4 : f32 18 | %7 = arith.addf %5, %6 : f32 19 | affine.store %7, %alloc[0, 0] : memref<1x1xf32> 20 | } 21 | %0 = affine.load %alloc[0, 0] : memref<1x1xf32> 22 | %1 = arith.cmpf ugt, %0, %cst : f32 23 | %2 = arith.select %1, %0, %cst : f32 24 | affine.store %2, %alloc_2[%arg2 + %arg0 * 2, %arg3 + %arg1 * 2] : memref<256x1024xf32> 25 | } 26 | } 27 | } 28 | } 29 | return %alloc_2 : memref<256x1024xf32> 30 | } 31 | } 32 | 33 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/demo4-2-affine.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main() -> memref<256x1024xf32> { 3 | %cst = arith.constant 0.000000e+00 : f32 4 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 5 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 6 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 7 | affine.for %arg0 = 0 to 256 { 8 | affine.for %arg1 = 0 to 1024 { 9 | affine.store %cst, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 10 | } 11 | } 12 | affine.for %arg0 = 0 to 256 { 13 | affine.for %arg1 = 0 to 1024 { 14 | affine.for %arg2 = 0 to 512 { 15 | %0 = affine.load %alloc[%arg0, %arg2] : memref<256x512xf32> 16 | %1 = affine.load %alloc_0[%arg2, %arg1] : memref<512x1024xf32> 17 | %2 = affine.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 18 | %3 = arith.mulf %0, %1 : f32 19 | %4 = arith.addf %2, %3 : f32 20 | affine.store %4, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 21 | } 22 | } 23 | } 24 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 25 | affine.for %arg0 = 0 to 256 { 26 | affine.for %arg1 = 0 to 1024 { 27 | %0 = affine.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 28 | %1 = arith.cmpf ugt, %0, %cst : f32 29 | %2 = arith.select %1, %0, %cst : f32 30 | affine.store %2, %alloc_2[%arg0, %arg1] : memref<256x1024xf32> 31 | } 32 | } 33 | return %alloc_2 : memref<256x1024xf32> 34 | } 35 | } 36 | 37 | -------------------------------------------------------------------------------- /demo3-lowering-mlir/demo3-2-loops.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | 
func.func @main() -> memref<256x1024xf32> { 3 | %c512 = arith.constant 512 : index 4 | %c1024 = arith.constant 1024 : index 5 | %c1 = arith.constant 1 : index 6 | %c256 = arith.constant 256 : index 7 | %c0 = arith.constant 0 : index 8 | %cst = arith.constant 0.000000e+00 : f32 9 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 10 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 11 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 12 | scf.for %arg0 = %c0 to %c256 step %c1 { 13 | scf.for %arg1 = %c0 to %c1024 step %c1 { 14 | memref.store %cst, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 15 | } 16 | } 17 | scf.for %arg0 = %c0 to %c256 step %c1 { 18 | scf.for %arg1 = %c0 to %c1024 step %c1 { 19 | scf.for %arg2 = %c0 to %c512 step %c1 { 20 | %0 = memref.load %alloc[%arg0, %arg2] : memref<256x512xf32> 21 | %1 = memref.load %alloc_0[%arg2, %arg1] : memref<512x1024xf32> 22 | %2 = memref.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 23 | %3 = arith.mulf %0, %1 : f32 24 | %4 = arith.addf %2, %3 : f32 25 | memref.store %4, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 26 | } 27 | } 28 | } 29 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 30 | scf.for %arg0 = %c0 to %c256 step %c1 { 31 | scf.for %arg1 = %c0 to %c1024 step %c1 { 32 | %0 = memref.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 33 | %1 = arith.cmpf ugt, %0, %cst : f32 34 | %2 = arith.select %1, %0, %cst : f32 35 | memref.store %2, %alloc_2[%arg0, %arg1] : memref<256x1024xf32> 36 | } 37 | } 38 | return %alloc_2 : memref<256x1024xf32> 39 | } 40 | } 41 | 42 | -------------------------------------------------------------------------------- /demo1-mlir-motivation/demo1.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @author Alex Singer 4 | * @date March 2025 5 | * @brief Demo 1 for the beginner friendly tutorial of MLIR. 6 | * 7 | * This demo will motivate the use of MLIR. This file writes, at a high-level, 8 | * what the programmer would like to do. That is, performing high-level matrix 9 | * operations using high-performance libraries provided externally. 10 | * 11 | * We assume that the tensor and linalg_lib libraries are out of our control 12 | * and were optimized for this device. 13 | */ 14 | 15 | #include "tensor.h" 16 | #include "linalg_lib.h" 17 | 18 | int main(void) { 19 | /** 20 | * This is some high-level code which acts basically as a FC activation 21 | * layer with batch size 256, input size 512, and output size 1024. This 22 | * layer uses the ReLu activation function. 23 | * 24 | * The user writes this code at a high level and relies on optimized code 25 | * written specifically for the chip they plan to run this on. The user can 26 | * expect to achieve better performance using these libraries than if they 27 | * tried to optimize it themself. This also makes the code more portable. 28 | * 29 | * NOTE: Often C++ is not the language of choice for this style of coding 30 | * since it is so low-level (with many programmers using Python); 31 | * however, this example works without loss of generality. 32 | */ 33 | 34 | // Initialize the FC input and weight matrices. Here we assume they were 35 | // intiialized elsewhere for this demo. 36 | Tensor FC_INPUT; 37 | Tensor FC_WEIGHT; 38 | 39 | // Perform the matrix-multiply and relu function from the FC layer. 
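    // With the shapes described in the comment above (batch 256, input 512, output 1024), FC_INPUT is 256x512 and FC_WEIGHT is 512x1024, so FC_OUTPUT and OUT below are both 256x1024.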
40 | Tensor FC_OUTPUT = matmul(FC_INPUT, FC_WEIGHT); 41 | Tensor OUT = relu(FC_OUTPUT); 42 | } 43 | 44 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/demo4-5-loops.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main() -> memref<256x1024xf32> { 3 | %c2 = arith.constant 2 : index 4 | %c512 = arith.constant 512 : index 5 | %c1 = arith.constant 1 : index 6 | %c128 = arith.constant 128 : index 7 | %c0 = arith.constant 0 : index 8 | %cst = arith.constant 0.000000e+00 : f32 9 | %alloc = memref.alloc() : memref<1x1xf32> 10 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 11 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 12 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 13 | scf.for %arg0 = %c0 to %c128 step %c1 { 14 | scf.for %arg1 = %c0 to %c512 step %c1 { 15 | scf.for %arg2 = %c0 to %c2 step %c1 { 16 | scf.for %arg3 = %c0 to %c2 step %c1 { 17 | memref.store %cst, %alloc[%c0, %c0] : memref<1x1xf32> 18 | scf.for %arg4 = %c0 to %c512 step %c1 { 19 | %7 = arith.muli %arg0, %c2 overflow : index 20 | %8 = arith.addi %arg2, %7 : index 21 | %9 = memref.load %alloc_0[%8, %arg4] : memref<256x512xf32> 22 | %10 = arith.muli %arg1, %c2 overflow : index 23 | %11 = arith.addi %arg3, %10 : index 24 | %12 = memref.load %alloc_1[%arg4, %11] : memref<512x1024xf32> 25 | %13 = memref.load %alloc[%c0, %c0] : memref<1x1xf32> 26 | %14 = arith.mulf %9, %12 : f32 27 | %15 = arith.addf %13, %14 : f32 28 | memref.store %15, %alloc[%c0, %c0] : memref<1x1xf32> 29 | } 30 | %0 = memref.load %alloc[%c0, %c0] : memref<1x1xf32> 31 | %1 = arith.cmpf ugt, %0, %cst : f32 32 | %2 = arith.select %1, %0, %cst : f32 33 | %3 = arith.muli %arg0, %c2 overflow : index 34 | %4 = arith.addi %arg2, %3 : index 35 | %5 = arith.muli %arg1, %c2 overflow : index 36 | %6 = arith.addi %arg3, %5 : index 37 | memref.store %2, %alloc_2[%4, %6] : memref<256x1024xf32> 38 | } 39 | } 40 | } 41 | } 42 | return %alloc_2 : memref<256x1024xf32> 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /demo4-optimizing-mlir-with-affine-analysis/optimize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | LLVM_BIN_DIR=$SCRIPT_DIR/../build/bin 7 | 8 | # Lets take the same example from Demo 2 again. 9 | $LLVM_BIN_DIR/mlir-opt \ 10 | $SCRIPT_DIR/../demo2-entering-mlir/demo2.mlir \ 11 | -o $SCRIPT_DIR/demo4-0-linalg-on-tensor.mlir 12 | 13 | # Bufferize it. 14 | $LLVM_BIN_DIR/mlir-opt \ 15 | $SCRIPT_DIR/demo4-0-linalg-on-tensor.mlir \ 16 | -one-shot-bufferize="bufferize-function-boundaries=true" \ 17 | -o $SCRIPT_DIR/demo4-1-linalg-on-memref.mlir 18 | 19 | # Then lower to the Affine level of abstraction (instead of loops). 20 | # In the Affine dialect, it provides analytical models to represent how memory 21 | # is accessed in a kernel. This is incredibly useful when performing 22 | # optimizations on the buffer level. 23 | $LLVM_BIN_DIR/mlir-opt \ 24 | $SCRIPT_DIR/demo4-1-linalg-on-memref.mlir \ 25 | -convert-linalg-to-affine-loops \ 26 | -o $SCRIPT_DIR/demo4-2-affine.mlir 27 | 28 | # One optimization we can perform is loop fusion. The kernel we are optimizing 29 | # has two loops with similar iteration loops. 
The Affine dialect is able to 30 | # analyze the memory access patterns of these two loops and recognize that they 31 | # can be fused safely. This cannot be done in the loops dialect, since it cannot 32 | # be certain that the loops are independent. The benefit of loop fusion in this 33 | # case is that it removes the allocation of a 1MB memref! Beyond just saving 34 | # memory, the reduces the memory footprint of the entire kernel which can 35 | # improve cache locality. 36 | $LLVM_BIN_DIR/mlir-opt \ 37 | $SCRIPT_DIR/demo4-2-affine.mlir \ 38 | -affine-loop-fusion \ 39 | -o $SCRIPT_DIR/demo4-3-affine-fused.mlir 40 | 41 | # Speaking of cache locality, tiling is another technique used to improve the 42 | # cache locality. My computer has an L2 cache size of 1MiB, which is around 43 | # 1024 kB. We can tell the affine-loop-tile pass that we have this cache size 44 | # and it will tile the loops to optimize the core parts of the code to have a 45 | # footprint of 1024 kB. 46 | # Note: This can make the MLIR code look very strange since it does some 47 | # weirdness with the induction variables. To make it more readible for 48 | # this tutorial, I normalize and canonicalize everything. 49 | $LLVM_BIN_DIR/mlir-opt \ 50 | $SCRIPT_DIR/demo4-3-affine-fused.mlir \ 51 | -affine-loop-tile="cache-size=1024" \ 52 | -affine-loop-normalize \ 53 | -canonicalize \ 54 | -o $SCRIPT_DIR/demo4-4-affine-fused-tiled.mlir 55 | 56 | # Once we have completed all of the Affine analysis we wish to perform, we lower 57 | # out of the affine dialect into the scf and memref dialects. This can then be 58 | # further lowered into LLVM, as shown in Demo 3. 59 | $LLVM_BIN_DIR/mlir-opt \ 60 | $SCRIPT_DIR/demo4-4-affine-fused-tiled.mlir \ 61 | -lower-affine \ 62 | -canonicalize \ 63 | -o $SCRIPT_DIR/demo4-5-loops.mlir 64 | 65 | -------------------------------------------------------------------------------- /demo3-lowering-mlir/lower.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | LLVM_BIN_DIR=$SCRIPT_DIR/../build/bin 7 | 8 | # Clean up demo 2. This is just to put a clean starting point in this folder. 9 | # linalg-on-tensor is the entry point for this flow and represents the logical 10 | # computation the user wishes to perform, without any information on the device 11 | # we are targetting. 12 | $LLVM_BIN_DIR/mlir-opt \ 13 | $SCRIPT_DIR/../demo2-entering-mlir/demo2.mlir \ 14 | -o $SCRIPT_DIR/demo3-0-linalg-on-tensor.mlir 15 | 16 | # Lower tensors to memrefs. Since we plan on targeting CPUs, we cannot stay at 17 | # the tensor abstraction since every chunk of data needs to have an allocated 18 | # buffer somewhere in memory. We can make use of the one-shot-bufferize pass 19 | # in MLIR to convert the Tensors into MemRefs. MemRefs are practically wrappers 20 | # around allocated pointers to memory with shape information. 21 | $LLVM_BIN_DIR/mlir-opt \ 22 | $SCRIPT_DIR/demo3-0-linalg-on-tensor.mlir \ 23 | -one-shot-bufferize="bufferize-function-boundaries=true" \ 24 | -canonicalize \ 25 | -o $SCRIPT_DIR/demo3-1-linalg-on-memref.mlir 26 | 27 | # Convert the linalg operations into loops in the SCF dialect. We are assuming 28 | # that our target device cannot compute MatMul and ReLU directly; so we convert 29 | # these kernels into loops. This is a lower level of abstraction than the linalg 30 | # algorithms themselves and closer to what a CPU can execute. 
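# (Concretely, the linalg.matmul becomes a triply nested scf.for loop of memref.load, arith.mulf/arith.addf, and memref.store operations, and the ReLU becomes a doubly nested loop of arith.cmpf/arith.select; see demo3-2-loops.mlir for the generated code.)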
31 | $LLVM_BIN_DIR/mlir-opt \ 32 | $SCRIPT_DIR/demo3-1-linalg-on-memref.mlir \ 33 | -convert-linalg-to-loops \ 34 | -o $SCRIPT_DIR/demo3-2-loops.mlir 35 | 36 | # Convert the loops into branches. Many CPU architectures use branch instructions 37 | # to represent loops. We lower the loops in our kernel into branches; further 38 | # lowering the control flow abstraction to as close to CPUs as we can. 39 | $LLVM_BIN_DIR/mlir-opt \ 40 | $SCRIPT_DIR/demo3-2-loops.mlir \ 41 | -convert-scf-to-cf \ 42 | -o $SCRIPT_DIR/demo3-3-branches.mlir 43 | 44 | # Finally, we convert everything to the LLVM dialect. We have lowered the 45 | # abstraction of the data and control flow low enough that we can convert 46 | # basically one-to-one to the LLVM dialect. We target LLVM here since LLVM 47 | # contains standard backend code which can generate real assembly for the kernel 48 | # we wrote. 49 | $LLVM_BIN_DIR/mlir-opt \ 50 | $SCRIPT_DIR/demo3-3-branches.mlir \ 51 | -convert-func-to-llvm \ 52 | -convert-cf-to-llvm \ 53 | -finalize-memref-to-llvm \ 54 | -convert-arith-to-llvm \ 55 | -reconcile-unrealized-casts \ 56 | -canonicalize \ 57 | -o $SCRIPT_DIR/demo3-4-llvm.mlir 58 | 59 | # Once we are fully in the LLVM dialect, we can use the mlir-translate tool to 60 | # convert the LLVM dialect MLIR code into LLVMIR. 61 | $LLVM_BIN_DIR/mlir-translate \ 62 | $SCRIPT_DIR/demo3-4-llvm.mlir \ 63 | --mlir-to-llvmir \ 64 | -o $SCRIPT_DIR/demo3-5.ll 65 | 66 | # This LLVMIR code can then be further optimized and lowered into assembly code 67 | # to run on a CPU. This is not shown for this tutorial. 68 | 69 | -------------------------------------------------------------------------------- /demo3-lowering-mlir/demo3-3-branches.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | func.func @main() -> memref<256x1024xf32> { 3 | %c512 = arith.constant 512 : index 4 | %c1024 = arith.constant 1024 : index 5 | %c1 = arith.constant 1 : index 6 | %c256 = arith.constant 256 : index 7 | %c0 = arith.constant 0 : index 8 | %cst = arith.constant 0.000000e+00 : f32 9 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 10 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 11 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 12 | cf.br ^bb1(%c0 : index) 13 | ^bb1(%0: index): // 2 preds: ^bb0, ^bb5 14 | %1 = arith.cmpi slt, %0, %c256 : index 15 | cf.cond_br %1, ^bb2, ^bb6 16 | ^bb2: // pred: ^bb1 17 | cf.br ^bb3(%c0 : index) 18 | ^bb3(%2: index): // 2 preds: ^bb2, ^bb4 19 | %3 = arith.cmpi slt, %2, %c1024 : index 20 | cf.cond_br %3, ^bb4, ^bb5 21 | ^bb4: // pred: ^bb3 22 | memref.store %cst, %alloc_1[%0, %2] : memref<256x1024xf32> 23 | %4 = arith.addi %2, %c1 : index 24 | cf.br ^bb3(%4 : index) 25 | ^bb5: // pred: ^bb3 26 | %5 = arith.addi %0, %c1 : index 27 | cf.br ^bb1(%5 : index) 28 | ^bb6: // pred: ^bb1 29 | cf.br ^bb7(%c0 : index) 30 | ^bb7(%6: index): // 2 preds: ^bb6, ^bb14 31 | %7 = arith.cmpi slt, %6, %c256 : index 32 | cf.cond_br %7, ^bb8, ^bb15 33 | ^bb8: // pred: ^bb7 34 | cf.br ^bb9(%c0 : index) 35 | ^bb9(%8: index): // 2 preds: ^bb8, ^bb13 36 | %9 = arith.cmpi slt, %8, %c1024 : index 37 | cf.cond_br %9, ^bb10, ^bb14 38 | ^bb10: // pred: ^bb9 39 | cf.br ^bb11(%c0 : index) 40 | ^bb11(%10: index): // 2 preds: ^bb10, ^bb12 41 | %11 = arith.cmpi slt, %10, %c512 : index 42 | cf.cond_br %11, ^bb12, ^bb13 43 | ^bb12: // pred: ^bb11 44 | %12 = memref.load %alloc[%6, %10] : memref<256x512xf32> 45 | %13 = memref.load 
%alloc_0[%10, %8] : memref<512x1024xf32> 46 | %14 = memref.load %alloc_1[%6, %8] : memref<256x1024xf32> 47 | %15 = arith.mulf %12, %13 : f32 48 | %16 = arith.addf %14, %15 : f32 49 | memref.store %16, %alloc_1[%6, %8] : memref<256x1024xf32> 50 | %17 = arith.addi %10, %c1 : index 51 | cf.br ^bb11(%17 : index) 52 | ^bb13: // pred: ^bb11 53 | %18 = arith.addi %8, %c1 : index 54 | cf.br ^bb9(%18 : index) 55 | ^bb14: // pred: ^bb9 56 | %19 = arith.addi %6, %c1 : index 57 | cf.br ^bb7(%19 : index) 58 | ^bb15: // pred: ^bb7 59 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 60 | cf.br ^bb16(%c0 : index) 61 | ^bb16(%20: index): // 2 preds: ^bb15, ^bb20 62 | %21 = arith.cmpi slt, %20, %c256 : index 63 | cf.cond_br %21, ^bb17, ^bb21 64 | ^bb17: // pred: ^bb16 65 | cf.br ^bb18(%c0 : index) 66 | ^bb18(%22: index): // 2 preds: ^bb17, ^bb19 67 | %23 = arith.cmpi slt, %22, %c1024 : index 68 | cf.cond_br %23, ^bb19, ^bb20 69 | ^bb19: // pred: ^bb18 70 | %24 = memref.load %alloc_1[%20, %22] : memref<256x1024xf32> 71 | %25 = arith.cmpf ugt, %24, %cst : f32 72 | %26 = arith.select %25, %24, %cst : f32 73 | memref.store %26, %alloc_2[%20, %22] : memref<256x1024xf32> 74 | %27 = arith.addi %22, %c1 : index 75 | cf.br ^bb18(%27 : index) 76 | ^bb20: // pred: ^bb18 77 | %28 = arith.addi %20, %c1 : index 78 | cf.br ^bb16(%28 : index) 79 | ^bb21: // pred: ^bb16 80 | return %alloc_2 : memref<256x1024xf32> 81 | } 82 | } 83 | 84 | -------------------------------------------------------------------------------- /demo1-mlir-motivation/linalg_lib.h: -------------------------------------------------------------------------------- 1 | /** 2 | * @file 3 | * @author Alex Singer 4 | * @date March 2025 5 | * @brief A fake linear algebra library for demo 1 of the beginner-friendly 6 | * MLIR tutorial. 7 | * 8 | * This is a basic linear algebra library that one may write. This is not 9 | * optimized in any way, but it is just to demonstrate that these kernels can 10 | * be optimized by very skilled experts to allow users to achieve excellent 11 | * performance without expertise. 12 | */ 13 | 14 | #pragma once 15 | 16 | #include "tensor.h" 17 | 18 | /** 19 | * @brief Perform a matrix-multiplication on two, fixed-shape tensors. 20 | */ 21 | template 22 | Tensor matmul(Tensor& A, Tensor& B) { 23 | /** 24 | * This is the most basic implementation of a matmul of two fixed-shape 25 | * tensors. 26 | * 27 | * This can be made more efficient by doing the following high-level 28 | * optimization techniques: 29 | * - Tiling: If you know the cache heirarchy of the device this will run 30 | * on, you can tile the matrices (localize which regions of the 31 | * matrices are being read/written from/to). This maximizes the 32 | * reuse of elements in the cache which can improve the hit rate. 33 | * - Packing: By allocating "scratch-pad" memory, one can further improve 34 | * the performance by storing tiled memory into continuous 35 | * arrays. 36 | * - Vectorization: One can convert these matrix multiplications into 37 | * vector multiplies and adds. 38 | * - Arch-Specific Instructions: If the target device has special 39 | * instructions, specific for matrix multiply 40 | * (like MAC or even a small MatMul), one can 41 | * use these instructions. 42 | * 43 | * However, notice that all of these optimizations require knowledge about 44 | * the underlying device that this kernel will run on. This is why this 45 | * library must be rewritten for different targets. 
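 * As a rough illustration only (the tile sizes TI/TJ/TK are hypothetical and would have to be tuned for a real cache hierarchy), a tiled version would wrap the loop nest below in outer loops that step one tile at a time: * for (size_t i0 = 0; i0 < M; i0 += TI) * for (size_t j0 = 0; j0 < N; j0 += TJ) * for (size_t k0 = 0; k0 < K; k0 += TK) * // ... run the i/j/k loops below only over this TI x TJ x TK tile ...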
46 | * 47 | * There are other libraries, like Halide or TVM, which use schedules to try 48 | * and improve this process; however, this still requires an expert to 49 | * choose appropriate schedules and tune them. 50 | */ 51 | Tensor C; 52 | for (size_t i = 0; i < M; i++) { 53 | for (size_t j = 0; j < N; j++) { 54 | C.get(i, j) = 0.f; 55 | for (size_t k = 0; k < K; k++) { 56 | C.get(i, j) += A.get(i, k) * B.get(k, j); 57 | } 58 | } 59 | } 60 | return C; 61 | } 62 | 63 | /** 64 | * @brief Perform an element-wise ReLu on all elements of the given fixed-shape 65 | * tensor. 66 | */ 67 | template 68 | Tensor relu(Tensor& IN) { 69 | /** 70 | * Similar to the matmul kernel above, this is the most basic implementation 71 | * of this kernel. 72 | * 73 | * The performance of this kernel can be improved using similar techniques 74 | * to the matmul kernel above. 75 | */ 76 | Tensor OUT; 77 | for (size_t i = 0; i < W; i++) { 78 | for (size_t j = 0; j < H; j++) { 79 | OUT.get(i, j) = std::max(IN.get(i, j), 0.f); 80 | } 81 | } 82 | return OUT; 83 | } 84 | -------------------------------------------------------------------------------- /demo3-lowering-mlir/demo3-5.ll: -------------------------------------------------------------------------------- 1 | ; ModuleID = 'LLVMDialectModule' 2 | source_filename = "LLVMDialectModule" 3 | 4 | declare ptr @malloc(i64) 5 | 6 | define { ptr, ptr, i64, [2 x i64], [2 x i64] } @main() { 7 | %1 = call ptr @malloc(i64 524352) 8 | %2 = ptrtoint ptr %1 to i64 9 | %3 = add i64 %2, 63 10 | %4 = urem i64 %3, 64 11 | %5 = sub i64 %3, %4 12 | %6 = inttoptr i64 %5 to ptr 13 | %7 = call ptr @malloc(i64 2097216) 14 | %8 = ptrtoint ptr %7 to i64 15 | %9 = add i64 %8, 63 16 | %10 = urem i64 %9, 64 17 | %11 = sub i64 %9, %10 18 | %12 = inttoptr i64 %11 to ptr 19 | %13 = call ptr @malloc(i64 1048640) 20 | %14 = ptrtoint ptr %13 to i64 21 | %15 = add i64 %14, 63 22 | %16 = urem i64 %15, 64 23 | %17 = sub i64 %15, %16 24 | %18 = inttoptr i64 %17 to ptr 25 | br label %19 26 | 27 | 19: ; preds = %31, %0 28 | %20 = phi i64 [ %32, %31 ], [ 0, %0 ] 29 | %21 = icmp slt i64 %20, 256 30 | br i1 %21, label %22, label %33 31 | 32 | 22: ; preds = %19 33 | br label %23 34 | 35 | 23: ; preds = %26, %22 36 | %24 = phi i64 [ %30, %26 ], [ 0, %22 ] 37 | %25 = icmp slt i64 %24, 1024 38 | br i1 %25, label %26, label %31 39 | 40 | 26: ; preds = %23 41 | %27 = mul i64 %20, 1024 42 | %28 = add i64 %27, %24 43 | %29 = getelementptr float, ptr %18, i64 %28 44 | store float 0.000000e+00, ptr %29, align 4 45 | %30 = add i64 %24, 1 46 | br label %23 47 | 48 | 31: ; preds = %23 49 | %32 = add i64 %20, 1 50 | br label %19 51 | 52 | 33: ; preds = %19 53 | br label %34 54 | 55 | 34: ; preds = %66, %33 56 | %35 = phi i64 [ %67, %66 ], [ 0, %33 ] 57 | %36 = icmp slt i64 %35, 256 58 | br i1 %36, label %37, label %68 59 | 60 | 37: ; preds = %34 61 | br label %38 62 | 63 | 38: ; preds = %64, %37 64 | %39 = phi i64 [ %65, %64 ], [ 0, %37 ] 65 | %40 = icmp slt i64 %39, 1024 66 | br i1 %40, label %41, label %66 67 | 68 | 41: ; preds = %38 69 | br label %42 70 | 71 | 42: ; preds = %45, %41 72 | %43 = phi i64 [ %63, %45 ], [ 0, %41 ] 73 | %44 = icmp slt i64 %43, 512 74 | br i1 %44, label %45, label %64 75 | 76 | 45: ; preds = %42 77 | %46 = mul i64 %35, 512 78 | %47 = add i64 %46, %43 79 | %48 = getelementptr float, ptr %6, i64 %47 80 | %49 = load float, ptr %48, align 4 81 | %50 = mul i64 %43, 1024 82 | %51 = add i64 %50, %39 83 | %52 = getelementptr float, ptr %12, i64 %51 84 | %53 = load float, ptr %52, align 4 85 | %54 = 
mul i64 %35, 1024 86 | %55 = add i64 %54, %39 87 | %56 = getelementptr float, ptr %18, i64 %55 88 | %57 = load float, ptr %56, align 4 89 | %58 = fmul float %49, %53 90 | %59 = fadd float %57, %58 91 | %60 = mul i64 %35, 1024 92 | %61 = add i64 %60, %39 93 | %62 = getelementptr float, ptr %18, i64 %61 94 | store float %59, ptr %62, align 4 95 | %63 = add i64 %43, 1 96 | br label %42 97 | 98 | 64: ; preds = %42 99 | %65 = add i64 %39, 1 100 | br label %38 101 | 102 | 66: ; preds = %38 103 | %67 = add i64 %35, 1 104 | br label %34 105 | 106 | 68: ; preds = %34 107 | %69 = call ptr @malloc(i64 1048640) 108 | %70 = ptrtoint ptr %69 to i64 109 | %71 = add i64 %70, 63 110 | %72 = urem i64 %71, 64 111 | %73 = sub i64 %71, %72 112 | %74 = inttoptr i64 %73 to ptr 113 | %75 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } poison, ptr %69, 0 114 | %76 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %75, ptr %74, 1 115 | %77 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %76, i64 0, 2 116 | %78 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %77, i64 256, 3, 0 117 | %79 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %78, i64 1024, 3, 1 118 | %80 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %79, i64 1024, 4, 0 119 | %81 = insertvalue { ptr, ptr, i64, [2 x i64], [2 x i64] } %80, i64 1, 4, 1 120 | br label %82 121 | 122 | 82: ; preds = %100, %68 123 | %83 = phi i64 [ %101, %100 ], [ 0, %68 ] 124 | %84 = icmp slt i64 %83, 256 125 | br i1 %84, label %85, label %102 126 | 127 | 85: ; preds = %82 128 | br label %86 129 | 130 | 86: ; preds = %89, %85 131 | %87 = phi i64 [ %99, %89 ], [ 0, %85 ] 132 | %88 = icmp slt i64 %87, 1024 133 | br i1 %88, label %89, label %100 134 | 135 | 89: ; preds = %86 136 | %90 = mul i64 %83, 1024 137 | %91 = add i64 %90, %87 138 | %92 = getelementptr float, ptr %18, i64 %91 139 | %93 = load float, ptr %92, align 4 140 | %94 = fcmp ugt float %93, 0.000000e+00 141 | %95 = select i1 %94, float %93, float 0.000000e+00 142 | %96 = mul i64 %83, 1024 143 | %97 = add i64 %96, %87 144 | %98 = getelementptr float, ptr %74, i64 %97 145 | store float %95, ptr %98, align 4 146 | %99 = add i64 %87, 1 147 | br label %86 148 | 149 | 100: ; preds = %86 150 | %101 = add i64 %83, 1 151 | br label %82 152 | 153 | 102: ; preds = %82 154 | ret { ptr, ptr, i64, [2 x i64], [2 x i64] } %81 155 | } 156 | 157 | !llvm.module.flags = !{!0} 158 | 159 | !0 = !{i32 2, !"Debug Info Version", i32 3} 160 | -------------------------------------------------------------------------------- /demo3-lowering-mlir/demo3-4-llvm.mlir: -------------------------------------------------------------------------------- 1 | module { 2 | llvm.func @malloc(i64) -> !llvm.ptr 3 | llvm.func @main() -> !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> { 4 | %0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 5 | %1 = llvm.mlir.constant(64 : index) : i64 6 | %2 = llvm.mlir.constant(512 : index) : i64 7 | %3 = llvm.mlir.constant(1024 : index) : i64 8 | %4 = llvm.mlir.constant(1 : index) : i64 9 | %5 = llvm.mlir.constant(256 : index) : i64 10 | %6 = llvm.mlir.constant(0 : index) : i64 11 | %7 = llvm.mlir.constant(0.000000e+00 : f32) : f32 12 | %8 = llvm.mlir.zero : !llvm.ptr 13 | %9 = llvm.getelementptr %8[131072] : (!llvm.ptr) -> !llvm.ptr, f32 14 | %10 = llvm.ptrtoint %9 : !llvm.ptr to i64 15 | %11 = llvm.add %10, %1 : i64 16 | %12 = llvm.call @malloc(%11) : (i64) -> !llvm.ptr 17 | %13 = llvm.ptrtoint %12 : !llvm.ptr to i64 18 | %14 = llvm.sub %1, %4 : 
i64 19 | %15 = llvm.add %13, %14 : i64 20 | %16 = llvm.urem %15, %1 : i64 21 | %17 = llvm.sub %15, %16 : i64 22 | %18 = llvm.inttoptr %17 : i64 to !llvm.ptr 23 | %19 = llvm.getelementptr %8[524288] : (!llvm.ptr) -> !llvm.ptr, f32 24 | %20 = llvm.ptrtoint %19 : !llvm.ptr to i64 25 | %21 = llvm.add %20, %1 : i64 26 | %22 = llvm.call @malloc(%21) : (i64) -> !llvm.ptr 27 | %23 = llvm.ptrtoint %22 : !llvm.ptr to i64 28 | %24 = llvm.sub %1, %4 : i64 29 | %25 = llvm.add %23, %24 : i64 30 | %26 = llvm.urem %25, %1 : i64 31 | %27 = llvm.sub %25, %26 : i64 32 | %28 = llvm.inttoptr %27 : i64 to !llvm.ptr 33 | %29 = llvm.getelementptr %8[262144] : (!llvm.ptr) -> !llvm.ptr, f32 34 | %30 = llvm.ptrtoint %29 : !llvm.ptr to i64 35 | %31 = llvm.add %30, %1 : i64 36 | %32 = llvm.call @malloc(%31) : (i64) -> !llvm.ptr 37 | %33 = llvm.ptrtoint %32 : !llvm.ptr to i64 38 | %34 = llvm.sub %1, %4 : i64 39 | %35 = llvm.add %33, %34 : i64 40 | %36 = llvm.urem %35, %1 : i64 41 | %37 = llvm.sub %35, %36 : i64 42 | %38 = llvm.inttoptr %37 : i64 to !llvm.ptr 43 | llvm.br ^bb1(%6 : i64) 44 | ^bb1(%39: i64): // 2 preds: ^bb0, ^bb5 45 | %40 = llvm.icmp "slt" %39, %5 : i64 46 | llvm.cond_br %40, ^bb2, ^bb6 47 | ^bb2: // pred: ^bb1 48 | llvm.br ^bb3(%6 : i64) 49 | ^bb3(%41: i64): // 2 preds: ^bb2, ^bb4 50 | %42 = llvm.icmp "slt" %41, %3 : i64 51 | llvm.cond_br %42, ^bb4, ^bb5 52 | ^bb4: // pred: ^bb3 53 | %43 = llvm.mul %39, %3 : i64 54 | %44 = llvm.add %43, %41 : i64 55 | %45 = llvm.getelementptr %38[%44] : (!llvm.ptr, i64) -> !llvm.ptr, f32 56 | llvm.store %7, %45 : f32, !llvm.ptr 57 | %46 = llvm.add %41, %4 : i64 58 | llvm.br ^bb3(%46 : i64) 59 | ^bb5: // pred: ^bb3 60 | %47 = llvm.add %39, %4 : i64 61 | llvm.br ^bb1(%47 : i64) 62 | ^bb6: // pred: ^bb1 63 | llvm.br ^bb7(%6 : i64) 64 | ^bb7(%48: i64): // 2 preds: ^bb6, ^bb14 65 | %49 = llvm.icmp "slt" %48, %5 : i64 66 | llvm.cond_br %49, ^bb8, ^bb15 67 | ^bb8: // pred: ^bb7 68 | llvm.br ^bb9(%6 : i64) 69 | ^bb9(%50: i64): // 2 preds: ^bb8, ^bb13 70 | %51 = llvm.icmp "slt" %50, %3 : i64 71 | llvm.cond_br %51, ^bb10, ^bb14 72 | ^bb10: // pred: ^bb9 73 | llvm.br ^bb11(%6 : i64) 74 | ^bb11(%52: i64): // 2 preds: ^bb10, ^bb12 75 | %53 = llvm.icmp "slt" %52, %2 : i64 76 | llvm.cond_br %53, ^bb12, ^bb13 77 | ^bb12: // pred: ^bb11 78 | %54 = llvm.mul %48, %2 : i64 79 | %55 = llvm.add %54, %52 : i64 80 | %56 = llvm.getelementptr %18[%55] : (!llvm.ptr, i64) -> !llvm.ptr, f32 81 | %57 = llvm.load %56 : !llvm.ptr -> f32 82 | %58 = llvm.mul %52, %3 : i64 83 | %59 = llvm.add %58, %50 : i64 84 | %60 = llvm.getelementptr %28[%59] : (!llvm.ptr, i64) -> !llvm.ptr, f32 85 | %61 = llvm.load %60 : !llvm.ptr -> f32 86 | %62 = llvm.mul %48, %3 : i64 87 | %63 = llvm.add %62, %50 : i64 88 | %64 = llvm.getelementptr %38[%63] : (!llvm.ptr, i64) -> !llvm.ptr, f32 89 | %65 = llvm.load %64 : !llvm.ptr -> f32 90 | %66 = llvm.fmul %57, %61 : f32 91 | %67 = llvm.fadd %65, %66 : f32 92 | %68 = llvm.mul %48, %3 : i64 93 | %69 = llvm.add %68, %50 : i64 94 | %70 = llvm.getelementptr %38[%69] : (!llvm.ptr, i64) -> !llvm.ptr, f32 95 | llvm.store %67, %70 : f32, !llvm.ptr 96 | %71 = llvm.add %52, %4 : i64 97 | llvm.br ^bb11(%71 : i64) 98 | ^bb13: // pred: ^bb11 99 | %72 = llvm.add %50, %4 : i64 100 | llvm.br ^bb9(%72 : i64) 101 | ^bb14: // pred: ^bb9 102 | %73 = llvm.add %48, %4 : i64 103 | llvm.br ^bb7(%73 : i64) 104 | ^bb15: // pred: ^bb7 105 | %74 = llvm.getelementptr %8[262144] : (!llvm.ptr) -> !llvm.ptr, f32 106 | %75 = llvm.ptrtoint %74 : !llvm.ptr to i64 107 | %76 = llvm.add %75, %1 : i64 108 | %77 = 
llvm.call @malloc(%76) : (i64) -> !llvm.ptr 109 | %78 = llvm.ptrtoint %77 : !llvm.ptr to i64 110 | %79 = llvm.sub %1, %4 : i64 111 | %80 = llvm.add %78, %79 : i64 112 | %81 = llvm.urem %80, %1 : i64 113 | %82 = llvm.sub %80, %81 : i64 114 | %83 = llvm.inttoptr %82 : i64 to !llvm.ptr 115 | %84 = llvm.insertvalue %77, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 116 | %85 = llvm.insertvalue %83, %84[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 117 | %86 = llvm.insertvalue %6, %85[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 118 | %87 = llvm.insertvalue %5, %86[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 119 | %88 = llvm.insertvalue %3, %87[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 120 | %89 = llvm.insertvalue %3, %88[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 121 | %90 = llvm.insertvalue %4, %89[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 122 | llvm.br ^bb16(%6 : i64) 123 | ^bb16(%91: i64): // 2 preds: ^bb15, ^bb20 124 | %92 = llvm.icmp "slt" %91, %5 : i64 125 | llvm.cond_br %92, ^bb17, ^bb21 126 | ^bb17: // pred: ^bb16 127 | llvm.br ^bb18(%6 : i64) 128 | ^bb18(%93: i64): // 2 preds: ^bb17, ^bb19 129 | %94 = llvm.icmp "slt" %93, %3 : i64 130 | llvm.cond_br %94, ^bb19, ^bb20 131 | ^bb19: // pred: ^bb18 132 | %95 = llvm.mul %91, %3 : i64 133 | %96 = llvm.add %95, %93 : i64 134 | %97 = llvm.getelementptr %38[%96] : (!llvm.ptr, i64) -> !llvm.ptr, f32 135 | %98 = llvm.load %97 : !llvm.ptr -> f32 136 | %99 = llvm.fcmp "ugt" %98, %7 : f32 137 | %100 = llvm.select %99, %98, %7 : i1, f32 138 | %101 = llvm.mul %91, %3 : i64 139 | %102 = llvm.add %101, %93 : i64 140 | %103 = llvm.getelementptr %83[%102] : (!llvm.ptr, i64) -> !llvm.ptr, f32 141 | llvm.store %100, %103 : f32, !llvm.ptr 142 | %104 = llvm.add %93, %4 : i64 143 | llvm.br ^bb18(%104 : i64) 144 | ^bb20: // pred: ^bb18 145 | %105 = llvm.add %91, %4 : i64 146 | llvm.br ^bb16(%105 : i64) 147 | ^bb21: // pred: ^bb16 148 | llvm.return %90 : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> 149 | } 150 | } 151 | 152 | -------------------------------------------------------------------------------- /resources/LinalgOnTensorDFG.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /demo2-entering-mlir/demo2.mlir: -------------------------------------------------------------------------------- 1 | /// @file 2 | /// @author Alex Singer 3 | /// @date March 2025 4 | /// @brief The high-level code from demo 1 converted into MLIR. 5 | /// 6 | /// Usually tools are used to automatically convert the user's code into MLIR 7 | /// code. For this tutorial I wrote this kernel by hand, but it should be 8 | /// trivial to build a tool to do this conversion for this particular example. 9 | 10 | // This is used by the linalg.generic op later. See that section for more info. 11 | #map = affine_map<(i, j) -> (i, j)> 12 | 13 | // This is the top-level container operation. 
It is part of the "builtin" dialect. 14 | // 15 | // This is the first occurrence in this tutorial of something called an 16 | // "Operation" in MLIR. Operations have application-specific semantics, meaning 17 | // that their functionality is defined by the context in which they are used. 18 | // Operations have names, may return results, may take operands, may have 19 | // properties, may have attributes, and may have regions which can contain other 20 | // Operations. In this case, the ModuleOp contains one region. 21 | // 22 | // A dialect in MLIR is a collection of Operations at some level of abstraction. 23 | // For the builtin dialect, these are used for MLIR-specific organization. The 24 | // built-in dialect often has no meaning to the code itself, and is used within 25 | // MLIR to help with the language specification of MLIR. 26 | // 27 | // This operation contains all of the code which will be compiled through MLIR. 28 | // It is used within the compiler to apply overall attributes to all operations 29 | // within. For example, this is where the target device triple is stored. 30 | module { 31 | 32 | // This is the function operation, part of the "func" dialect. This dialect 33 | // contains operations that have to do with defining and calling functions. 34 | // FuncOps in MLIR have a name ("main" in this case), define arguments to the 35 | // function, and define the output type. The function op contains one region 36 | // containing ops which it will "execute" in order. The semantics of "executing" 37 | // these ops in order come from the abstraction of the FuncOp itself, not from 38 | // MLIR's specifications. 39 | // Here, we have a function named "main"; it takes no arguments and returns a 40 | // tensor (the reason why will be described later). 41 | func.func @main() -> tensor<256x1024xf32> { 42 | // This is the first occurrence of a "Value" in this tutorial. Values are 43 | // what may get returned from operations. In MLIR, these are named using the 44 | // "%" symbol. Values in MLIR are Static Single Assignment (SSA). This means 45 | // that their value does not mutate during execution and they are only 46 | // assigned to once. This is a useful property since it makes the Data Flow 47 | // Graph (DFG) of functions directed and acyclic, which enables many compiler 48 | // optimizations. 49 | // 50 | // Values in MLIR always have a "type". In this case, this value is a Tensor 51 | // type. The Tensor type specifies a multi-dimensional array; however, there 52 | // is NO concept of memory (i.e. how the data is stored in the device). This 53 | // is by design. Tensors only represent a "chunk" of data, that's it. This is 54 | // a useful abstraction since it allows us to deal with compute at a higher 55 | // level of abstraction (not caring about how the buffers are allocated). 56 | // 57 | // In demo1, the FC_INPUT and FC_WEIGHT matrices were just initialized with 58 | // random values (not specifically zero-initialized). The "tensor" dialect 59 | // provides ops to work with tensors. In this case, we use the TensorEmpty 60 | // op to create an empty tensor of the given shape and type. 61 | %FC_INPUT = tensor.empty() : tensor<256x512xf32> 62 | %FC_WEIGHT = tensor.empty() : tensor<512x1024xf32> 63 | 64 | // Here we perform our first high-level linear-algebra operation. At a high 65 | // level, all we care about is that FC_INPUT and FC_WEIGHT are multiplied 66 | // together. We do not care about the algorithm used to perform this matrix 67 | // multiplication. Luckily, as part of the Linalg dialect, a matmul op 68 | // exists which does exactly this! This MatMulOp takes two matrices as inputs 69 | // and produces one matrix as output. What you may notice is that I have to 70 | // create another tensor to act as my output. This is a quirk of the linalg 71 | // dialect. Most ops in the linalg dialect need to know what is in the output 72 | // tensor before the operation occurred. Some operations, for example, may 73 | // not set every value in the tensor and the user may want to zero initialize 74 | // the output tensor. In this case, we must set the initial value of the 75 | // output tensor to all 0s since matrix multiplies use multiply-accumulate 76 | // instructions which accumulate into the output buffer. 77 | %c_init = arith.constant 0.0 : f32 78 | %matmul_init = tensor.splat %c_init : tensor<256x1024xf32> 79 | %FC_OUTPUT = linalg.matmul 80 | ins(%FC_INPUT, %FC_WEIGHT : tensor<256x512xf32>, tensor<512x1024xf32>) 81 | outs(%matmul_init : tensor<256x1024xf32>) -> tensor<256x1024xf32> 82 | 83 | // Our second high-level linear algebra operation that we wish to perform is 84 | // an elementwise ReLU operation. Currently, the linalg dialect does not 85 | // contain the ReLU activation function. MLIR generally contains ops for all 86 | // basic operations people may need, but ReLU may just not be common enough. 87 | // Luckily, in the linalg dialect there is a way to specify a "generic" 88 | // linear algebra operation. Just like matmul before, we need to specify 89 | // inputs and outputs; but now we also need to specify the function of this 90 | // operation. We start by specifying how the matrices will be indexed and 91 | // iterated over. The indexing maps I provided basically just say that we 92 | // index the matrices without transposing. The iterator types I provided 93 | // basically just say that we can iterate in any order (there are no data 94 | // dependencies between iterations). Next, we specify the function of this 95 | // operation. In this case, I used the "arithmetic" dialect to describe a 96 | // compare and select that will set the input value to zero if it is negative. 97 | // Since every value of this relu's output is written, and %out is 98 | // never read, we can allocate the init tensor without setting it to any 99 | // particular value. 100 | %relu_init = tensor.empty() : tensor<256x1024xf32> 101 | %OUT = linalg.generic { indexing_maps = [#map, #map], 102 | iterator_types = ["parallel", "parallel"]} 103 | ins(%FC_OUTPUT : tensor<256x1024xf32>) 104 | outs(%relu_init : tensor<256x1024xf32>) { 105 | ^bb0(%in: f32, %out: f32): 106 | %c0 = arith.constant 0.0 : f32 107 | %cmp = arith.cmpf ugt, %in, %c0 : f32 108 | %sel = arith.select %cmp, %in, %c0 : f32 109 | linalg.yield %sel : f32 110 | } -> tensor<256x1024xf32> 111 | 112 | // Return the final tensor result. This differs from the original main 113 | // function since MLIR is often smart enough to realize that this tensor is 114 | // never used and will optimize everything in this kernel away. To keep that 115 | // from happening for this tutorial, I just returned the result.
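    // Note that the type listed on this func.return must match the result type declared on func.func @main above: tensor<256x1024xf32>.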
116 | func.return %OUT : tensor<256x1024xf32> 117 | } 118 | 119 | } 120 | 121 | -------------------------------------------------------------------------------- /resources/LoweringDialectDiagram.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | -------------------------------------------------------------------------------- /resources/AffineLoweringDialectDiagram.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | -------------------------------------------------------------------------------- /resources/AffineAnalysisIntro.drawio: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MLIR Beginner-Friendly Tutorial 2 | 3 | This is a beginner-friendly tutorial on MLIR from the perspective of a user of 4 | MLIR, not a compiler engineer. This tutorial will introduce why MLIR exists and 5 | how it is used to compile code at different levels of abstraction. This tutorial 6 | will focus on working with the "core" dialects of MLIR. 7 | 8 | Part 1 of this tutorial was presented during the weekly Reading Group at the 9 | University of Toronto. The recording of this presentation can be found here: 10 | https://youtu.be/Uno_XhtkT5E 11 | 12 | Part 2 of this tutorial was presented during the next week's Reading Group at the 13 | University of Toronto. 
The recording of this presentation can be found here: 14 | https://youtu.be/l0O4Vbc3l5c 15 | 16 | # Demo 0: Building MLIR 17 | 18 | In this repository, you will find a Git submodule of LLVM. This was the most 19 | recent version of LLVM that was available when I wrote this tutorial. There is 20 | nothing special about it, but I provided it here so the results of the tutorial 21 | will always match in the future. Make sure you have initialized the submodule 22 | using: 23 | ```sh 24 | git submodule init 25 | git submodule update 26 | ``` 27 | 28 | These build instructions are based on the Getting Started page provided by MLIR: 29 | https://mlir.llvm.org/getting_started/ 30 | 31 | This tutorial uses the Ninja generator to build MLIR, this can be installed using: 32 | ```sh 33 | apt-get install ninja-build 34 | ``` 35 | 36 | Create a build folder and run the following CMake command. After, run the final 37 | command to build MLIR and check that it built successfully. 38 | 39 | ```sh 40 | mkdir build 41 | cd build 42 | cmake -G Ninja ../llvm-project/llvm \ 43 | -DLLVM_ENABLE_PROJECTS=mlir \ 44 | -DLLVM_BUILD_EXAMPLES=ON \ 45 | -DLLVM_TARGETS_TO_BUILD="Native;NVPTX;AMDGPU" \ 46 | -DCMAKE_BUILD_TYPE=Release \ 47 | -DLLVM_ENABLE_ASSERTIONS=ON 48 | ninja check-mlir 49 | ``` 50 | 51 | Note: That last command will take a very long time to run. This will build all 52 | of the LLVM and MLIR code necessary to check that MLIR was built correctly for 53 | your system. I recommend giving it many cores using `-j`. 54 | 55 | After this command completes, you should have the executables you need to perform 56 | the rest of this tutorial. 57 | 58 | # Demo 1: Motivating MLIR 59 | 60 | Demo 1 is a demonstration on what writing high-performance code is like without 61 | MLIR. It demonstrates a common technique where the user writes their code at a 62 | high level (just basic linear-algebra operations) and use high-performance 63 | libraries to make the code run fast. This is the same technique that is used 64 | for libraries like TensorFlow and PyTorch in Python. This demo is written in C++ so 65 | the actual instructions can be shown in more detail, but the same concepts apply 66 | to Python. 67 | 68 | `demo1.cpp` shows the high-level code that a user may write. This code is a 69 | basic Fully-Connected layer one might find in a Deep Neural Network. To 70 | implement this layer, one just needs to take the input vector (which is often 71 | represented as a matrix due to batching) and multiply it by a weight matrix. 72 | The output of this is then passed into an activation function (in this case 73 | ReLU) to get the output of the layer. The user writes this code at a high-level, 74 | without concern for performance, which makes the code easier to write and work 75 | with. It also makes the code more portable to other targets. 76 | 77 | In order to write such high-level code, one must make use of high-performance 78 | libraries. In this case, I wrote a basic library consisting of a tensor class 79 | (`tensor.h`) and a linear algebra kernel library (`linalg_lib.h`). The code I 80 | wrote in these libraries is the bare minimum required to make a library like 81 | this and is not optimized at all. 
The key idea of this library is that the user 82 | has no control over what is written here (sometimes this library is not even 83 | accessible and is hidden as a binary); experts on the architecture build these 84 | libraries using in-depth knowledge about the device (BLAS is a good example of 85 | one of these libraries). The MatMul kernel in `linalg_lib.h` discusses different 86 | optimizations one may perform for improved performance; however, every 87 | optimization requires in-depth knowledge of the target architecture. 88 | 89 | The benefit of this approach is that users do not need to be experts on the 90 | device they are programming in order to achieve high performance 91 | for their applications (like AI, HPC, etc.). This also allows for portability 92 | between different accelerators and generations of chips. The downside is that 93 | these high-performance libraries create a large barrier to entry for new chips 94 | and require a lot of time and knowledge to design. 95 | 96 | The key thing to notice from this demo is that the optimizations performed on 97 | these kernels are often the same. To get good performance on MatMul, one has to 98 | tile; but how much to tile is what changes. 99 | There are scheduling libraries that 100 | try to resolve this issue by separating the kernels from the optimizations; 101 | however, ideally the compiler should be leveraged to perform these optimizations 102 | since it knows a lot of information about the device architecture. 103 | The problem is that, with traditional compilation techniques, in order to compile 104 | this code to an executable, the code 105 | must first be written as bare for loops and memory accesses, which lowers the abstraction 106 | level too early for the compiler to perform these optimizations. 107 | This is where MLIR comes in. MLIR is a compiler framework which allows for 108 | different levels of abstraction ("dialects") to be represented within its Intermediate 109 | Representation. This allows compiler passes to be written which perform 110 | these high-level optimizations on kernels. These passes and dialects, if written 111 | well, can be reused by different compiler flows to achieve good performance on all 112 | devices (even hardware accelerators, which usually remain at very high levels of 113 | abstraction). 114 | 115 | There are other motivations for using MLIR; this is just the one that I felt 116 | best encapsulates the "Multi-Level" aspect of MLIR. 117 | 118 | # Demo 2: Entering MLIR 119 | 120 | Now that we have motivated why we may want to use MLIR, let's convert the high-level 121 | code from demo 1 into MLIR: 122 | ```cpp 123 | int main(void) { 124 | Tensor FC_INPUT; 125 | Tensor FC_WEIGHT; 126 | Tensor FC_OUTPUT = matmul(FC_INPUT, FC_WEIGHT); 127 | Tensor OUT = relu(FC_OUTPUT); 128 | } 129 | ``` 130 | 131 | Often, custom tools are created which can convert code such as the code above 132 | into MLIR automatically. For this example, such a tool is trivial to write since 133 | I chose the API for the library to match the Linalg on Tensor level of abstraction 134 | in MLIR. Oftentimes people do things the other way around: they build a level of 135 | abstraction in MLIR which matches their pre-existing APIs; however, for this 136 | tutorial, I wanted to use the core MLIR dialects. 137 | 138 | I chose to enter the Linalg on Tensor level of abstraction for this demo since 139 | this is a common abstraction used by the key users of MLIR (such as TensorFlow and 140 | PyTorch).
It is also a very interesting level of abstraction. 141 | 142 | I converted the code above into MLIR code by hand. This code can be found in 143 | `demo2.mlir`. For this tutorial it does not matter if the MLIR code was generated 144 | by a tool or not. In this MLIR file, you will find comments where I describe how 145 | to read the Intermediate Representation at this level of abstraction. 146 | ```mlir 147 | #map = affine_map<(i, j) -> (i, j)> 148 | module { 149 | func.func @main() -> tensor<256x1024xf32> { 150 | %FC_INPUT = tensor.empty() : tensor<256x512xf32> 151 | %FC_WEIGHT = tensor.empty() : tensor<512x1024xf32> 152 | %c_init = arith.constant 0.0 : f32 153 | %matmul_init = tensor.splat %c_init : tensor<256x1024xf32> 154 | %FC_OUTPUT = linalg.matmul 155 | ins(%FC_INPUT, %FC_WEIGHT : tensor<256x512xf32>, tensor<512x1024xf32>) 156 | outs(%matmul_init : tensor<256x1024xf32>) -> tensor<256x1024xf32> 157 | %relu_init = tensor.empty() : tensor<256x1024xf32> 158 | %OUT = linalg.generic { indexing_maps = [#map, #map], 159 | iterator_types = ["parallel", "parallel"]} 160 | ins(%FC_OUTPUT : tensor<256x1024xf32>) 161 | outs(%relu_init : tensor<256x1024xf32>) { 162 | ^bb0(%in: f32, %out: f32): 163 | %c0 = arith.constant 0.0 : f32 164 | %cmp = arith.cmpf ugt, %in, %c0 : f32 165 | %sel = arith.select %cmp, %in, %c0 : f32 166 | linalg.yield %sel : f32 167 | } -> tensor<256x1024xf32> 168 | func.return %OUT : tensor<256x1024xf32> 169 | } } 170 | ``` 171 | 172 | Since tensors are immutable and SSA, this allows us to create a Data Flow Graph 173 | (DFG) of the above kernel: 174 | 175 | ![Linalg on tensor Data Flow Graph](resources/LinalgOnTensorDFG.png) 176 | 177 | The MLIR infrastructure includes methods of traversing DFGs. 178 | A special property of this DFG is that it is directed and acyclic. This has major 179 | benefits for compiler optimizations. For example, we can notice that the result 180 | of the MatMul is fed directly into the ReLU; if our device has a special 181 | instruction that can perform MatMul + ReLU (for example, a specialized hardware 182 | accelerator), we can directly fuse these two Linalg ops together. 183 | This property of building a DFG at the data level is why this level of abstraction 184 | is sometimes called the "graph-level". 185 | 186 | MLIR provides a tool called `mlir-opt` which is used to test MLIR code. This 187 | tool runs passes on MLIR code (which will be described in the next demo) and 188 | verifies that the MLIR code is valid between passes. Since it runs validation so 189 | often, this tool is mostly used for testing / debugging, while custom tools 190 | based on it are used when building a real compiler flow. For this part of 191 | the demo, I want to use this tool to ensure that the MLIR code I wrote by hand 192 | is valid. After following the steps in Demo 0, you should have `mlir-opt` already 193 | built in the `build/bin` folder. To use it, we perform the following command: 194 | ```sh 195 | ./build/bin/mlir-opt demo2-entering-mlir/demo2.mlir 196 | ``` 197 | 198 | `mlir-opt` will print an error if there is a syntax error with what I wrote. In 199 | this case, we get no error. You will notice that this tool will print the IR after 200 | parsing. This renames many of the values and removes comments, which are not 201 | necessary for compilation. You can print this result to a file using the `-o` 202 | option.
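For example, to write the parsed and cleaned-up IR out to a file (the output file name here is just illustrative):
```sh
./build/bin/mlir-opt demo2-entering-mlir/demo2.mlir -o demo2-parsed.mlir
```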
203 | 204 | # Demo 3: Lowering MLIR 205 | 206 | Since MLIR code exists at different levels of abstraction, we need to be able to 207 | lower from a higher level of abstraction to a lower one in order to compile to 208 | a given target. This demo will ignore optimizing at the different levels and only 209 | show how to lower the MLIR code from Demo 2 to LLVMIR for compiling onto a CPU. 210 | Our goal is to take the high-level code we wrote in Demo 2, and lower it to 211 | the level of abstraction closest to assembly language. 212 | 213 | The script `lower.sh` lowers the code from Demo 2 step-by-step from the high-level 214 | Linalg on Tensor representation all the way to LLVMIR. You can run this script by doing: 215 | ```sh 216 | bash demo3-lowering-mlir/lower.sh 217 | ``` 218 | We will now walk through what each step in this script is doing. 219 | 220 | When working with MLIR, it is often a good idea to create a flow diagram to 221 | show how different dialects are converted into one another. A great example of 222 | one of these diagrams can be found 223 | [here](https://mlir.llvm.org/docs/Dialects/Vector/#positioning-in-the-codegen-infrastructure). 224 | I created a flow diagram for the lowering I perform in the script: 225 | 226 | ![Dialect Lowering Flow Diagram](resources/LoweringDialectDiagram.png) 227 | 228 | This diagram shows where I consider each level of abstraction to be, with the 229 | dashed lines representing a conversion pass we are doing to go from one level 230 | of abstraction to another. 231 | 232 | The MLIR code from Demo 2 is at a level of abstraction called "Linalg on Tensor", 233 | which is shown in the following code: 234 | ```mlir 235 | #map = affine_map<(d0, d1) -> (d0, d1)> 236 | module { 237 | func.func @main() -> tensor<256x1024xf32> { 238 | %0 = tensor.empty() : tensor<256x512xf32> 239 | %1 = tensor.empty() : tensor<512x1024xf32> 240 | %cst = arith.constant 0.000000e+00 : f32 241 | %splat = tensor.splat %cst : tensor<256x1024xf32> 242 | %2 = linalg.matmul ins(%0, %1 : tensor<256x512xf32>, tensor<512x1024xf32>) outs(%splat : tensor<256x1024xf32>) -> tensor<256x1024xf32> 243 | %3 = tensor.empty() : tensor<256x1024xf32> 244 | %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<256x1024xf32>) outs(%3 : tensor<256x1024xf32>) { 245 | ^bb0(%in: f32, %out: f32): 246 | %cst_0 = arith.constant 0.000000e+00 : f32 247 | %5 = arith.cmpf ugt, %in, %cst_0 : f32 248 | %6 = arith.select %5, %in, %cst_0 : f32 249 | linalg.yield %6 : f32 250 | } -> tensor<256x1024xf32> 251 | return %4 : tensor<256x1024xf32> 252 | } 253 | } 254 | ``` 255 | You will notice that at this abstraction level there is no concept of what 256 | device this code is running on. Tensors, by design, do not contain any information 257 | on how the data is stored in memory, and the linalg operations have practically 258 | no information on how the linear algebra operations will be performed. It is 259 | just a high-level description of an algorithm. 260 | 261 | The first thing we need to do is lower the Tensors into MemRefs. Tensors are 262 | abstract data types which only represent the data being created / used. We need 263 | this data to exist somewhere in memory as buffers. MLIR provides specialized 264 | passes to convert tensors into buffers for each of the dialects. A list of all 265 | these passes can be found [here](https://mlir.llvm.org/docs/Passes/#bufferization-passes). 266 | In this case, we want to use the `one-shot-bufferize` pass. 
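As a rough sketch of what this step can look like on the command line (the file names are illustrative and the exact flags used in `lower.sh` may differ; returning a tensor from the function usually also requires the pass's `bufferize-function-boundaries` option):
```sh
./build/bin/mlir-opt linalg-on-tensor.mlir \
  --one-shot-bufferize="bufferize-function-boundaries=1" \
  -o linalg-on-memref.mlir
```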
This performs 267 | bufferization over all the dialects at once in "one-shot". If you do not need 268 | fine-grained control over how the buffers are created, this is a good pass to use. 269 | We will be using `mlir-opt` to run this pass. See the `lower.sh` script for how 270 | to use it. After performing bufferization, the code is lowered to a new level 271 | of abstraction called "Linalg on MemRef" or "Linalg on Buffers": 272 | ```mlir 273 | #map = affine_map<(d0, d1) -> (d0, d1)> 274 | module { 275 | func.func @main() -> memref<256x1024xf32> { 276 | %cst = arith.constant 0.000000e+00 : f32 277 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 278 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 279 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 280 | linalg.map outs(%alloc_1 : memref<256x1024xf32>) 281 | () { 282 | linalg.yield %cst : f32 283 | } 284 | linalg.matmul ins(%alloc, %alloc_0 : memref<256x512xf32>, memref<512x1024xf32>) outs(%alloc_1 : memref<256x1024xf32>) 285 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 286 | linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%alloc_1 : memref<256x1024xf32>) outs(%alloc_2 : memref<256x1024xf32>) { 287 | ^bb0(%in: f32, %out: f32): 288 | %0 = arith.cmpf ugt, %in, %cst : f32 289 | %1 = arith.select %0, %in, %cst : f32 290 | linalg.yield %1 : f32 291 | } 292 | return %alloc_2 : memref<256x1024xf32> 293 | } 294 | } 295 | ``` 296 | At this level of abstraction, you will notice that we can no longer just create 297 | empty tensors anymore; now, we have to actually allocate the memory onto the 298 | device. What changed here is that now all data buffers have real pointers 299 | underneath that have been allocated within the kernel. This is the first time 300 | we have seen MemRefs in this tutorial. These are incredibly imporant Types found 301 | in core MLIR. MemRefs represent memory buffers. They are wrappers around pointers. 302 | MemRefs truly are just pointers with shape / type information attached to them. 303 | They "break" SSA by allowing users to write to locations within the MemRef without 304 | having to create a new Value. I should be clear that the MemRef values themselves 305 | are still SSA (for example `%alloc` is still an SSA value), but because we are 306 | working with pointers, it is challenging to perform data-flow analysis on the 307 | data contained within MemRefs. This demonstrates how moving from one level of 308 | abstraction to another leads to necessary losses in information and challenges 309 | with optimization. 310 | 311 | Now that the Tensors have been lowered into buffers, and the linalg operations 312 | are working on these buffers, we can now lower these linalg ops to real algorithms. 313 | CPUs do not come with ops to compute the MatMul over two buffers (normally), so 314 | we need to convert these ops into actual for-loops that can be executed. This 315 | will lower our abstraction level further from what is often called "graph-level" 316 | linalg operations to actual instructions. This will look similar to what one 317 | would write in C++. We can convert the linalg dialect to loops using the 318 | `convert-linalg-to-loops` pass found 319 | [here](https://mlir.llvm.org/docs/Passes/#-convert-linalg-to-loops). 
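A minimal sketch of this conversion step (file names are illustrative; the real invocation is in `lower.sh`):
```sh
./build/bin/mlir-opt linalg-on-memref.mlir --convert-linalg-to-loops -o loops.mlir
```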
320 | This will produce the following code: 321 | ```mlir 322 | module { 323 | func.func @main() -> memref<256x1024xf32> { 324 | %c512 = arith.constant 512 : index 325 | %c1024 = arith.constant 1024 : index 326 | %c1 = arith.constant 1 : index 327 | %c256 = arith.constant 256 : index 328 | %c0 = arith.constant 0 : index 329 | %cst = arith.constant 0.000000e+00 : f32 330 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 331 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 332 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 333 | scf.for %arg0 = %c0 to %c256 step %c1 { 334 | scf.for %arg1 = %c0 to %c1024 step %c1 { 335 | memref.store %cst, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 336 | } 337 | } 338 | scf.for %arg0 = %c0 to %c256 step %c1 { 339 | scf.for %arg1 = %c0 to %c1024 step %c1 { 340 | scf.for %arg2 = %c0 to %c512 step %c1 { 341 | %0 = memref.load %alloc[%arg0, %arg2] : memref<256x512xf32> 342 | %1 = memref.load %alloc_0[%arg2, %arg1] : memref<512x1024xf32> 343 | %2 = memref.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 344 | %3 = arith.mulf %0, %1 : f32 345 | %4 = arith.addf %2, %3 : f32 346 | memref.store %4, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 347 | } 348 | } 349 | } 350 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 351 | scf.for %arg0 = %c0 to %c256 step %c1 { 352 | scf.for %arg1 = %c0 to %c1024 step %c1 { 353 | %0 = memref.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 354 | %1 = arith.cmpf ugt, %0, %cst : f32 355 | %2 = arith.select %1, %0, %cst : f32 356 | memref.store %2, %alloc_2[%arg0, %arg1] : memref<256x1024xf32> 357 | } 358 | } 359 | return %alloc_2 : memref<256x1024xf32> 360 | } 361 | } 362 | ``` 363 | Notice that the linalg operations have been converted to practically the same 364 | code we wrote in Demo 1 in C++! Another thing to notice is that we are now 365 | operating directly on the MemRef buffers, instead of naming high-level operations 366 | we want to perform. Thus, we are now directly loading from and storing to these 367 | buffers. Due to all of this, the length of the kernel also increases. This is the 368 | consequence of lowering the abstraction level. Code gets less abstract and more 369 | detailed. 370 | To represent loops in MLIR, we use the Strutured Control Flow dialect which 371 | creates ops for things like For loops, if statment, while loops, etc. 372 | This code is at the abstraction level that is closest to C++, but in 373 | order to compile this code we need to lower more towards assembly. 374 | 375 | Assembly language does not have high-level control flow, like for loops. Due to 376 | this, we have to lower the for loops to something that is compatible with assembly. 377 | For most CPUs, this is done using branch instructions. General branching ops 378 | are provided by the control-flow dialect (`cf` dialect). We can convert the scf 379 | dialect into the cf dialect using the conversion pass `convert-scf-to-cf`. I should 380 | note that all passes available in core MLIR can be found [here](https://mlir.llvm.org/docs/Passes/). 381 | I will not show the resulting MLIR code here since it becomes very verbose and 382 | much harder to read. This is an important point, we are losing more and more 383 | high-level information as we lower further. This makes it harder to perform 384 | optimizations. This is a key idea in MLIR: perform optimizations at the level 385 | of abstraction that is most convenient, never later. 
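If you want to inspect the branch-based form yourself, an invocation along these lines (file names illustrative) will generate it:
```sh
./build/bin/mlir-opt loops.mlir --convert-scf-to-cf -o branches.mlir
```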
After this point, we are 386 | at the level of abstraction that is as close to CPUs as we can get in core MLIR. 387 | 388 | Now that we are at the CPU level of abstraction, we want to make use of LLVM to 389 | lower the rest of the way. This is very convenient since LLVM has been built over 390 | decades to convert CPU-level code all the way down to assembly. We want to 391 | make use of this lowering! In order to emit LLVMIR, we need to convert all of 392 | our MLIR code to the LLVM dialect. This is done by converting each of the dialects we have 393 | in our kernel into the LLVM dialect. See the `lower.sh` script for which passes I chose to 394 | use for this particular kernel. After this point, the code is as close to LLVMIR 395 | as we can get in MLIR. 396 | 397 | The final tool that we will use is a translation tool called `mlir-translate` 398 | which will turn the MLIR code in the LLVM dialect into LLVMIR. This is different 399 | from `mlir-opt` since `mlir-opt` always takes in valid MLIR code and spits out 400 | valid MLIR code. The goal of `mlir-translate` is to take MLIR code and translate 401 | it into another target language; in our case, it is LLVMIR. After this step we 402 | have valid LLVMIR code which can be compiled further using LLVM all the way to 403 | executable assembly. I will not show this in this tutorial since it is out of 404 | scope. 405 | 406 | # Demo 4: Optimizing MLIR with Affine Analysis 407 | 408 | MLIR can do much more than just lower high-level code to CPUs and other targets. 409 | What makes MLIR so powerful is its ability to do high-level analysis and 410 | optimization as it is lowering. One analysis that is built into MLIR's core 411 | dialects is Affine Analysis. 412 | 413 | In a nutshell, Affine Analysis uses analytical models to represent how code 414 | accesses buffers. By using these analytical models, we can deduce the memory 415 | access patterns of buffers, which allows us to do exciting optimizations like 416 | fusing complicated loops together and tiling loops to improve cache locality. 417 | 418 | The image below demonstrates how Affine Analysis is used in compilers to optimize 419 | memory access patterns. Affine Analysis allows us to create a polyhedron in 420 | memory space that represents a memory access pattern. The first example shows 421 | a simple memory access pattern which sets the first 16x32 elements of a 422 | 32x32 2D array to 1. Affine Analysis will tell us that the memory access pattern 423 | is a 16x32 rectangle starting at (0, 0), shown in blue. 424 | This may seem very simple and obvious, but it can get more complicated. 425 | Similarly, red and green rectangles are shown for two other access patterns. 426 | Where things get more interesting is when we combine these three loop nests into 427 | one kernel. You will notice in this kernel that we are doing extra work, since 428 | future writes may overlap with previous writes we made to memory! Affine Analysis 429 | allows us to recognize when these extra writes happen by simply computing the 430 | overlap of these rectangles! By recognizing that the blue rectangle is overlapped 431 | by the red / green rectangles, we can change the bound of j to 16 instead of 432 | 32. Similarly, we can change the bound of the red rectangle's i to 16 to prevent 433 | writing to locations in memory we know the green rectangle will overlap.
434 | 435 | ![Affine Analysis Introduction](resources/AffineAnalysisIntro.png) 436 | 437 | This was a simple example, but we can do much more interesting compiler optimizations 438 | with this information. For example, if we compute the area of these polyhedra, 439 | we will know exactly how many elements are accessed by the kernel! 440 | We can use this information to compute the memory footprint of the loops in 441 | the kernel. 442 | We can then transform the loops in the kernel to minimize the 443 | footprint within certain loop iterations. 444 | 445 | Affine Analysis is very powerful, but it does have limitations. For loops, the 446 | lower bound and upper bound of the loops must be affine functions (meaning they 447 | are linear combinations of variables plus a constant) and the step size must be a fixed number. 448 | For loads and stores, the address must be computed using an affine function. 449 | For example, `A[2*i][j + 32]` is affine; however, `A[0][i*j]` is not. 450 | There are similar limitations for if statements. 451 | 452 | Affine Analysis is provided as part of the affine dialect in MLIR. This is a 453 | level of abstraction that exists below the linalg dialect, but above the scf (loops) 454 | dialect. 455 | The affine dialect guarantees that all of the loops, loads, and stores are 456 | affine (see the limitations of Affine Analysis above). For example, a loop which 457 | is not affine cannot be expressed in the affine dialect's IR. 458 | What we can do is, as we are lowering from the linalg dialect to the scf 459 | dialect, make a pit stop in the affine dialect, perform optimizations, 460 | and then continue into the scf dialect. This is a classic approach to lowering 461 | in MLIR since it is pretty obvious how to turn linalg operations into affine 462 | loops, loads, and stores; it is easy to turn any affine operation into 463 | the scf or memref dialects; but it can be tricky to raise a general scf loop or 464 | memref load/store into the affine dialect. 465 | 466 | The script `optimize.sh` walks through how to enter the affine dialect from 467 | our Demo 2 code, perform loop fusion and tiling, and then lower into the scf 468 | dialect. A flow diagram for the lowering is shown below: 469 | 470 | ![Lowering to Affine Dialect Lowering Diagram](resources/AffineLoweringDialectDiagram.png) 471 | 472 | We can convert the linalg dialect into the affine dialect using the 473 | `convert-linalg-to-affine-loops` pass after performing bufferization.
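A sketch of that conversion, assuming the input has already been bufferized (file names are illustrative; see `optimize.sh` for the exact commands):
```sh
./build/bin/mlir-opt linalg-on-memref.mlir --convert-linalg-to-affine-loops -o affine.mlir
```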
This 474 | produces the following code: 475 | ```mlir 476 | module { 477 | func.func @main() -> memref<256x1024xf32> { 478 | %cst = arith.constant 0.000000e+00 : f32 479 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 480 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 481 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 482 | affine.for %arg0 = 0 to 256 { 483 | affine.for %arg1 = 0 to 1024 { 484 | affine.store %cst, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 485 | } 486 | } 487 | affine.for %arg0 = 0 to 256 { 488 | affine.for %arg1 = 0 to 1024 { 489 | affine.for %arg2 = 0 to 512 { 490 | %0 = affine.load %alloc[%arg0, %arg2] : memref<256x512xf32> 491 | %1 = affine.load %alloc_0[%arg2, %arg1] : memref<512x1024xf32> 492 | %2 = affine.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 493 | %3 = arith.mulf %0, %1 : f32 494 | %4 = arith.addf %2, %3 : f32 495 | affine.store %4, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 496 | } 497 | } 498 | } 499 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 500 | affine.for %arg0 = 0 to 256 { 501 | affine.for %arg1 = 0 to 1024 { 502 | %0 = affine.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 503 | %1 = arith.cmpf ugt, %0, %cst : f32 504 | %2 = arith.select %1, %0, %cst : f32 505 | affine.store %2, %alloc_2[%arg0, %arg1] : memref<256x1024xf32> 506 | } 507 | } 508 | return %alloc_2 : memref<256x1024xf32> 509 | } 510 | } 511 | ``` 512 | You will notice that the affine dialect looks very similar to the scf on memref 513 | level of abstraction; however, this dialect comes with guarantees on how memrefs 514 | are accessed within the kernel, which makes performing Affine Analysis much 515 | easier. Now that we are in the affine dialect, we can make use of the optimizations 516 | provided by the dialect. 517 | 518 | The first optimization I want to perform is loop fusion. You will notice that 519 | the original kernel separates the zeroing of the output buffer, computing the 520 | MatMul, and performing the ReLU into different loop nests. We know that these 521 | loop nests can legally be fused, but without Affine Analysis 522 | the compiler struggles to prove this.
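Concretely, the fusion step is a single extra pass in the pipeline, roughly (file names illustrative):
```sh
./build/bin/mlir-opt affine.mlir --affine-loop-fusion -o affine-fused.mlir
```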
Running the `affine-loop-fusion` 523 | pass produces the following code: 524 | ```mlir 525 | module { 526 | func.func @main() -> memref<256x1024xf32> { 527 | %alloc = memref.alloc() : memref<1x1xf32> 528 | %cst = arith.constant 0.000000e+00 : f32 529 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 530 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 531 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 532 | affine.for %arg0 = 0 to 256 { 533 | affine.for %arg1 = 0 to 1024 { 534 | affine.store %cst, %alloc[0, 0] : memref<1x1xf32> 535 | affine.for %arg2 = 0 to 512 { 536 | %3 = affine.load %alloc_0[%arg0, %arg2] : memref<256x512xf32> 537 | %4 = affine.load %alloc_1[%arg2, %arg1] : memref<512x1024xf32> 538 | %5 = affine.load %alloc[0, 0] : memref<1x1xf32> 539 | %6 = arith.mulf %3, %4 : f32 540 | %7 = arith.addf %5, %6 : f32 541 | affine.store %7, %alloc[0, 0] : memref<1x1xf32> 542 | } 543 | %0 = affine.load %alloc[0, 0] : memref<1x1xf32> 544 | %1 = arith.cmpf ugt, %0, %cst : f32 545 | %2 = arith.select %1, %0, %cst : f32 546 | affine.store %2, %alloc_2[%arg0, %arg1] : memref<256x1024xf32> 547 | } 548 | } 549 | return %alloc_2 : memref<256x1024xf32> 550 | } 551 | } 552 | ``` 553 | Notice that the compiler was able to recognize that the three loop nests could 554 | be fused and combined them all into one! This has a massive effect on the performance 555 | of the kernel. Not only do we avoid iterating over 256x1024 elements three 556 | times, but notice that one of the buffers disappeared! The MatMul needed 557 | to allocate a 1 MB intermediate buffer to hold its result before the ReLU could read it; however, using 558 | Affine Analysis, the compiler realized that a full buffer was not necessary and 559 | shrank it to a single element. This greatly reduces the memory footprint of the kernel, which will 560 | help with cache locality. 561 | 562 | Speaking of cache locality, we can further improve our cache hit rate by performing 563 | tiling. As mentioned in Demo 1, tiling is a target-specific optimization which 564 | requires knowledge of the size of the caches and the size of the memory accessed 565 | within the loop nests. Fortunately, we can use Affine Analysis to deduce the size 566 | of the memory footprint within the loop nests, and we can tell the affine dialect 567 | how large our cache is! For my computer, the cache size is around 1024 kB, so I 568 | can perform loop tiling using `affine-loop-tile="cache-size=1024"`.
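Roughly, the tiling invocation looks like this, using the cache-size option quoted above (file names illustrative):
```sh
./build/bin/mlir-opt affine-fused.mlir --affine-loop-tile="cache-size=1024" -o affine-fused-tiled.mlir
```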
I also 569 | perform some other cleanup passes to make the code more legible below: 570 | ```mlir 571 | module { 572 | func.func @main() -> memref<256x1024xf32> { 573 | %cst = arith.constant 0.000000e+00 : f32 574 | %alloc = memref.alloc() : memref<1x1xf32> 575 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 576 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 577 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 578 | affine.for %arg0 = 0 to 128 { 579 | affine.for %arg1 = 0 to 512 { 580 | affine.for %arg2 = 0 to 2 { 581 | affine.for %arg3 = 0 to 2 { 582 | affine.store %cst, %alloc[0, 0] : memref<1x1xf32> 583 | affine.for %arg4 = 0 to 512 { 584 | %3 = affine.load %alloc_0[%arg2 + %arg0 * 2, %arg4] : memref<256x512xf32> 585 | %4 = affine.load %alloc_1[%arg4, %arg3 + %arg1 * 2] : memref<512x1024xf32> 586 | %5 = affine.load %alloc[0, 0] : memref<1x1xf32> 587 | %6 = arith.mulf %3, %4 : f32 588 | %7 = arith.addf %5, %6 : f32 589 | affine.store %7, %alloc[0, 0] : memref<1x1xf32> 590 | } 591 | %0 = affine.load %alloc[0, 0] : memref<1x1xf32> 592 | %1 = arith.cmpf ugt, %0, %cst : f32 593 | %2 = arith.select %1, %0, %cst : f32 594 | affine.store %2, %alloc_2[%arg2 + %arg0 * 2, %arg3 + %arg1 * 2] : memref<256x1024xf32> 595 | } 596 | } 597 | } 598 | } 599 | return %alloc_2 : memref<256x1024xf32> 600 | } 601 | } 602 | ``` 603 | Using Affine Analysis, the compiler decided to tile the two outermost loops by 604 | 2x2. This partitions the inner calculations into tiles of size 2x2 which the 605 | compiler believes will maximize cache locality. 606 | 607 | Now that we have performed the optimizations we care about in the Affine dialect, 608 | we can continue the lowering process to the scf dialect using the `lower-affine` 609 | pass. 
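As one last sketch (file names illustrative), leaving the affine dialect is again a single pass:
```sh
./build/bin/mlir-opt affine-fused-tiled.mlir --lower-affine -o loops.mlir
```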
This produces the output below: 610 | ```mlir 611 | module { 612 | func.func @main() -> memref<256x1024xf32> { 613 | %c2 = arith.constant 2 : index 614 | %c512 = arith.constant 512 : index 615 | %c1 = arith.constant 1 : index 616 | %c128 = arith.constant 128 : index 617 | %c0 = arith.constant 0 : index 618 | %cst = arith.constant 0.000000e+00 : f32 619 | %alloc = memref.alloc() : memref<1x1xf32> 620 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 621 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 622 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 623 | scf.for %arg0 = %c0 to %c128 step %c1 { 624 | scf.for %arg1 = %c0 to %c512 step %c1 { 625 | scf.for %arg2 = %c0 to %c2 step %c1 { 626 | scf.for %arg3 = %c0 to %c2 step %c1 { 627 | memref.store %cst, %alloc[%c0, %c0] : memref<1x1xf32> 628 | scf.for %arg4 = %c0 to %c512 step %c1 { 629 | %7 = arith.muli %arg0, %c2 overflow : index 630 | %8 = arith.addi %arg2, %7 : index 631 | %9 = memref.load %alloc_0[%8, %arg4] : memref<256x512xf32> 632 | %10 = arith.muli %arg1, %c2 overflow : index 633 | %11 = arith.addi %arg3, %10 : index 634 | %12 = memref.load %alloc_1[%arg4, %11] : memref<512x1024xf32> 635 | %13 = memref.load %alloc[%c0, %c0] : memref<1x1xf32> 636 | %14 = arith.mulf %9, %12 : f32 637 | %15 = arith.addf %13, %14 : f32 638 | memref.store %15, %alloc[%c0, %c0] : memref<1x1xf32> 639 | } 640 | %0 = memref.load %alloc[%c0, %c0] : memref<1x1xf32> 641 | %1 = arith.cmpf ugt, %0, %cst : f32 642 | %2 = arith.select %1, %0, %cst : f32 643 | %3 = arith.muli %arg0, %c2 overflow : index 644 | %4 = arith.addi %arg2, %3 : index 645 | %5 = arith.muli %arg1, %c2 overflow : index 646 | %6 = arith.addi %arg3, %5 : index 647 | memref.store %2, %alloc_2[%4, %6] : memref<256x1024xf32> 648 | } 649 | } 650 | } 651 | } 652 | return %alloc_2 : memref<256x1024xf32> 653 | } 654 | } 655 | ``` 656 | 657 | Let's compare the output above to the kernel in the scf dialect without performing 658 | Affine Analysis: 659 | ```mlir 660 | module { 661 | func.func @main() -> memref<256x1024xf32> { 662 | %c512 = arith.constant 512 : index 663 | %c1024 = arith.constant 1024 : index 664 | %c1 = arith.constant 1 : index 665 | %c256 = arith.constant 256 : index 666 | %c0 = arith.constant 0 : index 667 | %cst = arith.constant 0.000000e+00 : f32 668 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<256x512xf32> 669 | %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<512x1024xf32> 670 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 671 | scf.for %arg0 = %c0 to %c256 step %c1 { 672 | scf.for %arg1 = %c0 to %c1024 step %c1 { 673 | memref.store %cst, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 674 | } 675 | } 676 | scf.for %arg0 = %c0 to %c256 step %c1 { 677 | scf.for %arg1 = %c0 to %c1024 step %c1 { 678 | scf.for %arg2 = %c0 to %c512 step %c1 { 679 | %0 = memref.load %alloc[%arg0, %arg2] : memref<256x512xf32> 680 | %1 = memref.load %alloc_0[%arg2, %arg1] : memref<512x1024xf32> 681 | %2 = memref.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 682 | %3 = arith.mulf %0, %1 : f32 683 | %4 = arith.addf %2, %3 : f32 684 | memref.store %4, %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 685 | } 686 | } 687 | } 688 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<256x1024xf32> 689 | scf.for %arg0 = %c0 to %c256 step %c1 { 690 | scf.for %arg1 = %c0 to %c1024 step %c1 { 691 | %0 = memref.load %alloc_1[%arg0, %arg1] : memref<256x1024xf32> 692 | %1 = arith.cmpf ugt, %0, 
%cst : f32 693 | %2 = arith.select %1, %0, %cst : f32 694 | memref.store %2, %alloc_2[%arg0, %arg1] : memref<256x1024xf32> 695 | } 696 | } 697 | return %alloc_2 : memref<256x1024xf32> 698 | } 699 | } 700 | ``` 701 | Notice that the kernel is much more concise, makes fewer allocations, and has 702 | better cache locality. All of this we got essentially for free by performing the 703 | analysis at a higher level of abstraction. 704 | 705 | This kernel in the scf dialect can then be lowered in the same way as Demo 3. 706 | 707 | # Conclusion and Further Reading 708 | 709 | This tutorial was mainly focused on motivating MLIR and showing how it can be 710 | used to create a simple compiler out of the box. To keep this tutorial beginner- 711 | friendly, I wanted to avoid touching the MLIR infrastructure code itself and just 712 | demonstrate how the infrastructure is used. The users of the MLIR infrastructure 713 | generally do much more than just work with the core dialects; many users need to 714 | add many more levels of abstraction to compile the code exactly the way they 715 | want it. This tutorial was just meant to be an introduction to give people a 716 | taste of how to work with MLIR. 717 | 718 | The following are good resources to learn more about MLIR. 719 | 720 | - [The MLIR Paper](https://doi.org/10.1109/CGO51591.2021.9370308): This paper 721 | introduces MLIR. It does a fantastic job motivating MLIR, describing the design 722 | of the IR, and explaining how MLIR was built. This paper is from 2021, so there are 723 | aspects of it which are outdated; but for the most part the ideas from this 724 | paper still apply to MLIR today. 725 | 726 | - [The MLIR Website](https://mlir.llvm.org/): This is the best place to go to 727 | learn more about MLIR. It provides documentation on the core dialects of MLIR, 728 | details on who is using MLIR and why, rationale on 729 | why to use MLIR, and more tutorials on how to work with MLIR. 730 | 731 | - [MLIR Rationale](https://mlir.llvm.org/docs/Rationale/Rationale/): This is a document 732 | on the MLIR website which basically summarizes the paper. It provides an explanation 733 | of why the IR is built the way it is. 734 | 735 | - [MLIR Language Reference](https://mlir.llvm.org/docs/LangRef/): This 736 | document provides a very detailed summary of how to read MLIR IR. This goes 737 | into way more detail than what I presented in this tutorial. 738 | 739 | - [Dialects Documentation](https://mlir.llvm.org/docs/Dialects/): This is the 740 | documentation of all the core dialects in MLIR. This documentation is automatically 741 | generated from the code in MLIR, so sometimes it is not the best. It can provide 742 | a good summary of the ops available in upstream MLIR. 743 | 744 | - [Passes Documentation](https://mlir.llvm.org/docs/Passes/): This is the 745 | documentation of all the core passes in MLIR. Like the dialects, this 746 | documentation is auto-generated; however, I found the documentation for these 747 | passes to be pretty good. 748 | 749 | - [MLIR Tutorials](https://mlir.llvm.org/docs/Tutorials/): These are tutorials 750 | provided by MLIR to teach people how to work with the infrastructure. These 751 | tutorials are mainly targeted towards users who are creating their own 752 | compiler flows using new dialects. They teach how to create a dialect, how 753 | to write a compiler pass, and more. These tutorials were the inspiration for 754 | this tutorial.
755 | 756 | - [Users of MLIR](https://mlir.llvm.org/users/): Almost every MLIR project 757 | needs to define more dialects for its own application. The core dialects are 758 | just a few levels of abstraction that the community felt everyone needed in 759 | some capacity. This is a list of the major users of MLIR in the community. 760 | Most of these projects are open source and you can take a look at their dialects; 761 | however, the quality of these downstream dialects varies, and they may not be 762 | compatible with each other. 763 | 764 | - [MLIR YouTube Channel](https://www.youtube.com/@MLIRCompiler): These are 765 | some videos on MLIR. Some of the videos are more approachable than others. 766 | These videos cover both how to use MLIR and future improvements to MLIR. 767 | 768 | --------------------------------------------------------------------------------