├── .gitignore
├── .gitmodules
├── CMakeLists.txt
├── README.md
├── Torch_MLIR
    ├── modified
    │   └── modified.mlir
    └── pytorch
    │   ├── alexnet.mlir
    │   ├── resnet152.mlir
    │   └── vgg11.mlir
├── benchmark.sh
├── benchmarks
    ├── CMakeLists.txt
    ├── GoogleBenchmarks
    │   ├── CMakeLists.txt
    │   └── run_bench.cpp
    └── Hardware_Counters_or_Time
    │   ├── CMakeLists.txt
    │   ├── run_bench.cpp
    │   └── run_bench.h
├── build_llvm.sh
├── build_mlir.sh
├── build_obj.sh
├── example_output.png
├── include
    └── Transform
    │   ├── Affine
    │       └── Affine64Unroll.h
    │   └── MakeRunAble
    │       ├── RemoveForwardFuncArgsAndReturn.h
    │       ├── RemoveGlobalConstants.h
    │       └── ZeroInitRemoveForwardFuncArgsAndReturn.h
├── lib
    ├── CMakeLists.txt
    └── Transform
    │   ├── Affine
    │       ├── Affine64Unroll.cpp
    │       └── CMakeLists.txt
    │   ├── CMakeLists.txt
    │   └── MakeRunAble
    │       ├── CMakeLists.txt
    │       ├── RemoveForwardFuncArgsAndReturn.cpp
    │       ├── RemoveGlobalConstants.cpp
    │       └── ZeroInitRemoveForwardFuncArgsAndReturn.cpp
├── make_MLIR_obj.py
├── tests
    └── CMakeLists.txt
└── tools
    ├── CMakeLists.txt
    └── project-opt.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | build*
2 | .cache*
3 | .vscode*
4 | temp.mlir
5 | generate_MLIR.py
6 | Torch_MLIR/*
7 | /benchmarks/mlir_obj/*
8 | Onnx_models/*


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "External/llvm-project"]
2 | 	path = External/llvm-project
3 | 	url = https://github.com/llvm/llvm-project.git
4 | 


--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 3.20.0)
 2 | 
 3 | project(OptML LANGUAGES CXX C)
 4 | 
 5 | set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to")
 6 | set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 7 | set(BUILD_DEPS ON)
 8 | 
 9 | set(MLIR_DIR External/llvm-project/build/lib/cmake/mlir)
10 | 
11 | find_package(MLIR REQUIRED CONFIG)
12 | 
13 | message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}")
14 | message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}")
15 | 
16 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR})
17 | 
18 | include(AddLLVM)
19 | include(TableGen)
20 | 
21 | list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}")
22 | include(AddMLIR)
23 | include_directories(${LLVM_INCLUDE_DIRS})
24 | include_directories(${MLIR_INCLUDE_DIRS})
25 | include_directories(${PROJECT_SOURCE_DIR})
26 | include_directories(${PROJECT_SOURCE_DIR}/External/llvm-project)
27 | include_directories(${PROJECT_BINARY_DIR})
28 | 
29 | add_custom_command(
30 |     OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o"
31 |     OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o"
32 |     OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o"
33 |     OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o"
34 |     COMMAND ${CMAKE_COMMAND} -E echo "Compiling oracle object files"
35 |     COMMAND ${CMAKE_COMMAND} -E env bash ${CMAKE_SOURCE_DIR}/build_obj.sh ${CMAKE_SOURCE_DIR}
36 |     DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/alexnet.mlir
37 |     DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/resnet152.mlir
38 |     DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/vgg11.mlir
39 |     DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/modified/modified.mlir
40 |     COMMENT "Compiling oracle object files"
41 | )
42 | add_custom_target(build_alexnet_obj ALL
43 |     DEPENDS ${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o
44 | )
45 | 
46 | 
47 | message(STATUS "Fetching Google Benchmarks..")
48 | include(FetchContent)
49 | set(BENCHMARK_ENABLE_TESTING NO)
50 | set(CMAKE_BUILD_TYPE Release) # Ensure the build type is Release
51 | FetchContent_Declare(
52 |   googlebenchmark
53 |   GIT_REPOSITORY https://github.com/google/benchmark.git
54 |   GIT_TAG origin/main
55 | )
56 | # FetchContent_MakeAvailable(or-tools googlebenchmark)
57 | FetchContent_MakeAvailable(googlebenchmark)
58 | set(BENCHMARK_ENABLE_LTO ON)  # Enable Link Time Optimization for better performance
59 | 
60 | # add_subdirectory(tests)
61 | add_subdirectory(tools)
62 | add_subdirectory(lib)
63 | add_subdirectory(benchmarks)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # OptML
  2 | 
  3 | Welcome to OptML! This repository is designed for those new to MLIR and machine learning-based optimizations. As a compiler enthusiast, I wanted to create a platform for hobbyists like myself to experiment with and benchmark new optimizations on real ML models in an out-of-tree manner. This project is heavily inspired by [mlir-tutorial](https://github.com/j2kun/mlir-tutorial), which laid the foundation for my learning and development.
  4 | 
  5 | ## Table of Contents
  6 | 
  7 | 1. [Vision Models](#vision-models)
  8 | 2. [Benchmarking Options](#benchmarking-options)
  9 | 3. [Build Instructions](#build-instructions)
 10 | 4. [Usage Guide](#usage-guide)
 11 | 5. [Benchmarking Process](#benchmarking-process)
 12 | 6. [Files of Interest](#files-of-interest)
 13 | 
 14 | 
 15 | 
 16 | ## Vision Models
 17 | 
 18 | The repository includes three vision models generated from TorchScript:
 19 | 
 20 | 1. AlexNet
 21 | 2. VGG11
 22 | 3. ResNet152
 23 | 
 24 | ## Benchmarking Options
 25 | 
 26 | OptML supports multiple benchmarking methodologies:
 27 | 
 28 | 1. **Google Benchmarks**
 29 | 2. **Hardware Counters (PAPI)**
 30 | 3. **C++ Chrono library**
 31 | 
 32 | ## Build Instructions
 33 | 
 34 | ### Prerequisites
 35 | 
 36 | Before building and running OptML, make sure you have the following installed:
 37 | 
 38 | 1. **CMake** (version 3.20 or higher)
 39 | 2. **PAPI** (for hardware counter support)
 40 | 3. **Python 3.x** (for script execution)
 41 | 4. **C/C++ compiler** (clang 17 /gcc 11 or higher)
 42 | 
 43 | Ensure that these dependencies are installed and configured correctly before proceeding with the build instructions.
 44 | 
 45 | ### 1) Clone the Repository
 46 | ```bash
 47 | git clone https://github.com/mvvsmk/OptML.git
 48 | cd OptML
 49 | git submodule update --init --recursive
 50 | ```
 51 | 
 52 | ### 2) Run the Build Script
 53 | ```bash
 54 | ./build_llvm.sh   # Builds the LLVM submodule
 55 | # please note while doing check-mlir build might fail but it doesn't affect the project.
 56 | ./build_mlir.sh   # Builds the project-opt tool with out-of-tree optimizations
 57 | ```
 58 | 
 59 | ### 3) Run a Pre-included Benchmark to Verify the Setup
 60 | ```bash
 61 | ./benchmark.sh chrono Alexnet --affine-64-unroll
 62 | ```
 63 | 
 64 | ## Usage Guide
 65 | 
 66 | Let's walk through how to use this repository, specifically using the `Affine64Unroll` pass I implemented.
 67 | 
 68 | 1. **Headers**: Located in `$rootdir/include/Transform/Affine/`
 69 | 2. **Implementation**: Located under `$rootdir/lib/Transform/Affine/`
 70 | 3. **CMake File**: The CMake file in the implementation folder is straightforward; ensure you include the necessary libraries for your use case.
 71 | 4. **Register the Pass**: Register your pass with the `project-opt` tool.
 72 | 
 73 | Now, you're all set!
 74 | 
 75 | To run your own benchmarks, use the command:
 76 | ```bash
 77 | $rootdir/benchmark.sh [benchmark_type] [ML_model] [benchmark_flag] [PAPI_event_name]
 78 | ```
 79 | ![sample_output](https://github.com/mvvsmk/OptML/blob/main/example_output.png?raw=true)
 80 | 
 81 | > [!WARNING]  
 82 | > Before interpreting your benchmark results, it's important to understand how the benchmarking process works.
 83 | > This looks like a 29% increase but what you miss, is the object file size increase from 16KB to 2.5MB. XD . This was measured on a Intel(R) Core(TM) Ultra 9 185H, in a single threaded manner, this measurement also includes array initilization to all zeros.
 84 | 
 85 | 
 86 | 
 87 | ## Benchmarking Process
 88 | 
 89 | When the object file is compiled, two important passes are run:
 90 | 
 91 | 1. **`--rem-forward-func-args-and-return-run-mlir-zero-init`**:
 92 |     - This pass removes the arguments and return values of the `forward` function to make all functions uniform.
 93 |     - You can choose between two variants: one where the arguments are zero-initialized, and another where they remain uninitialized (resulting in undefined behavior).
 94 |     - Uninitialized one takes less memory to compile as adding the initialization ends up creating a lot of instructions which takes up a lot of ram.
 95 |      The default behaviour is to zero init the argument (usually the picture for the model) but if you are feeling lucky and want to experiment with undefined behviour change the following in make_MLIR_obj.py :
 96 | ```diff
 97 | if __name__ == "__main__":
 98 |     parser = argparse.ArgumentParser(description='Compile MLIR files to object files')
 99 |     parser.add_argument('input_folder', type=str, help='Absolute path to the folder with MLIR files or the only MLIR file')
100 |     parser.add_argument('output_folder', type=str, help='Absolute path to the folder where object files will be stored')
101 | -    parser.add_argument('--mlir-flags', required=False,default="--rem-forward-func-args-and-return-run-mlir-zero-init --rem-global-constants-run-mlir" ,type=str, help='Flags to be passed to project-opt')
102 | +    parser.add_argument('--mlir-flags', required=False,default="--rem-forward-func-args-and-return-run-mlir --rem-global-constants-run-mlir" ,type=str, help='Flags to be passed to project-opt')
103 | 
104 | ```
105 | 
106 | 2. **`--rem-global-constants-run-mlir`**:
107 |     - This pass removes global constants and places them inside the main function.This is to prevent some ```<stdin>:5:3: error: resource does not exist``` errors
108 | 
109 | After the modified MLIR is generated, it is compiled into an object file without any optimizations (creating either the original or oracle version). For benchmarking, the object file is linked against an empty C++ file that benchmarks the function:
110 | ```cpp
111 | extern "C" void forward();
112 | ```
113 | 
114 | When you run a pass using the `benchmark.sh` script, it generates a `Modified.mlir` file, which is then processed through the same pipeline and linked for benchmarking.
115 | 
116 | All these benchmarks are run in a single threaded manner without sudo taskset -c <thread>.
117 | 
118 | You can use the *.cpp and *.h files present in the `benchmarks/Hardware_Counters_or_Time` folder and add any custom parameters you want to measure.
119 | 
120 | ### Files of Interest
121 | 
122 | Do go through these files if you want to lean more on how the benchmarks actually compile and execute. 
123 | 
124 | - **`benchmark.sh`**: Executes your pass and compares the results against the original.
125 | - **`make_MLIR_obj.py`**: Converts the MLIR file to an object file for benchmarking.
126 | 


--------------------------------------------------------------------------------
/Torch_MLIR/pytorch/alexnet.mlir:
--------------------------------------------------------------------------------
  1 | module attributes {torch.debug_module_name = "AlexNet"} {
  2 |   memref.global "private" constant @__constant_64x3x11x11xf32 : memref<64x3x11x11xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  3 |   memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  4 |   memref.global "private" constant @__constant_192x64x5x5xf32 : memref<192x64x5x5xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  5 |   memref.global "private" constant @__constant_192xf32 : memref<192xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  6 |   memref.global "private" constant @__constant_384x192x3x3xf32 : memref<384x192x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  7 |   memref.global "private" constant @__constant_384xf32 : memref<384xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  8 |   memref.global "private" constant @__constant_256x384x3x3xf32 : memref<256x384x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  9 |   memref.global "private" constant @__constant_256xf32_0 : memref<256xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 10 |   memref.global "private" constant @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 11 |   memref.global "private" constant @__constant_256xf32 : memref<256xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 12 |   memref.global "private" constant @__constant_4096x9216xf32 : memref<4096x9216xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 13 |   memref.global "private" constant @__constant_4096xf32_0 : memref<4096xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 14 |   memref.global "private" constant @__constant_4096x4096xf32 : memref<4096x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 15 |   memref.global "private" constant @__constant_4096xf32 : memref<4096xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 16 |   memref.global "private" constant @__constant_1000x4096xf32 : memref<1000x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 17 |   memref.global "private" constant @__constant_1000xf32 : memref<1000xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 18 |   func.func @forward(%arg0: memref<64x3x224x224xf32>) -> memref<64x1000xf32> {
 19 |     %cst = arith.constant -3.40282347E+38 : f32
 20 |     %cst_0 = arith.constant 0.000000e+00 : f32
 21 |     %0 = memref.get_global @__constant_1000xf32 : memref<1000xf32>
 22 |     %1 = memref.get_global @__constant_1000x4096xf32 : memref<1000x4096xf32>
 23 |     %2 = memref.get_global @__constant_4096xf32 : memref<4096xf32>
 24 |     %3 = memref.get_global @__constant_4096x4096xf32 : memref<4096x4096xf32>
 25 |     %4 = memref.get_global @__constant_4096xf32_0 : memref<4096xf32>
 26 |     %5 = memref.get_global @__constant_4096x9216xf32 : memref<4096x9216xf32>
 27 |     %6 = memref.get_global @__constant_256xf32 : memref<256xf32>
 28 |     %7 = memref.get_global @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32>
 29 |     %8 = memref.get_global @__constant_256xf32_0 : memref<256xf32>
 30 |     %9 = memref.get_global @__constant_256x384x3x3xf32 : memref<256x384x3x3xf32>
 31 |     %10 = memref.get_global @__constant_384xf32 : memref<384xf32>
 32 |     %11 = memref.get_global @__constant_384x192x3x3xf32 : memref<384x192x3x3xf32>
 33 |     %12 = memref.get_global @__constant_192xf32 : memref<192xf32>
 34 |     %13 = memref.get_global @__constant_192x64x5x5xf32 : memref<192x64x5x5xf32>
 35 |     %14 = memref.get_global @__constant_64xf32 : memref<64xf32>
 36 |     %15 = memref.get_global @__constant_64x3x11x11xf32 : memref<64x3x11x11xf32>
 37 |     %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x3x228x228xf32>
 38 |     affine.for %arg1 = 0 to 64 {
 39 |       affine.for %arg2 = 0 to 3 {
 40 |         affine.for %arg3 = 0 to 228 {
 41 |           affine.for %arg4 = 0 to 228 {
 42 |             affine.store %cst_0, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<64x3x228x228xf32>
 43 |           }
 44 |         }
 45 |       }
 46 |     }
 47 |     %subview = memref.subview %alloc[0, 0, 2, 2] [64, 3, 224, 224] [1, 1, 1, 1] : memref<64x3x228x228xf32> to memref<64x3x224x224xf32, strided<[155952, 51984, 228, 1], offset: 458>>
 48 |     memref.copy %arg0, %subview : memref<64x3x224x224xf32> to memref<64x3x224x224xf32, strided<[155952, 51984, 228, 1], offset: 458>>
 49 |     %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<64x64x55x55xf32>
 50 |     %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<64x64x55x55xf32>
 51 |     affine.for %arg1 = 0 to 64 {
 52 |       affine.for %arg2 = 0 to 64 {
 53 |         affine.for %arg3 = 0 to 55 {
 54 |           affine.for %arg4 = 0 to 55 {
 55 |             %16 = affine.load %14[%arg2] : memref<64xf32>
 56 |             affine.store %16, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32>
 57 |           }
 58 |         }
 59 |       }
 60 |     }
 61 |     affine.for %arg1 = 0 to 64 {
 62 |       affine.for %arg2 = 0 to 64 {
 63 |         affine.for %arg3 = 0 to 55 {
 64 |           affine.for %arg4 = 0 to 55 {
 65 |             affine.for %arg5 = 0 to 3 {
 66 |               affine.for %arg6 = 0 to 11 {
 67 |                 affine.for %arg7 = 0 to 11 {
 68 |                   %16 = affine.load %alloc[%arg1, %arg5, %arg3 * 4 + %arg6, %arg4 * 4 + %arg7] : memref<64x3x228x228xf32>
 69 |                   %17 = affine.load %15[%arg2, %arg5, %arg6, %arg7] : memref<64x3x11x11xf32>
 70 |                   %18 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32>
 71 |                   %19 = arith.mulf %16, %17 : f32
 72 |                   %20 = arith.addf %18, %19 : f32
 73 |                   affine.store %20, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32>
 74 |                 }
 75 |               }
 76 |             }
 77 |           }
 78 |         }
 79 |       }
 80 |     }
 81 |     affine.for %arg1 = 0 to 64 {
 82 |       affine.for %arg2 = 0 to 64 {
 83 |         affine.for %arg3 = 0 to 55 {
 84 |           affine.for %arg4 = 0 to 55 {
 85 |             %16 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32>
 86 |             %17 = arith.cmpf ugt, %16, %cst_0 : f32
 87 |             %18 = arith.select %17, %16, %cst_0 : f32
 88 |             affine.store %18, %alloc_1[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32>
 89 |           }
 90 |         }
 91 |       }
 92 |     }
 93 |     %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<64x64x27x27xf32>
 94 |     affine.for %arg1 = 0 to 64 {
 95 |       affine.for %arg2 = 0 to 64 {
 96 |         affine.for %arg3 = 0 to 27 {
 97 |           affine.for %arg4 = 0 to 27 {
 98 |             affine.store %cst, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x27x27xf32>
 99 |           }
100 |         }
101 |       }
102 |     }
103 |     affine.for %arg1 = 0 to 64 {
104 |       affine.for %arg2 = 0 to 64 {
105 |         affine.for %arg3 = 0 to 27 {
106 |           affine.for %arg4 = 0 to 27 {
107 |             affine.for %arg5 = 0 to 3 {
108 |               affine.for %arg6 = 0 to 3 {
109 |                 %16 = affine.load %alloc_1[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x64x55x55xf32>
110 |                 %17 = affine.load %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x27x27xf32>
111 |                 %18 = arith.maximumf %17, %16 : f32
112 |                 affine.store %18, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x27x27xf32>
113 |               }
114 |             }
115 |           }
116 |         }
117 |       }
118 |     }
119 |     %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<64x64x31x31xf32>
120 |     affine.for %arg1 = 0 to 64 {
121 |       affine.for %arg2 = 0 to 64 {
122 |         affine.for %arg3 = 0 to 31 {
123 |           affine.for %arg4 = 0 to 31 {
124 |             affine.store %cst_0, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<64x64x31x31xf32>
125 |           }
126 |         }
127 |       }
128 |     }
129 |     %subview_5 = memref.subview %alloc_4[0, 0, 2, 2] [64, 64, 27, 27] [1, 1, 1, 1] : memref<64x64x31x31xf32> to memref<64x64x27x27xf32, strided<[61504, 961, 31, 1], offset: 64>>
130 |     memref.copy %alloc_3, %subview_5 : memref<64x64x27x27xf32> to memref<64x64x27x27xf32, strided<[61504, 961, 31, 1], offset: 64>>
131 |     %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<64x192x27x27xf32>
132 |     %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<64x192x27x27xf32>
133 |     affine.for %arg1 = 0 to 64 {
134 |       affine.for %arg2 = 0 to 192 {
135 |         affine.for %arg3 = 0 to 27 {
136 |           affine.for %arg4 = 0 to 27 {
137 |             %16 = affine.load %12[%arg2] : memref<192xf32>
138 |             affine.store %16, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32>
139 |           }
140 |         }
141 |       }
142 |     }
143 |     affine.for %arg1 = 0 to 64 {
144 |       affine.for %arg2 = 0 to 192 {
145 |         affine.for %arg3 = 0 to 27 {
146 |           affine.for %arg4 = 0 to 27 {
147 |             affine.for %arg5 = 0 to 64 {
148 |               affine.for %arg6 = 0 to 5 {
149 |                 affine.for %arg7 = 0 to 5 {
150 |                   %16 = affine.load %alloc_4[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x64x31x31xf32>
151 |                   %17 = affine.load %13[%arg2, %arg5, %arg6, %arg7] : memref<192x64x5x5xf32>
152 |                   %18 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32>
153 |                   %19 = arith.mulf %16, %17 : f32
154 |                   %20 = arith.addf %18, %19 : f32
155 |                   affine.store %20, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32>
156 |                 }
157 |               }
158 |             }
159 |           }
160 |         }
161 |       }
162 |     }
163 |     affine.for %arg1 = 0 to 64 {
164 |       affine.for %arg2 = 0 to 192 {
165 |         affine.for %arg3 = 0 to 27 {
166 |           affine.for %arg4 = 0 to 27 {
167 |             %16 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32>
168 |             %17 = arith.cmpf ugt, %16, %cst_0 : f32
169 |             %18 = arith.select %17, %16, %cst_0 : f32
170 |             affine.store %18, %alloc_6[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32>
171 |           }
172 |         }
173 |       }
174 |     }
175 |     %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<64x192x13x13xf32>
176 |     affine.for %arg1 = 0 to 64 {
177 |       affine.for %arg2 = 0 to 192 {
178 |         affine.for %arg3 = 0 to 13 {
179 |           affine.for %arg4 = 0 to 13 {
180 |             affine.store %cst, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x192x13x13xf32>
181 |           }
182 |         }
183 |       }
184 |     }
185 |     affine.for %arg1 = 0 to 64 {
186 |       affine.for %arg2 = 0 to 192 {
187 |         affine.for %arg3 = 0 to 13 {
188 |           affine.for %arg4 = 0 to 13 {
189 |             affine.for %arg5 = 0 to 3 {
190 |               affine.for %arg6 = 0 to 3 {
191 |                 %16 = affine.load %alloc_6[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x192x27x27xf32>
192 |                 %17 = affine.load %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x192x13x13xf32>
193 |                 %18 = arith.maximumf %17, %16 : f32
194 |                 affine.store %18, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x192x13x13xf32>
195 |               }
196 |             }
197 |           }
198 |         }
199 |       }
200 |     }
201 |     %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<64x192x15x15xf32>
202 |     affine.for %arg1 = 0 to 64 {
203 |       affine.for %arg2 = 0 to 192 {
204 |         affine.for %arg3 = 0 to 15 {
205 |           affine.for %arg4 = 0 to 15 {
206 |             affine.store %cst_0, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<64x192x15x15xf32>
207 |           }
208 |         }
209 |       }
210 |     }
211 |     %subview_10 = memref.subview %alloc_9[0, 0, 1, 1] [64, 192, 13, 13] [1, 1, 1, 1] : memref<64x192x15x15xf32> to memref<64x192x13x13xf32, strided<[43200, 225, 15, 1], offset: 16>>
212 |     memref.copy %alloc_8, %subview_10 : memref<64x192x13x13xf32> to memref<64x192x13x13xf32, strided<[43200, 225, 15, 1], offset: 16>>
213 |     %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<64x384x13x13xf32>
214 |     %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<64x384x13x13xf32>
215 |     affine.for %arg1 = 0 to 64 {
216 |       affine.for %arg2 = 0 to 384 {
217 |         affine.for %arg3 = 0 to 13 {
218 |           affine.for %arg4 = 0 to 13 {
219 |             %16 = affine.load %10[%arg2] : memref<384xf32>
220 |             affine.store %16, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32>
221 |           }
222 |         }
223 |       }
224 |     }
225 |     affine.for %arg1 = 0 to 64 {
226 |       affine.for %arg2 = 0 to 384 {
227 |         affine.for %arg3 = 0 to 13 {
228 |           affine.for %arg4 = 0 to 13 {
229 |             affine.for %arg5 = 0 to 192 {
230 |               affine.for %arg6 = 0 to 3 {
231 |                 affine.for %arg7 = 0 to 3 {
232 |                   %16 = affine.load %alloc_9[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x192x15x15xf32>
233 |                   %17 = affine.load %11[%arg2, %arg5, %arg6, %arg7] : memref<384x192x3x3xf32>
234 |                   %18 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32>
235 |                   %19 = arith.mulf %16, %17 : f32
236 |                   %20 = arith.addf %18, %19 : f32
237 |                   affine.store %20, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32>
238 |                 }
239 |               }
240 |             }
241 |           }
242 |         }
243 |       }
244 |     }
245 |     affine.for %arg1 = 0 to 64 {
246 |       affine.for %arg2 = 0 to 384 {
247 |         affine.for %arg3 = 0 to 13 {
248 |           affine.for %arg4 = 0 to 13 {
249 |             %16 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32>
250 |             %17 = arith.cmpf ugt, %16, %cst_0 : f32
251 |             %18 = arith.select %17, %16, %cst_0 : f32
252 |             affine.store %18, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32>
253 |           }
254 |         }
255 |       }
256 |     }
257 |     %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<64x384x15x15xf32>
258 |     affine.for %arg1 = 0 to 64 {
259 |       affine.for %arg2 = 0 to 384 {
260 |         affine.for %arg3 = 0 to 15 {
261 |           affine.for %arg4 = 0 to 15 {
262 |             affine.store %cst_0, %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x384x15x15xf32>
263 |           }
264 |         }
265 |       }
266 |     }
267 |     %subview_14 = memref.subview %alloc_13[0, 0, 1, 1] [64, 384, 13, 13] [1, 1, 1, 1] : memref<64x384x15x15xf32> to memref<64x384x13x13xf32, strided<[86400, 225, 15, 1], offset: 16>>
268 |     memref.copy %alloc_11, %subview_14 : memref<64x384x13x13xf32> to memref<64x384x13x13xf32, strided<[86400, 225, 15, 1], offset: 16>>
269 |     %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<64x256x13x13xf32>
270 |     %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<64x256x13x13xf32>
271 |     affine.for %arg1 = 0 to 64 {
272 |       affine.for %arg2 = 0 to 256 {
273 |         affine.for %arg3 = 0 to 13 {
274 |           affine.for %arg4 = 0 to 13 {
275 |             %16 = affine.load %8[%arg2] : memref<256xf32>
276 |             affine.store %16, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
277 |           }
278 |         }
279 |       }
280 |     }
281 |     affine.for %arg1 = 0 to 64 {
282 |       affine.for %arg2 = 0 to 256 {
283 |         affine.for %arg3 = 0 to 13 {
284 |           affine.for %arg4 = 0 to 13 {
285 |             affine.for %arg5 = 0 to 384 {
286 |               affine.for %arg6 = 0 to 3 {
287 |                 affine.for %arg7 = 0 to 3 {
288 |                   %16 = affine.load %alloc_13[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x384x15x15xf32>
289 |                   %17 = affine.load %9[%arg2, %arg5, %arg6, %arg7] : memref<256x384x3x3xf32>
290 |                   %18 = affine.load %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
291 |                   %19 = arith.mulf %16, %17 : f32
292 |                   %20 = arith.addf %18, %19 : f32
293 |                   affine.store %20, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
294 |                 }
295 |               }
296 |             }
297 |           }
298 |         }
299 |       }
300 |     }
301 |     affine.for %arg1 = 0 to 64 {
302 |       affine.for %arg2 = 0 to 256 {
303 |         affine.for %arg3 = 0 to 13 {
304 |           affine.for %arg4 = 0 to 13 {
305 |             %16 = affine.load %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
306 |             %17 = arith.cmpf ugt, %16, %cst_0 : f32
307 |             %18 = arith.select %17, %16, %cst_0 : f32
308 |             affine.store %18, %alloc_15[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
309 |           }
310 |         }
311 |       }
312 |     }
313 |     %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<64x256x15x15xf32>
314 |     affine.for %arg1 = 0 to 64 {
315 |       affine.for %arg2 = 0 to 256 {
316 |         affine.for %arg3 = 0 to 15 {
317 |           affine.for %arg4 = 0 to 15 {
318 |             affine.store %cst_0, %alloc_17[%arg1, %arg2, %arg3, %arg4] : memref<64x256x15x15xf32>
319 |           }
320 |         }
321 |       }
322 |     }
323 |     %subview_18 = memref.subview %alloc_17[0, 0, 1, 1] [64, 256, 13, 13] [1, 1, 1, 1] : memref<64x256x15x15xf32> to memref<64x256x13x13xf32, strided<[57600, 225, 15, 1], offset: 16>>
324 |     memref.copy %alloc_15, %subview_18 : memref<64x256x13x13xf32> to memref<64x256x13x13xf32, strided<[57600, 225, 15, 1], offset: 16>>
325 |     %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<64x256x13x13xf32>
326 |     affine.for %arg1 = 0 to 64 {
327 |       affine.for %arg2 = 0 to 256 {
328 |         affine.for %arg3 = 0 to 13 {
329 |           affine.for %arg4 = 0 to 13 {
330 |             %16 = affine.load %6[%arg2] : memref<256xf32>
331 |             affine.store %16, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
332 |           }
333 |         }
334 |       }
335 |     }
336 |     affine.for %arg1 = 0 to 64 {
337 |       affine.for %arg2 = 0 to 256 {
338 |         affine.for %arg3 = 0 to 13 {
339 |           affine.for %arg4 = 0 to 13 {
340 |             affine.for %arg5 = 0 to 256 {
341 |               affine.for %arg6 = 0 to 3 {
342 |                 affine.for %arg7 = 0 to 3 {
343 |                   %16 = affine.load %alloc_17[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x256x15x15xf32>
344 |                   %17 = affine.load %7[%arg2, %arg5, %arg6, %arg7] : memref<256x256x3x3xf32>
345 |                   %18 = affine.load %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
346 |                   %19 = arith.mulf %16, %17 : f32
347 |                   %20 = arith.addf %18, %19 : f32
348 |                   affine.store %20, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
349 |                 }
350 |               }
351 |             }
352 |           }
353 |         }
354 |       }
355 |     }
356 |     affine.for %arg1 = 0 to 64 {
357 |       affine.for %arg2 = 0 to 256 {
358 |         affine.for %arg3 = 0 to 13 {
359 |           affine.for %arg4 = 0 to 13 {
360 |             %16 = affine.load %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
361 |             %17 = arith.cmpf ugt, %16, %cst_0 : f32
362 |             %18 = arith.select %17, %16, %cst_0 : f32
363 |             affine.store %18, %alloc_15[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32>
364 |           }
365 |         }
366 |       }
367 |     }
368 |     %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<64x256x6x6xf32>
369 |     %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<64x256x6x6xf32>
370 |     affine.for %arg1 = 0 to 64 {
371 |       affine.for %arg2 = 0 to 256 {
372 |         affine.for %arg3 = 0 to 6 {
373 |           affine.for %arg4 = 0 to 6 {
374 |             affine.store %cst, %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32>
375 |           }
376 |         }
377 |       }
378 |     }
379 |     affine.for %arg1 = 0 to 64 {
380 |       affine.for %arg2 = 0 to 256 {
381 |         affine.for %arg3 = 0 to 6 {
382 |           affine.for %arg4 = 0 to 6 {
383 |             affine.for %arg5 = 0 to 3 {
384 |               affine.for %arg6 = 0 to 3 {
385 |                 %16 = affine.load %alloc_15[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x256x13x13xf32>
386 |                 %17 = affine.load %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32>
387 |                 %18 = arith.maximumf %17, %16 : f32
388 |                 affine.store %18, %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32>
389 |               }
390 |             }
391 |           }
392 |         }
393 |       }
394 |     }
395 |     affine.for %arg1 = 0 to 64 {
396 |       affine.for %arg2 = 0 to 256 {
397 |         affine.for %arg3 = 0 to 6 {
398 |           affine.for %arg4 = 0 to 6 {
399 |             affine.store %cst_0, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32>
400 |           }
401 |         }
402 |       }
403 |     }
404 |     affine.for %arg1 = 0 to 64 {
405 |       affine.for %arg2 = 0 to 256 {
406 |         affine.for %arg3 = 0 to 6 {
407 |           affine.for %arg4 = 0 to 6 {
408 |             affine.for %arg5 = 0 to 1 {
409 |               affine.for %arg6 = 0 to 1 {
410 |                 %16 = affine.load %alloc_21[%arg1, %arg2, %arg3 + %arg5, %arg4 + %arg6] : memref<64x256x6x6xf32>
411 |                 %17 = affine.load %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32>
412 |                 %18 = arith.addf %17, %16 : f32
413 |                 affine.store %18, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32>
414 |               }
415 |             }
416 |           }
417 |         }
418 |       }
419 |     }
420 |     %collapse_shape = memref.collapse_shape %alloc_20 [[0], [1, 2, 3]] : memref<64x256x6x6xf32> into memref<64x9216xf32>
421 |     %alloc_22 = memref.alloc() {alignment = 64 : i64} : memref<9216x4096xf32>
422 |     affine.for %arg1 = 0 to 4096 {
423 |       affine.for %arg2 = 0 to 9216 {
424 |         %16 = affine.load %5[%arg1, %arg2] : memref<4096x9216xf32>
425 |         affine.store %16, %alloc_22[%arg2, %arg1] : memref<9216x4096xf32>
426 |       }
427 |     }
428 |     %alloc_23 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
429 |     %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
430 |     affine.for %arg1 = 0 to 64 {
431 |       affine.for %arg2 = 0 to 4096 {
432 |         affine.store %cst_0, %alloc_24[%arg1, %arg2] : memref<64x4096xf32>
433 |       }
434 |     }
435 |     %alloc_25 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
436 |     memref.copy %alloc_24, %alloc_25 : memref<64x4096xf32> to memref<64x4096xf32>
437 |     affine.for %arg1 = 0 to 64 {
438 |       affine.for %arg2 = 0 to 4096 {
439 |         affine.for %arg3 = 0 to 9216 {
440 |           %16 = affine.load %collapse_shape[%arg1, %arg3] : memref<64x9216xf32>
441 |           %17 = affine.load %alloc_22[%arg3, %arg2] : memref<9216x4096xf32>
442 |           %18 = affine.load %alloc_25[%arg1, %arg2] : memref<64x4096xf32>
443 |           %19 = arith.mulf %16, %17 : f32
444 |           %20 = arith.addf %18, %19 : f32
445 |           affine.store %20, %alloc_25[%arg1, %arg2] : memref<64x4096xf32>
446 |         }
447 |       }
448 |     }
449 |     %alloc_26 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
450 |     affine.for %arg1 = 0 to 64 {
451 |       affine.for %arg2 = 0 to 4096 {
452 |         %16 = affine.load %alloc_25[%arg1, %arg2] : memref<64x4096xf32>
453 |         %17 = affine.load %4[%arg2] : memref<4096xf32>
454 |         %18 = arith.addf %16, %17 : f32
455 |         affine.store %18, %alloc_26[%arg1, %arg2] : memref<64x4096xf32>
456 |       }
457 |     }
458 |     affine.for %arg1 = 0 to 64 {
459 |       affine.for %arg2 = 0 to 4096 {
460 |         %16 = affine.load %alloc_26[%arg1, %arg2] : memref<64x4096xf32>
461 |         %17 = arith.cmpf ugt, %16, %cst_0 : f32
462 |         %18 = arith.select %17, %16, %cst_0 : f32
463 |         affine.store %18, %alloc_23[%arg1, %arg2] : memref<64x4096xf32>
464 |       }
465 |     }
466 |     %alloc_27 = memref.alloc() {alignment = 64 : i64} : memref<4096x4096xf32>
467 |     affine.for %arg1 = 0 to 4096 {
468 |       affine.for %arg2 = 0 to 4096 {
469 |         %16 = affine.load %3[%arg1, %arg2] : memref<4096x4096xf32>
470 |         affine.store %16, %alloc_27[%arg2, %arg1] : memref<4096x4096xf32>
471 |       }
472 |     }
473 |     affine.for %arg1 = 0 to 64 {
474 |       affine.for %arg2 = 0 to 4096 {
475 |         affine.for %arg3 = 0 to 4096 {
476 |           %16 = affine.load %alloc_23[%arg1, %arg3] : memref<64x4096xf32>
477 |           %17 = affine.load %alloc_27[%arg3, %arg2] : memref<4096x4096xf32>
478 |           %18 = affine.load %alloc_24[%arg1, %arg2] : memref<64x4096xf32>
479 |           %19 = arith.mulf %16, %17 : f32
480 |           %20 = arith.addf %18, %19 : f32
481 |           affine.store %20, %alloc_24[%arg1, %arg2] : memref<64x4096xf32>
482 |         }
483 |       }
484 |     }
485 |     %alloc_28 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
486 |     affine.for %arg1 = 0 to 64 {
487 |       affine.for %arg2 = 0 to 4096 {
488 |         %16 = affine.load %alloc_24[%arg1, %arg2] : memref<64x4096xf32>
489 |         %17 = affine.load %2[%arg2] : memref<4096xf32>
490 |         %18 = arith.addf %16, %17 : f32
491 |         affine.store %18, %alloc_28[%arg1, %arg2] : memref<64x4096xf32>
492 |       }
493 |     }
494 |     affine.for %arg1 = 0 to 64 {
495 |       affine.for %arg2 = 0 to 4096 {
496 |         %16 = affine.load %alloc_28[%arg1, %arg2] : memref<64x4096xf32>
497 |         %17 = arith.cmpf ugt, %16, %cst_0 : f32
498 |         %18 = arith.select %17, %16, %cst_0 : f32
499 |         affine.store %18, %alloc_23[%arg1, %arg2] : memref<64x4096xf32>
500 |       }
501 |     }
502 |     %alloc_29 = memref.alloc() {alignment = 64 : i64} : memref<4096x1000xf32>
503 |     affine.for %arg1 = 0 to 1000 {
504 |       affine.for %arg2 = 0 to 4096 {
505 |         %16 = affine.load %1[%arg1, %arg2] : memref<1000x4096xf32>
506 |         affine.store %16, %alloc_29[%arg2, %arg1] : memref<4096x1000xf32>
507 |       }
508 |     }
509 |     %alloc_30 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32>
510 |     %alloc_31 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32>
511 |     affine.for %arg1 = 0 to 64 {
512 |       affine.for %arg2 = 0 to 1000 {
513 |         affine.store %cst_0, %alloc_31[%arg1, %arg2] : memref<64x1000xf32>
514 |       }
515 |     }
516 |     affine.for %arg1 = 0 to 64 {
517 |       affine.for %arg2 = 0 to 1000 {
518 |         affine.for %arg3 = 0 to 4096 {
519 |           %16 = affine.load %alloc_23[%arg1, %arg3] : memref<64x4096xf32>
520 |           %17 = affine.load %alloc_29[%arg3, %arg2] : memref<4096x1000xf32>
521 |           %18 = affine.load %alloc_31[%arg1, %arg2] : memref<64x1000xf32>
522 |           %19 = arith.mulf %16, %17 : f32
523 |           %20 = arith.addf %18, %19 : f32
524 |           affine.store %20, %alloc_31[%arg1, %arg2] : memref<64x1000xf32>
525 |         }
526 |       }
527 |     }
528 |     affine.for %arg1 = 0 to 64 {
529 |       affine.for %arg2 = 0 to 1000 {
530 |         %16 = affine.load %alloc_31[%arg1, %arg2] : memref<64x1000xf32>
531 |         %17 = affine.load %0[%arg2] : memref<1000xf32>
532 |         %18 = arith.addf %16, %17 : f32
533 |         affine.store %18, %alloc_30[%arg1, %arg2] : memref<64x1000xf32>
534 |       }
535 |     }
536 |     memref.dealloc %alloc_1 : memref<64x64x55x55xf32>
537 |     memref.dealloc %alloc_2 : memref<64x64x55x55xf32>
538 |     memref.dealloc %alloc_3 : memref<64x64x27x27xf32>
539 |     memref.dealloc %alloc_6 : memref<64x192x27x27xf32>
540 |     memref.dealloc %alloc_7 : memref<64x192x27x27xf32>
541 |     memref.dealloc %alloc_8 : memref<64x192x13x13xf32>
542 |     memref.dealloc %alloc_11 : memref<64x384x13x13xf32>
543 |     memref.dealloc %alloc_12 : memref<64x384x13x13xf32>
544 |     memref.dealloc %alloc_15 : memref<64x256x13x13xf32>
545 |     memref.dealloc %alloc_16 : memref<64x256x13x13xf32>
546 |     memref.dealloc %alloc_19 : memref<64x256x13x13xf32>
547 |     memref.dealloc %alloc_20 : memref<64x256x6x6xf32>
548 |     memref.dealloc %alloc_21 : memref<64x256x6x6xf32>
549 |     memref.dealloc %alloc_22 : memref<9216x4096xf32>
550 |     memref.dealloc %alloc_23 : memref<64x4096xf32>
551 |     memref.dealloc %alloc_24 : memref<64x4096xf32>
552 |     memref.dealloc %alloc_25 : memref<64x4096xf32>
553 |     memref.dealloc %alloc_26 : memref<64x4096xf32>
554 |     memref.dealloc %alloc_27 : memref<4096x4096xf32>
555 |     memref.dealloc %alloc_28 : memref<64x4096xf32>
556 |     memref.dealloc %alloc_29 : memref<4096x1000xf32>
557 |     memref.dealloc %alloc_30 : memref<64x1000xf32>
558 |     memref.dealloc %alloc_31 : memref<64x1000xf32>
559 |     memref.dealloc %alloc : memref<64x3x228x228xf32>
560 |     memref.dealloc %alloc_4 : memref<64x64x31x31xf32>
561 |     memref.dealloc %alloc_9 : memref<64x192x15x15xf32>
562 |     memref.dealloc %alloc_13 : memref<64x384x15x15xf32>
563 |     memref.dealloc %alloc_17 : memref<64x256x15x15xf32>
564 |     return %alloc_30 : memref<64x1000xf32>
565 |   }
566 | }
567 | 


--------------------------------------------------------------------------------
/Torch_MLIR/pytorch/vgg11.mlir:
--------------------------------------------------------------------------------
  1 | module attributes {torch.debug_module_name = "VGG"} {
  2 |   memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64}
  3 |   memref.global "private" constant @__constant_64x3x3x3xf32 : memref<64x3x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  4 |   memref.global "private" constant @__constant_128xf32 : memref<128xf32> = dense<0.000000e+00> {alignment = 64 : i64}
  5 |   memref.global "private" constant @__constant_128x64x3x3xf32 : memref<128x64x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  6 |   memref.global "private" constant @__constant_256xf32 : memref<256xf32> = dense<0.000000e+00> {alignment = 64 : i64}
  7 |   memref.global "private" constant @__constant_256x128x3x3xf32 : memref<256x128x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  8 |   memref.global "private" constant @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
  9 |   memref.global "private" constant @__constant_512xf32 : memref<512xf32> = dense<0.000000e+00> {alignment = 64 : i64}
 10 |   memref.global "private" constant @__constant_512x256x3x3xf32 : memref<512x256x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 11 |   memref.global "private" constant @__constant_512x512x3x3xf32_1 : memref<512x512x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 12 |   memref.global "private" constant @__constant_512x512x3x3xf32_0 : memref<512x512x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 13 |   memref.global "private" constant @__constant_512x512x3x3xf32 : memref<512x512x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 14 |   memref.global "private" constant @__constant_4096xf32 : memref<4096xf32> = dense<0.000000e+00> {alignment = 64 : i64}
 15 |   memref.global "private" constant @__constant_4096x25088xf32 : memref<4096x25088xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 16 |   memref.global "private" constant @__constant_4096x4096xf32 : memref<4096x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 17 |   memref.global "private" constant @__constant_1000xf32 : memref<1000xf32> = dense<0.000000e+00> {alignment = 64 : i64}
 18 |   memref.global "private" constant @__constant_1000x4096xf32 : memref<1000x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64}
 19 |   
 20 |   func.func @forward(%arg0: memref<64x3x224x224xf32>) -> memref<64x1000xf32> {
 21 |     %cst = arith.constant -3.40282347E+38 : f32
 22 |     %cst_0 = arith.constant 0.000000e+00 : f32
 23 |     %0 = memref.get_global @__constant_1000x4096xf32 : memref<1000x4096xf32>
 24 |     %1 = memref.get_global @__constant_4096x4096xf32 : memref<4096x4096xf32>
 25 |     %2 = memref.get_global @__constant_4096x25088xf32 : memref<4096x25088xf32>
 26 |     %3 = memref.get_global @__constant_512x512x3x3xf32 : memref<512x512x3x3xf32>
 27 |     %4 = memref.get_global @__constant_512x512x3x3xf32_0 : memref<512x512x3x3xf32>
 28 |     %5 = memref.get_global @__constant_512x512x3x3xf32_1 : memref<512x512x3x3xf32>
 29 |     %6 = memref.get_global @__constant_512x256x3x3xf32 : memref<512x256x3x3xf32>
 30 |     %7 = memref.get_global @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32>
 31 |     %8 = memref.get_global @__constant_256x128x3x3xf32 : memref<256x128x3x3xf32>
 32 |     %9 = memref.get_global @__constant_128x64x3x3xf32 : memref<128x64x3x3xf32>
 33 |     %10 = memref.get_global @__constant_64x3x3x3xf32 : memref<64x3x3x3xf32>
 34 |     %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x3x226x226xf32>
 35 |     affine.for %arg1 = 0 to 64 {
 36 |       affine.for %arg2 = 0 to 3 {
 37 |         affine.for %arg3 = 0 to 226 {
 38 |           affine.for %arg4 = 0 to 226 {
 39 |             affine.store %cst_0, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<64x3x226x226xf32>
 40 |           }
 41 |         }
 42 |       }
 43 |     }
 44 |     %subview = memref.subview %alloc[0, 0, 1, 1] [64, 3, 224, 224] [1, 1, 1, 1] : memref<64x3x226x226xf32> to memref<64x3x224x224xf32, strided<[153228, 51076, 226, 1], offset: 227>>
 45 |     memref.copy %arg0, %subview : memref<64x3x224x224xf32> to memref<64x3x224x224xf32, strided<[153228, 51076, 226, 1], offset: 227>>
 46 |     %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<64x64x224x224xf32>
 47 |     %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<64x64x224x224xf32>
 48 |     affine.for %arg1 = 0 to 64 {
 49 |       affine.for %arg2 = 0 to 64 {
 50 |         affine.for %arg3 = 0 to 224 {
 51 |           affine.for %arg4 = 0 to 224 {
 52 |             affine.store %cst_0, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32>
 53 |           }
 54 |         }
 55 |       }
 56 |     }
 57 |     affine.for %arg1 = 0 to 64 {
 58 |       affine.for %arg2 = 0 to 64 {
 59 |         affine.for %arg3 = 0 to 224 {
 60 |           affine.for %arg4 = 0 to 224 {
 61 |             affine.for %arg5 = 0 to 3 {
 62 |               affine.for %arg6 = 0 to 3 {
 63 |                 affine.for %arg7 = 0 to 3 {
 64 |                   %11 = affine.load %alloc[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x3x226x226xf32>
 65 |                   %12 = affine.load %10[%arg2, %arg5, %arg6, %arg7] : memref<64x3x3x3xf32>
 66 |                   %13 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32>
 67 |                   %14 = arith.mulf %11, %12 : f32
 68 |                   %15 = arith.addf %13, %14 : f32
 69 |                   affine.store %15, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32>
 70 |                 }
 71 |               }
 72 |             }
 73 |           }
 74 |         }
 75 |       }
 76 |     }
 77 |     affine.for %arg1 = 0 to 64 {
 78 |       affine.for %arg2 = 0 to 64 {
 79 |         affine.for %arg3 = 0 to 224 {
 80 |           affine.for %arg4 = 0 to 224 {
 81 |             %11 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32>
 82 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
 83 |             %13 = arith.select %12, %11, %cst_0 : f32
 84 |             affine.store %13, %alloc_1[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32>
 85 |           }
 86 |         }
 87 |       }
 88 |     }
 89 |     %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<64x64x112x112xf32>
 90 |     affine.for %arg1 = 0 to 64 {
 91 |       affine.for %arg2 = 0 to 64 {
 92 |         affine.for %arg3 = 0 to 112 {
 93 |           affine.for %arg4 = 0 to 112 {
 94 |             affine.store %cst, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x112x112xf32>
 95 |           }
 96 |         }
 97 |       }
 98 |     }
 99 |     affine.for %arg1 = 0 to 64 {
100 |       affine.for %arg2 = 0 to 64 {
101 |         affine.for %arg3 = 0 to 112 {
102 |           affine.for %arg4 = 0 to 112 {
103 |             affine.for %arg5 = 0 to 2 {
104 |               affine.for %arg6 = 0 to 2 {
105 |                 %11 = affine.load %alloc_1[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x64x224x224xf32>
106 |                 %12 = affine.load %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x112x112xf32>
107 |                 %13 = arith.maximumf %12, %11 : f32
108 |                 affine.store %13, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x112x112xf32>
109 |               }
110 |             }
111 |           }
112 |         }
113 |       }
114 |     }
115 |     %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<64x64x114x114xf32>
116 |     affine.for %arg1 = 0 to 64 {
117 |       affine.for %arg2 = 0 to 64 {
118 |         affine.for %arg3 = 0 to 114 {
119 |           affine.for %arg4 = 0 to 114 {
120 |             affine.store %cst_0, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<64x64x114x114xf32>
121 |           }
122 |         }
123 |       }
124 |     }
125 |     %subview_5 = memref.subview %alloc_4[0, 0, 1, 1] [64, 64, 112, 112] [1, 1, 1, 1] : memref<64x64x114x114xf32> to memref<64x64x112x112xf32, strided<[831744, 12996, 114, 1], offset: 115>>
126 |     memref.copy %alloc_3, %subview_5 : memref<64x64x112x112xf32> to memref<64x64x112x112xf32, strided<[831744, 12996, 114, 1], offset: 115>>
127 |     %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<64x128x112x112xf32>
128 |     %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<64x128x112x112xf32>
129 |     affine.for %arg1 = 0 to 64 {
130 |       affine.for %arg2 = 0 to 128 {
131 |         affine.for %arg3 = 0 to 112 {
132 |           affine.for %arg4 = 0 to 112 {
133 |             affine.store %cst_0, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32>
134 |           }
135 |         }
136 |       }
137 |     }
138 |     affine.for %arg1 = 0 to 64 {
139 |       affine.for %arg2 = 0 to 128 {
140 |         affine.for %arg3 = 0 to 112 {
141 |           affine.for %arg4 = 0 to 112 {
142 |             affine.for %arg5 = 0 to 64 {
143 |               affine.for %arg6 = 0 to 3 {
144 |                 affine.for %arg7 = 0 to 3 {
145 |                   %11 = affine.load %alloc_4[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x64x114x114xf32>
146 |                   %12 = affine.load %9[%arg2, %arg5, %arg6, %arg7] : memref<128x64x3x3xf32>
147 |                   %13 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32>
148 |                   %14 = arith.mulf %11, %12 : f32
149 |                   %15 = arith.addf %13, %14 : f32
150 |                   affine.store %15, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32>
151 |                 }
152 |               }
153 |             }
154 |           }
155 |         }
156 |       }
157 |     }
158 |     affine.for %arg1 = 0 to 64 {
159 |       affine.for %arg2 = 0 to 128 {
160 |         affine.for %arg3 = 0 to 112 {
161 |           affine.for %arg4 = 0 to 112 {
162 |             %11 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32>
163 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
164 |             %13 = arith.select %12, %11, %cst_0 : f32
165 |             affine.store %13, %alloc_6[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32>
166 |           }
167 |         }
168 |       }
169 |     }
170 |     %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<64x128x56x56xf32>
171 |     affine.for %arg1 = 0 to 64 {
172 |       affine.for %arg2 = 0 to 128 {
173 |         affine.for %arg3 = 0 to 56 {
174 |           affine.for %arg4 = 0 to 56 {
175 |             affine.store %cst, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x128x56x56xf32>
176 |           }
177 |         }
178 |       }
179 |     }
180 |     affine.for %arg1 = 0 to 64 {
181 |       affine.for %arg2 = 0 to 128 {
182 |         affine.for %arg3 = 0 to 56 {
183 |           affine.for %arg4 = 0 to 56 {
184 |             affine.for %arg5 = 0 to 2 {
185 |               affine.for %arg6 = 0 to 2 {
186 |                 %11 = affine.load %alloc_6[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x128x112x112xf32>
187 |                 %12 = affine.load %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x128x56x56xf32>
188 |                 %13 = arith.maximumf %12, %11 : f32
189 |                 affine.store %13, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x128x56x56xf32>
190 |               }
191 |             }
192 |           }
193 |         }
194 |       }
195 |     }
196 |     %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<64x128x58x58xf32>
197 |     affine.for %arg1 = 0 to 64 {
198 |       affine.for %arg2 = 0 to 128 {
199 |         affine.for %arg3 = 0 to 58 {
200 |           affine.for %arg4 = 0 to 58 {
201 |             affine.store %cst_0, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<64x128x58x58xf32>
202 |           }
203 |         }
204 |       }
205 |     }
206 |     %subview_10 = memref.subview %alloc_9[0, 0, 1, 1] [64, 128, 56, 56] [1, 1, 1, 1] : memref<64x128x58x58xf32> to memref<64x128x56x56xf32, strided<[430592, 3364, 58, 1], offset: 59>>
207 |     memref.copy %alloc_8, %subview_10 : memref<64x128x56x56xf32> to memref<64x128x56x56xf32, strided<[430592, 3364, 58, 1], offset: 59>>
208 |     %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<64x256x56x56xf32>
209 |     %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<64x256x56x56xf32>
210 |     affine.for %arg1 = 0 to 64 {
211 |       affine.for %arg2 = 0 to 256 {
212 |         affine.for %arg3 = 0 to 56 {
213 |           affine.for %arg4 = 0 to 56 {
214 |             affine.store %cst_0, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
215 |           }
216 |         }
217 |       }
218 |     }
219 |     %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<64x256x56x56xf32>
220 |     memref.copy %alloc_12, %alloc_13 : memref<64x256x56x56xf32> to memref<64x256x56x56xf32>
221 |     affine.for %arg1 = 0 to 64 {
222 |       affine.for %arg2 = 0 to 256 {
223 |         affine.for %arg3 = 0 to 56 {
224 |           affine.for %arg4 = 0 to 56 {
225 |             affine.for %arg5 = 0 to 128 {
226 |               affine.for %arg6 = 0 to 3 {
227 |                 affine.for %arg7 = 0 to 3 {
228 |                   %11 = affine.load %alloc_9[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x128x58x58xf32>
229 |                   %12 = affine.load %8[%arg2, %arg5, %arg6, %arg7] : memref<256x128x3x3xf32>
230 |                   %13 = affine.load %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
231 |                   %14 = arith.mulf %11, %12 : f32
232 |                   %15 = arith.addf %13, %14 : f32
233 |                   affine.store %15, %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
234 |                 }
235 |               }
236 |             }
237 |           }
238 |         }
239 |       }
240 |     }
241 |     affine.for %arg1 = 0 to 64 {
242 |       affine.for %arg2 = 0 to 256 {
243 |         affine.for %arg3 = 0 to 56 {
244 |           affine.for %arg4 = 0 to 56 {
245 |             %11 = affine.load %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
246 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
247 |             %13 = arith.select %12, %11, %cst_0 : f32
248 |             affine.store %13, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
249 |           }
250 |         }
251 |       }
252 |     }
253 |     %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<64x256x58x58xf32>
254 |     affine.for %arg1 = 0 to 64 {
255 |       affine.for %arg2 = 0 to 256 {
256 |         affine.for %arg3 = 0 to 58 {
257 |           affine.for %arg4 = 0 to 58 {
258 |             affine.store %cst_0, %alloc_14[%arg1, %arg2, %arg3, %arg4] : memref<64x256x58x58xf32>
259 |           }
260 |         }
261 |       }
262 |     }
263 |     %subview_15 = memref.subview %alloc_14[0, 0, 1, 1] [64, 256, 56, 56] [1, 1, 1, 1] : memref<64x256x58x58xf32> to memref<64x256x56x56xf32, strided<[861184, 3364, 58, 1], offset: 59>>
264 |     memref.copy %alloc_11, %subview_15 : memref<64x256x56x56xf32> to memref<64x256x56x56xf32, strided<[861184, 3364, 58, 1], offset: 59>>
265 |     affine.for %arg1 = 0 to 64 {
266 |       affine.for %arg2 = 0 to 256 {
267 |         affine.for %arg3 = 0 to 56 {
268 |           affine.for %arg4 = 0 to 56 {
269 |             affine.for %arg5 = 0 to 256 {
270 |               affine.for %arg6 = 0 to 3 {
271 |                 affine.for %arg7 = 0 to 3 {
272 |                   %11 = affine.load %alloc_14[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x256x58x58xf32>
273 |                   %12 = affine.load %7[%arg2, %arg5, %arg6, %arg7] : memref<256x256x3x3xf32>
274 |                   %13 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
275 |                   %14 = arith.mulf %11, %12 : f32
276 |                   %15 = arith.addf %13, %14 : f32
277 |                   affine.store %15, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
278 |                 }
279 |               }
280 |             }
281 |           }
282 |         }
283 |       }
284 |     }
285 |     affine.for %arg1 = 0 to 64 {
286 |       affine.for %arg2 = 0 to 256 {
287 |         affine.for %arg3 = 0 to 56 {
288 |           affine.for %arg4 = 0 to 56 {
289 |             %11 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
290 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
291 |             %13 = arith.select %12, %11, %cst_0 : f32
292 |             affine.store %13, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32>
293 |           }
294 |         }
295 |       }
296 |     }
297 |     %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<64x256x28x28xf32>
298 |     affine.for %arg1 = 0 to 64 {
299 |       affine.for %arg2 = 0 to 256 {
300 |         affine.for %arg3 = 0 to 28 {
301 |           affine.for %arg4 = 0 to 28 {
302 |             affine.store %cst, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x28x28xf32>
303 |           }
304 |         }
305 |       }
306 |     }
307 |     affine.for %arg1 = 0 to 64 {
308 |       affine.for %arg2 = 0 to 256 {
309 |         affine.for %arg3 = 0 to 28 {
310 |           affine.for %arg4 = 0 to 28 {
311 |             affine.for %arg5 = 0 to 2 {
312 |               affine.for %arg6 = 0 to 2 {
313 |                 %11 = affine.load %alloc_11[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x256x56x56xf32>
314 |                 %12 = affine.load %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x28x28xf32>
315 |                 %13 = arith.maximumf %12, %11 : f32
316 |                 affine.store %13, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x28x28xf32>
317 |               }
318 |             }
319 |           }
320 |         }
321 |       }
322 |     }
323 |     %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<64x256x30x30xf32>
324 |     affine.for %arg1 = 0 to 64 {
325 |       affine.for %arg2 = 0 to 256 {
326 |         affine.for %arg3 = 0 to 30 {
327 |           affine.for %arg4 = 0 to 30 {
328 |             affine.store %cst_0, %alloc_17[%arg1, %arg2, %arg3, %arg4] : memref<64x256x30x30xf32>
329 |           }
330 |         }
331 |       }
332 |     }
333 |     %subview_18 = memref.subview %alloc_17[0, 0, 1, 1] [64, 256, 28, 28] [1, 1, 1, 1] : memref<64x256x30x30xf32> to memref<64x256x28x28xf32, strided<[230400, 900, 30, 1], offset: 31>>
334 |     memref.copy %alloc_16, %subview_18 : memref<64x256x28x28xf32> to memref<64x256x28x28xf32, strided<[230400, 900, 30, 1], offset: 31>>
335 |     %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<64x512x28x28xf32>
336 |     %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<64x512x28x28xf32>
337 |     affine.for %arg1 = 0 to 64 {
338 |       affine.for %arg2 = 0 to 512 {
339 |         affine.for %arg3 = 0 to 28 {
340 |           affine.for %arg4 = 0 to 28 {
341 |             affine.store %cst_0, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
342 |           }
343 |         }
344 |       }
345 |     }
346 |     %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<64x512x28x28xf32>
347 |     memref.copy %alloc_20, %alloc_21 : memref<64x512x28x28xf32> to memref<64x512x28x28xf32>
348 |     affine.for %arg1 = 0 to 64 {
349 |       affine.for %arg2 = 0 to 512 {
350 |         affine.for %arg3 = 0 to 28 {
351 |           affine.for %arg4 = 0 to 28 {
352 |             affine.for %arg5 = 0 to 256 {
353 |               affine.for %arg6 = 0 to 3 {
354 |                 affine.for %arg7 = 0 to 3 {
355 |                   %11 = affine.load %alloc_17[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x256x30x30xf32>
356 |                   %12 = affine.load %6[%arg2, %arg5, %arg6, %arg7] : memref<512x256x3x3xf32>
357 |                   %13 = affine.load %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
358 |                   %14 = arith.mulf %11, %12 : f32
359 |                   %15 = arith.addf %13, %14 : f32
360 |                   affine.store %15, %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
361 |                 }
362 |               }
363 |             }
364 |           }
365 |         }
366 |       }
367 |     }
368 |     affine.for %arg1 = 0 to 64 {
369 |       affine.for %arg2 = 0 to 512 {
370 |         affine.for %arg3 = 0 to 28 {
371 |           affine.for %arg4 = 0 to 28 {
372 |             %11 = affine.load %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
373 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
374 |             %13 = arith.select %12, %11, %cst_0 : f32
375 |             affine.store %13, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
376 |           }
377 |         }
378 |       }
379 |     }
380 |     %alloc_22 = memref.alloc() {alignment = 64 : i64} : memref<64x512x30x30xf32>
381 |     affine.for %arg1 = 0 to 64 {
382 |       affine.for %arg2 = 0 to 512 {
383 |         affine.for %arg3 = 0 to 30 {
384 |           affine.for %arg4 = 0 to 30 {
385 |             affine.store %cst_0, %alloc_22[%arg1, %arg2, %arg3, %arg4] : memref<64x512x30x30xf32>
386 |           }
387 |         }
388 |       }
389 |     }
390 |     %subview_23 = memref.subview %alloc_22[0, 0, 1, 1] [64, 512, 28, 28] [1, 1, 1, 1] : memref<64x512x30x30xf32> to memref<64x512x28x28xf32, strided<[460800, 900, 30, 1], offset: 31>>
391 |     memref.copy %alloc_19, %subview_23 : memref<64x512x28x28xf32> to memref<64x512x28x28xf32, strided<[460800, 900, 30, 1], offset: 31>>
392 |     affine.for %arg1 = 0 to 64 {
393 |       affine.for %arg2 = 0 to 512 {
394 |         affine.for %arg3 = 0 to 28 {
395 |           affine.for %arg4 = 0 to 28 {
396 |             affine.for %arg5 = 0 to 512 {
397 |               affine.for %arg6 = 0 to 3 {
398 |                 affine.for %arg7 = 0 to 3 {
399 |                   %11 = affine.load %alloc_22[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x512x30x30xf32>
400 |                   %12 = affine.load %5[%arg2, %arg5, %arg6, %arg7] : memref<512x512x3x3xf32>
401 |                   %13 = affine.load %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
402 |                   %14 = arith.mulf %11, %12 : f32
403 |                   %15 = arith.addf %13, %14 : f32
404 |                   affine.store %15, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
405 |                 }
406 |               }
407 |             }
408 |           }
409 |         }
410 |       }
411 |     }
412 |     affine.for %arg1 = 0 to 64 {
413 |       affine.for %arg2 = 0 to 512 {
414 |         affine.for %arg3 = 0 to 28 {
415 |           affine.for %arg4 = 0 to 28 {
416 |             %11 = affine.load %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
417 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
418 |             %13 = arith.select %12, %11, %cst_0 : f32
419 |             affine.store %13, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32>
420 |           }
421 |         }
422 |       }
423 |     }
424 |     %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<64x512x14x14xf32>
425 |     affine.for %arg1 = 0 to 64 {
426 |       affine.for %arg2 = 0 to 512 {
427 |         affine.for %arg3 = 0 to 14 {
428 |           affine.for %arg4 = 0 to 14 {
429 |             affine.store %cst, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
430 |           }
431 |         }
432 |       }
433 |     }
434 |     affine.for %arg1 = 0 to 64 {
435 |       affine.for %arg2 = 0 to 512 {
436 |         affine.for %arg3 = 0 to 14 {
437 |           affine.for %arg4 = 0 to 14 {
438 |             affine.for %arg5 = 0 to 2 {
439 |               affine.for %arg6 = 0 to 2 {
440 |                 %11 = affine.load %alloc_19[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x512x28x28xf32>
441 |                 %12 = affine.load %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
442 |                 %13 = arith.maximumf %12, %11 : f32
443 |                 affine.store %13, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
444 |               }
445 |             }
446 |           }
447 |         }
448 |       }
449 |     }
450 |     %alloc_25 = memref.alloc() {alignment = 64 : i64} : memref<64x512x16x16xf32>
451 |     affine.for %arg1 = 0 to 64 {
452 |       affine.for %arg2 = 0 to 512 {
453 |         affine.for %arg3 = 0 to 16 {
454 |           affine.for %arg4 = 0 to 16 {
455 |             affine.store %cst_0, %alloc_25[%arg1, %arg2, %arg3, %arg4] : memref<64x512x16x16xf32>
456 |           }
457 |         }
458 |       }
459 |     }
460 |     %subview_26 = memref.subview %alloc_25[0, 0, 1, 1] [64, 512, 14, 14] [1, 1, 1, 1] : memref<64x512x16x16xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>>
461 |     memref.copy %alloc_24, %subview_26 : memref<64x512x14x14xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>>
462 |     %alloc_27 = memref.alloc() {alignment = 64 : i64} : memref<64x512x14x14xf32>
463 |     affine.for %arg1 = 0 to 64 {
464 |       affine.for %arg2 = 0 to 512 {
465 |         affine.for %arg3 = 0 to 14 {
466 |           affine.for %arg4 = 0 to 14 {
467 |             affine.store %cst_0, %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
468 |           }
469 |         }
470 |       }
471 |     }
472 |     %alloc_28 = memref.alloc() {alignment = 64 : i64} : memref<64x512x14x14xf32>
473 |     memref.copy %alloc_27, %alloc_28 : memref<64x512x14x14xf32> to memref<64x512x14x14xf32>
474 |     affine.for %arg1 = 0 to 64 {
475 |       affine.for %arg2 = 0 to 512 {
476 |         affine.for %arg3 = 0 to 14 {
477 |           affine.for %arg4 = 0 to 14 {
478 |             affine.for %arg5 = 0 to 512 {
479 |               affine.for %arg6 = 0 to 3 {
480 |                 affine.for %arg7 = 0 to 3 {
481 |                   %11 = affine.load %alloc_25[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x512x16x16xf32>
482 |                   %12 = affine.load %4[%arg2, %arg5, %arg6, %arg7] : memref<512x512x3x3xf32>
483 |                   %13 = affine.load %alloc_28[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
484 |                   %14 = arith.mulf %11, %12 : f32
485 |                   %15 = arith.addf %13, %14 : f32
486 |                   affine.store %15, %alloc_28[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
487 |                 }
488 |               }
489 |             }
490 |           }
491 |         }
492 |       }
493 |     }
494 |     affine.for %arg1 = 0 to 64 {
495 |       affine.for %arg2 = 0 to 512 {
496 |         affine.for %arg3 = 0 to 14 {
497 |           affine.for %arg4 = 0 to 14 {
498 |             %11 = affine.load %alloc_28[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
499 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
500 |             %13 = arith.select %12, %11, %cst_0 : f32
501 |             affine.store %13, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
502 |           }
503 |         }
504 |       }
505 |     }
506 |     %alloc_29 = memref.alloc() {alignment = 64 : i64} : memref<64x512x16x16xf32>
507 |     affine.for %arg1 = 0 to 64 {
508 |       affine.for %arg2 = 0 to 512 {
509 |         affine.for %arg3 = 0 to 16 {
510 |           affine.for %arg4 = 0 to 16 {
511 |             affine.store %cst_0, %alloc_29[%arg1, %arg2, %arg3, %arg4] : memref<64x512x16x16xf32>
512 |           }
513 |         }
514 |       }
515 |     }
516 |     %subview_30 = memref.subview %alloc_29[0, 0, 1, 1] [64, 512, 14, 14] [1, 1, 1, 1] : memref<64x512x16x16xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>>
517 |     memref.copy %alloc_24, %subview_30 : memref<64x512x14x14xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>>
518 |     affine.for %arg1 = 0 to 64 {
519 |       affine.for %arg2 = 0 to 512 {
520 |         affine.for %arg3 = 0 to 14 {
521 |           affine.for %arg4 = 0 to 14 {
522 |             affine.for %arg5 = 0 to 512 {
523 |               affine.for %arg6 = 0 to 3 {
524 |                 affine.for %arg7 = 0 to 3 {
525 |                   %11 = affine.load %alloc_29[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x512x16x16xf32>
526 |                   %12 = affine.load %3[%arg2, %arg5, %arg6, %arg7] : memref<512x512x3x3xf32>
527 |                   %13 = affine.load %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
528 |                   %14 = arith.mulf %11, %12 : f32
529 |                   %15 = arith.addf %13, %14 : f32
530 |                   affine.store %15, %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
531 |                 }
532 |               }
533 |             }
534 |           }
535 |         }
536 |       }
537 |     }
538 |     affine.for %arg1 = 0 to 64 {
539 |       affine.for %arg2 = 0 to 512 {
540 |         affine.for %arg3 = 0 to 14 {
541 |           affine.for %arg4 = 0 to 14 {
542 |             %11 = affine.load %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
543 |             %12 = arith.cmpf ugt, %11, %cst_0 : f32
544 |             %13 = arith.select %12, %11, %cst_0 : f32
545 |             affine.store %13, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32>
546 |           }
547 |         }
548 |       }
549 |     }
550 |     %alloc_31 = memref.alloc() {alignment = 64 : i64} : memref<64x512x7x7xf32>
551 |     %alloc_32 = memref.alloc() {alignment = 64 : i64} : memref<64x512x7x7xf32>
552 |     affine.for %arg1 = 0 to 64 {
553 |       affine.for %arg2 = 0 to 512 {
554 |         affine.for %arg3 = 0 to 7 {
555 |           affine.for %arg4 = 0 to 7 {
556 |             affine.store %cst, %alloc_32[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32>
557 |           }
558 |         }
559 |       }
560 |     }
561 |     affine.for %arg1 = 0 to 64 {
562 |       affine.for %arg2 = 0 to 512 {
563 |         affine.for %arg3 = 0 to 7 {
564 |           affine.for %arg4 = 0 to 7 {
565 |             affine.for %arg5 = 0 to 2 {
566 |               affine.for %arg6 = 0 to 2 {
567 |                 %11 = affine.load %alloc_24[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x512x14x14xf32>
568 |                 %12 = affine.load %alloc_32[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32>
569 |                 %13 = arith.maximumf %12, %11 : f32
570 |                 affine.store %13, %alloc_32[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32>
571 |               }
572 |             }
573 |           }
574 |         }
575 |       }
576 |     }
577 |     affine.for %arg1 = 0 to 64 {
578 |       affine.for %arg2 = 0 to 512 {
579 |         affine.for %arg3 = 0 to 7 {
580 |           affine.for %arg4 = 0 to 7 {
581 |             affine.store %cst_0, %alloc_31[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32>
582 |           }
583 |         }
584 |       }
585 |     }
586 |     affine.for %arg1 = 0 to 64 {
587 |       affine.for %arg2 = 0 to 512 {
588 |         affine.for %arg3 = 0 to 7 {
589 |           affine.for %arg4 = 0 to 7 {
590 |             affine.for %arg5 = 0 to 1 {
591 |               affine.for %arg6 = 0 to 1 {
592 |                 %11 = affine.load %alloc_32[%arg1, %arg2, %arg3 + %arg5, %arg4 + %arg6] : memref<64x512x7x7xf32>
593 |                 %12 = affine.load %alloc_31[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32>
594 |                 %13 = arith.addf %12, %11 : f32
595 |                 affine.store %13, %alloc_31[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32>
596 |               }
597 |             }
598 |           }
599 |         }
600 |       }
601 |     }
602 |     %collapse_shape = memref.collapse_shape %alloc_31 [[0], [1, 2, 3]] : memref<64x512x7x7xf32> into memref<64x25088xf32>
603 |     %alloc_33 = memref.alloc() {alignment = 64 : i64} : memref<25088x4096xf32>
604 |     affine.for %arg1 = 0 to 4096 {
605 |       affine.for %arg2 = 0 to 25088 {
606 |         %11 = affine.load %2[%arg1, %arg2] : memref<4096x25088xf32>
607 |         affine.store %11, %alloc_33[%arg2, %arg1] : memref<25088x4096xf32>
608 |       }
609 |     }
610 |     %alloc_34 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
611 |     %alloc_35 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
612 |     affine.for %arg1 = 0 to 64 {
613 |       affine.for %arg2 = 0 to 4096 {
614 |         affine.store %cst_0, %alloc_35[%arg1, %arg2] : memref<64x4096xf32>
615 |       }
616 |     }
617 |     %alloc_36 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
618 |     memref.copy %alloc_35, %alloc_36 : memref<64x4096xf32> to memref<64x4096xf32>
619 |     affine.for %arg1 = 0 to 64 {
620 |       affine.for %arg2 = 0 to 4096 {
621 |         affine.for %arg3 = 0 to 25088 {
622 |           %11 = affine.load %collapse_shape[%arg1, %arg3] : memref<64x25088xf32>
623 |           %12 = affine.load %alloc_33[%arg3, %arg2] : memref<25088x4096xf32>
624 |           %13 = affine.load %alloc_36[%arg1, %arg2] : memref<64x4096xf32>
625 |           %14 = arith.mulf %11, %12 : f32
626 |           %15 = arith.addf %13, %14 : f32
627 |           affine.store %15, %alloc_36[%arg1, %arg2] : memref<64x4096xf32>
628 |         }
629 |       }
630 |     }
631 |     %alloc_37 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
632 |     affine.for %arg1 = 0 to 64 {
633 |       affine.for %arg2 = 0 to 4096 {
634 |         %11 = affine.load %alloc_36[%arg1, %arg2] : memref<64x4096xf32>
635 |         %12 = arith.addf %11, %cst_0 : f32
636 |         affine.store %12, %alloc_37[%arg1, %arg2] : memref<64x4096xf32>
637 |       }
638 |     }
639 |     affine.for %arg1 = 0 to 64 {
640 |       affine.for %arg2 = 0 to 4096 {
641 |         %11 = affine.load %alloc_37[%arg1, %arg2] : memref<64x4096xf32>
642 |         %12 = arith.cmpf ugt, %11, %cst_0 : f32
643 |         %13 = arith.select %12, %11, %cst_0 : f32
644 |         affine.store %13, %alloc_34[%arg1, %arg2] : memref<64x4096xf32>
645 |       }
646 |     }
647 |     %alloc_38 = memref.alloc() {alignment = 64 : i64} : memref<4096x4096xf32>
648 |     affine.for %arg1 = 0 to 4096 {
649 |       affine.for %arg2 = 0 to 4096 {
650 |         %11 = affine.load %1[%arg1, %arg2] : memref<4096x4096xf32>
651 |         affine.store %11, %alloc_38[%arg2, %arg1] : memref<4096x4096xf32>
652 |       }
653 |     }
654 |     affine.for %arg1 = 0 to 64 {
655 |       affine.for %arg2 = 0 to 4096 {
656 |         affine.for %arg3 = 0 to 4096 {
657 |           %11 = affine.load %alloc_34[%arg1, %arg3] : memref<64x4096xf32>
658 |           %12 = affine.load %alloc_38[%arg3, %arg2] : memref<4096x4096xf32>
659 |           %13 = affine.load %alloc_35[%arg1, %arg2] : memref<64x4096xf32>
660 |           %14 = arith.mulf %11, %12 : f32
661 |           %15 = arith.addf %13, %14 : f32
662 |           affine.store %15, %alloc_35[%arg1, %arg2] : memref<64x4096xf32>
663 |         }
664 |       }
665 |     }
666 |     %alloc_39 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32>
667 |     affine.for %arg1 = 0 to 64 {
668 |       affine.for %arg2 = 0 to 4096 {
669 |         %11 = affine.load %alloc_35[%arg1, %arg2] : memref<64x4096xf32>
670 |         %12 = arith.addf %11, %cst_0 : f32
671 |         affine.store %12, %alloc_39[%arg1, %arg2] : memref<64x4096xf32>
672 |       }
673 |     }
674 |     affine.for %arg1 = 0 to 64 {
675 |       affine.for %arg2 = 0 to 4096 {
676 |         %11 = affine.load %alloc_39[%arg1, %arg2] : memref<64x4096xf32>
677 |         %12 = arith.cmpf ugt, %11, %cst_0 : f32
678 |         %13 = arith.select %12, %11, %cst_0 : f32
679 |         affine.store %13, %alloc_34[%arg1, %arg2] : memref<64x4096xf32>
680 |       }
681 |     }
682 |     %alloc_40 = memref.alloc() {alignment = 64 : i64} : memref<4096x1000xf32>
683 |     affine.for %arg1 = 0 to 1000 {
684 |       affine.for %arg2 = 0 to 4096 {
685 |         %11 = affine.load %0[%arg1, %arg2] : memref<1000x4096xf32>
686 |         affine.store %11, %alloc_40[%arg2, %arg1] : memref<4096x1000xf32>
687 |       }
688 |     }
689 |     %alloc_41 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32>
690 |     %alloc_42 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32>
691 |     affine.for %arg1 = 0 to 64 {
692 |       affine.for %arg2 = 0 to 1000 {
693 |         affine.store %cst_0, %alloc_42[%arg1, %arg2] : memref<64x1000xf32>
694 |       }
695 |     }
696 |     affine.for %arg1 = 0 to 64 {
697 |       affine.for %arg2 = 0 to 1000 {
698 |         affine.for %arg3 = 0 to 4096 {
699 |           %11 = affine.load %alloc_34[%arg1, %arg3] : memref<64x4096xf32>
700 |           %12 = affine.load %alloc_40[%arg3, %arg2] : memref<4096x1000xf32>
701 |           %13 = affine.load %alloc_42[%arg1, %arg2] : memref<64x1000xf32>
702 |           %14 = arith.mulf %11, %12 : f32
703 |           %15 = arith.addf %13, %14 : f32
704 |           affine.store %15, %alloc_42[%arg1, %arg2] : memref<64x1000xf32>
705 |         }
706 |       }
707 |     }
708 |     affine.for %arg1 = 0 to 64 {
709 |       affine.for %arg2 = 0 to 1000 {
710 |         %11 = affine.load %alloc_42[%arg1, %arg2] : memref<64x1000xf32>
711 |         %12 = arith.addf %11, %cst_0 : f32
712 |         affine.store %12, %alloc_41[%arg1, %arg2] : memref<64x1000xf32>
713 |       }
714 |     }
715 |     memref.dealloc %alloc_1 : memref<64x64x224x224xf32>
716 |     memref.dealloc %alloc_2 : memref<64x64x224x224xf32>
717 |     memref.dealloc %alloc_3 : memref<64x64x112x112xf32>
718 |     memref.dealloc %alloc_6 : memref<64x128x112x112xf32>
719 |     memref.dealloc %alloc_7 : memref<64x128x112x112xf32>
720 |     memref.dealloc %alloc_8 : memref<64x128x56x56xf32>
721 |     memref.dealloc %alloc_11 : memref<64x256x56x56xf32>
722 |     memref.dealloc %alloc_12 : memref<64x256x56x56xf32>
723 |     memref.dealloc %alloc_13 : memref<64x256x56x56xf32>
724 |     memref.dealloc %alloc_16 : memref<64x256x28x28xf32>
725 |     memref.dealloc %alloc_19 : memref<64x512x28x28xf32>
726 |     memref.dealloc %alloc_20 : memref<64x512x28x28xf32>
727 |     memref.dealloc %alloc_21 : memref<64x512x28x28xf32>
728 |     memref.dealloc %alloc_24 : memref<64x512x14x14xf32>
729 |     memref.dealloc %alloc_27 : memref<64x512x14x14xf32>
730 |     memref.dealloc %alloc_28 : memref<64x512x14x14xf32>
731 |     memref.dealloc %alloc_31 : memref<64x512x7x7xf32>
732 |     memref.dealloc %alloc_32 : memref<64x512x7x7xf32>
733 |     memref.dealloc %alloc_33 : memref<25088x4096xf32>
734 |     memref.dealloc %alloc_34 : memref<64x4096xf32>
735 |     memref.dealloc %alloc_35 : memref<64x4096xf32>
736 |     memref.dealloc %alloc_36 : memref<64x4096xf32>
737 |     memref.dealloc %alloc_37 : memref<64x4096xf32>
738 |     memref.dealloc %alloc_38 : memref<4096x4096xf32>
739 |     memref.dealloc %alloc_39 : memref<64x4096xf32>
740 |     memref.dealloc %alloc_40 : memref<4096x1000xf32>
741 |     memref.dealloc %alloc_41 : memref<64x1000xf32>
742 |     memref.dealloc %alloc_42 : memref<64x1000xf32>
743 |     memref.dealloc %alloc : memref<64x3x226x226xf32>
744 |     memref.dealloc %alloc_4 : memref<64x64x114x114xf32>
745 |     memref.dealloc %alloc_9 : memref<64x128x58x58xf32>
746 |     memref.dealloc %alloc_14 : memref<64x256x58x58xf32>
747 |     memref.dealloc %alloc_17 : memref<64x256x30x30xf32>
748 |     memref.dealloc %alloc_22 : memref<64x512x30x30xf32>
749 |     memref.dealloc %alloc_25 : memref<64x512x16x16xf32>
750 |     memref.dealloc %alloc_29 : memref<64x512x16x16xf32>
751 |     return %alloc_41 : memref<64x1000xf32>
752 |   }
753 | }
754 | 


--------------------------------------------------------------------------------
/benchmark.sh:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env bash
  2 | 
  3 | # Function to display help menu
  4 | display_help() {
  5 |     echo "Usage: $0 [benchmark_type] [ML_model] [opt_flag] [PAPI_event_name]"
  6 |     echo
  7 |     echo "Arguments:"
  8 |     echo "  benchmark_type   Type of benchmark to run (e.g., GB for Google Benchmark, PAPI for PAPI-based, chrono for C++ chrono)"
  9 |     echo "  ML_model         Name of the machine learning model to benchmark (e.g., Alexnet, ResNet50, etc.)"
 10 |     echo "  opt_flag         Flag to run the custom optimization pass (e.g., --affine-unroll for enabling optimization)"
 11 |     echo "  PAPI_event_name  Name of the PAPI event to monitor (e.g., PAPI_TOT_CYC for total cycles) only for PAPI-based benchmark"
 12 |     echo
 13 |     echo "Example:"
 14 |     echo "  $0 GB Alexnet --affine-64-unroll"
 15 |     echo
 16 |     exit 0
 17 | }
 18 | 
 19 | # Check if help is requested
 20 | if [[ "$1" == "--help" || "$1" == "-h" ]]; then
 21 |     display_help
 22 | fi
 23 | 
 24 | # Check if all three arguments are provided
 25 | if [ $# -ne 3 ]; then
 26 |     echo "Error: Missing arguments."
 27 |     display_help
 28 | fi
 29 | 
 30 | # Check if 4th argument is provided for PAPI-based benchmark
 31 | if [ "$1" == "PAPI" ] && [ $# -ne 4 ]; then
 32 |     echo "Error: Missing PAPI event name."
 33 |     display_help
 34 | fi
 35 | 
 36 | # Assigning arguments to variables for readability
 37 | BENCHMARK_TYPE=$1
 38 | ML_MODEL=$2
 39 | OPT_FLAG=$3
 40 | PAPI_EVENT_NAME=$4
 41 | PROJECT_OPT="$(pwd)/build-ninja/tools/project-opt"
 42 | MODIFIED_MLIR="$(pwd)/Torch_MLIR/modified/modified.mlir"
 43 | MLIR_OBJ_PY="$(pwd)/make_MLIR_obj.py"
 44 | MODIFIED_MLIR="$(pwd)/Torch_MLIR/modified/modified.mlir"
 45 | MODIFIED_OBJ_FOLDER="$(pwd)/benchmarks/mlir_obj/"
 46 | ORACLE_MLIR_OBJ_FOLDER="$(pwd)/Torch_MLIR/pytorch/"
 47 | ML_MODEL_MLIR="${ORACLE_MLIR_OBJ_FOLDER}$(echo ${ML_MODEL}| tr '[:upper:]' '[:lower:]').mlir"
 48 | BUILD_DIR="$(pwd)/build-ninja/"
 49 | BENCHMARK_DIR="${BUILD_DIR}/benchmarks/"
 50 | GOOGLE_BENCHMARK="${BENCHMARK_DIR}/GoogleBenchmarks/"
 51 | HC_BENCHMARK="${BENCHMARK_DIR}/Hardware_Counters_or_Time/"
 52 | 
 53 | echo "Compiling your optimized mlir to an object file mlir_oracle : ${ML_MODEL_MLIR} "
 54 | echo "=========== Running your pass ${OPT_FLAG} on ${ML_MODEL_MLIR} =================="
 55 | echo " ${PROJECT_OPT} ${OPT_FLAG} ${ML_MODEL_MLIR} -o ${MODIFIED_MLIR} "
 56 | $PROJECT_OPT $OPT_FLAG $ML_MODEL_MLIR -o $MODIFIED_MLIR
 57 | #make your modified mlir obj
 58 | echo "============= Compiling Required MLIR files ${ML_MODEL_MLIR} ==================="
 59 | $MLIR_OBJ_PY $MODIFIED_MLIR $MODIFIED_OBJ_FOLDER
 60 | cmake --build $BUILD_DIR --target run_bench_Time_Modified
 61 | cmake --build $BUILD_DIR --target run_bench_HC_Modified
 62 | cmake --build $BUILD_DIR --target run_bench_GB_Modified
 63 | cmake --build $BUILD_DIR --target run_bench_GB_$ML_MODEL
 64 | cmake --build $BUILD_DIR --target run_bench_Time_$ML_MODEL
 65 | cmake --build $BUILD_DIR --target run_bench_HC_$ML_MODEL
 66 | 
 67 | # Implement logic for each benchmark type
 68 | case $BENCHMARK_TYPE in
 69 |     GB)
 70 |         echo "Running Google Benchmark for $ML_MODEL with flag: $OPT_FLAG"
 71 |         echo "========================== Orginal with O0 ====================================="
 72 |         $GOOGLE_BENCHMARK/run_bench_GB_$ML_MODEL --benchmark_time_unit=s
 73 |         echo "======================= After Transformation ==================================="
 74 |         $GOOGLE_BENCHMARK/run_bench_GB_Modified --benchmark_time_unit=s
 75 |         # Add your command to run Google Benchmark with the specified ML model and opt flag
 76 |         ;;
 77 |     PAPI)
 78 |         echo "Running PAPI Benchmark for $ML_MODEL with flag: $OPT_FLAG"
 79 |         echo "========================== Orginal with O0 ====================================="
 80 |         $HC_BENCHMARK/run_bench_HC_$ML_MODEL PAPI_EVENT_NAME=$PAPI_EVENT_NAME
 81 |         echo "======================= After Transformation ==================================="
 82 |         $HC_BENCHMARK/run_bench_HC_Modified PAPI_EVENT_NAME=$PAPI_EVENT_NAME
 83 |         # Add your command to run PAPI-based benchmark with the specified ML model and opt flag
 84 |         # e.g., ./run_bench_PAPI $ML_MODEL $OPT_FLAG
 85 |         ;;
 86 |     chrono)
 87 |         echo "Running Chrono-based Benchmark for $ML_MODEL with flag: $OPT_FLAG"
 88 |         echo "========================== Orginal with O0 ====================================="
 89 |         $HC_BENCHMARK/run_bench_Time_$ML_MODEL
 90 |         echo "======================= After Transformation ==================================="
 91 |         $HC_BENCHMARK/run_bench_Time_Modified
 92 |         # Add your command to run chrono-based benchmark with the specified ML model and opt flag
 93 |         # e.g., ./run_bench_chrono $ML_MODEL $OPT_FLAG
 94 |         ;;
 95 |     *)
 96 |         echo "Error: Invalid benchmark type."
 97 |         display_help
 98 |         ;;
 99 | esac
100 | 


--------------------------------------------------------------------------------
/benchmarks/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(GoogleBenchmarks)
2 | add_subdirectory(Hardware_Counters_or_Time)
3 | 


--------------------------------------------------------------------------------
/benchmarks/GoogleBenchmarks/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(SUPPORTED_MODELS "Alexnet" "Resnet" "VGG" "Modified")
 2 | 
 3 | set(ML_MODEL "" CACHE STRING "Choose the ml model to link with run_bench_GB")
 4 | 
 5 | if(ML_MODEL STREQUAL "")
 6 | set(ML_MODEL "Modified" )
 7 | message(STATUS "No model specified. Defaulting to: MODIFIED")
 8 | endif()
 9 | 
10 | if(NOT ML_MODEL IN_LIST SUPPORTED_MODELS)
11 | message(FATAL_ERROR "Invalid model: ${ML_MODEL}. Supported models are: ${SUPPORTED_MODELS}")
12 | endif()
13 | 
14 | # add_executable (run_bench_GB "run_bench.cpp")
15 | add_library(libmlir_c_runner_utils SHARED IMPORTED)
16 | set_target_properties(libmlir_c_runner_utils PROPERTIES IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/External/llvm-project/build/lib/libmlir_c_runner_utils.so")
17 | 
18 | 
19 | add_executable (run_bench_GB_Alexnet "run_bench.cpp")
20 | target_compile_options(run_bench_GB_Alexnet PRIVATE -O0)
21 | target_sources(run_bench_GB_Alexnet PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o")
22 | target_link_libraries(run_bench_GB_Alexnet benchmark::benchmark libmlir_c_runner_utils)
23 | 
24 | add_executable (run_bench_GB_Resnet152 "run_bench.cpp")
25 | target_compile_options(run_bench_GB_Resnet152 PRIVATE -O0)
26 | target_sources(run_bench_GB_Resnet152 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o")
27 | target_link_libraries(run_bench_GB_Resnet152 benchmark::benchmark libmlir_c_runner_utils)
28 | 
29 | add_executable (run_bench_GB_Vgg11 "run_bench.cpp")
30 | target_compile_options(run_bench_GB_Vgg11 PRIVATE -O0)
31 | target_sources(run_bench_GB_Vgg11 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o")
32 | target_link_libraries(run_bench_GB_Vgg11 benchmark::benchmark libmlir_c_runner_utils)
33 | 
34 | add_executable (run_bench_GB_Modified "run_bench.cpp")
35 | target_compile_options(run_bench_GB_Modified PRIVATE -O0)
36 | target_sources(run_bench_GB_Modified PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o")
37 | target_link_libraries(run_bench_GB_Modified benchmark::benchmark libmlir_c_runner_utils)
38 | 
39 | # set_target_properties(run_bench_GB PROPERTIES INTERPROCEDURAL_OPTIMIZATION FALSE)
40 | 
41 | message(STATUS "Compiling done for Google Benchmarks")


--------------------------------------------------------------------------------
/benchmarks/GoogleBenchmarks/run_bench.cpp:
--------------------------------------------------------------------------------
 1 | // run_bench.cpp : Defines the entry point for the application.
 2 | //
 3 | 
 4 | #include <benchmark/benchmark.h>
 5 | extern "C" void forward();
 6 | static void BM_forward(benchmark::State &state) {
 7 |   for (auto _ : state) {
 8 |     forward();
 9 |   }
10 | }
11 | BENCHMARK(BM_forward);
12 | BENCHMARK_MAIN();


--------------------------------------------------------------------------------
/benchmarks/Hardware_Counters_or_Time/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | set(SUPPORTED_MODELS "Alexnet" "Resnet" "VGG" "Modified")
 2 | set(SUPPORTED_INST_TYPE "Time" "PAPI")
 3 | 
 4 | set(ML_MODEL "" CACHE STRING "Choose the ml model to link with run_bench_HC")
 5 | 
 6 | if(ML_MODEL STREQUAL "")
 7 | set(ML_MODEL "Modified")
 8 | message(STATUS "No model specified. Defaulting to: MODIFIED")
 9 | endif()
10 | 
11 | if(NOT ML_MODEL IN_LIST SUPPORTED_MODELS)
12 | message(FATAL_ERROR "Invalid model: ${ML_MODEL}. Supported models are: ${SUPPORTED_MODELS}")
13 | endif()
14 | 
15 | set(INST_TYPE "" CACHE STRING "Choose choose instrumentation type for run_bench_HC")
16 | if(INST_TYPE STREQUAL "")
17 | set(INST_TYPE "Time")
18 | message(STATUS "No model specified. Defaulting to: Time")
19 | endif()
20 | 
21 | if(NOT INST_TYPE IN_LIST SUPPORTED_INST_TYPE)
22 | message(FATAL_ERROR "Invalid model: ${INST_TYPE}. Supported models are: ${SUPPORTED_INST_TYPE}")
23 | endif()
24 | 
25 | add_library(libmlir_c_runner_utils SHARED IMPORTED)
26 | set_target_properties(libmlir_c_runner_utils PROPERTIES IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/External/llvm-project/build/lib/libmlir_c_runner_utils.so")
27 | 
28 | # PAPI instrumentation
29 | add_executable (run_bench_HC_Alexnet "run_bench.cpp")
30 | target_compile_definitions(run_bench_HC_Alexnet PRIVATE -DPAPI_INST__ -DHUMAN_READABLE)
31 | target_sources(run_bench_HC_Alexnet PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o")
32 | target_link_libraries(run_bench_HC_Alexnet libmlir_c_runner_utils papi)
33 | 
34 | add_executable (run_bench_HC_Resnet152 "run_bench.cpp")
35 | target_compile_definitions(run_bench_HC_Resnet152 PRIVATE -DPAPI_INST__ -DHUMAN_READABLE)
36 | target_sources(run_bench_HC_Resnet152 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o")
37 | target_link_libraries(run_bench_HC_Resnet152 libmlir_c_runner_utils papi)
38 | 
39 | add_executable (run_bench_HC_Vgg11 "run_bench.cpp")
40 | target_compile_definitions(run_bench_HC_Vgg11 PRIVATE -DPAPI_INST__ -DHUMAN_READABLE)
41 | target_sources(run_bench_HC_Vgg11 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o")
42 | target_link_libraries(run_bench_HC_Vgg11 libmlir_c_runner_utils papi)
43 | 
44 | add_executable (run_bench_HC_Modified "run_bench.cpp")
45 | target_compile_definitions(run_bench_HC_Modified PRIVATE -DPAPI_INST__ -DHUMAN_READABLE)
46 | target_sources(run_bench_HC_Modified PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o")
47 | target_link_libraries(run_bench_HC_Modified libmlir_c_runner_utils papi)
48 | 
49 | # Time instrumentation
50 | add_executable (run_bench_Time_Alexnet "run_bench.cpp")
51 | target_compile_definitions(run_bench_Time_Alexnet PRIVATE -DTIME_INST__ -DHUMAN_READABLE)
52 | target_sources(run_bench_Time_Alexnet PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o")
53 | target_link_libraries(run_bench_Time_Alexnet libmlir_c_runner_utils)
54 | 
55 | add_executable (run_bench_Time_Resnet152 "run_bench.cpp")
56 | target_compile_definitions(run_bench_Time_Resnet152 PRIVATE -DTIME_INST__ -DHUMAN_READABLE)
57 | target_sources(run_bench_Time_Resnet152 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o")
58 | target_link_libraries(run_bench_Time_Resnet152 libmlir_c_runner_utils)
59 | 
60 | add_executable (run_bench_Time_Vgg11 "run_bench.cpp")
61 | target_compile_definitions(run_bench_Time_Vgg11 PRIVATE -DTIME_INST__ -DHUMAN_READABLE)
62 | target_sources(run_bench_Time_Vgg11 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o")
63 | target_link_libraries(run_bench_Time_Vgg11 libmlir_c_runner_utils)
64 | 
65 | add_executable (run_bench_Time_Modified "run_bench.cpp")
66 | target_compile_definitions(run_bench_Time_Modified PRIVATE -DTIME_INST__ -DHUMAN_READABLE)
67 | target_sources(run_bench_Time_Modified PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o")
68 | target_link_libraries(run_bench_Time_Modified libmlir_c_runner_utils)
69 | 
70 | 
71 | message(STATUS "Compiled run_bench_HC_* and run_bench_Tsime_* ")


--------------------------------------------------------------------------------
/benchmarks/Hardware_Counters_or_Time/run_bench.cpp:
--------------------------------------------------------------------------------
 1 | #include "run_bench.h"
 2 | #include <cstdio>
 3 | #include <iostream>
 4 | #include <stdio.h>
 5 | 
 6 | extern "C" void forward();
 7 | 
 8 | int main() {
 9 |   start_instrumentaion;
10 |   forward();
11 |   stop_instrumentation;
12 |   print_instruments;
13 |   return 0;
14 | }


--------------------------------------------------------------------------------
/benchmarks/Hardware_Counters_or_Time/run_bench.h:
--------------------------------------------------------------------------------
  1 | #ifndef PAPI_TIME_INST
  2 | #define PAPI_TIME_INST
  3 | 
  4 | #include <chrono>
  5 | #include <cstdio>
  6 | #include <cstdlib>
  7 | #include <iostream>
  8 | #include <stddef.h>
  9 | #include <stdint.h>
 10 | 
 11 | #define start_instrumentaion
 12 | #define stop_instrumentation
 13 | #define print_instruments
 14 | 
 15 | #ifdef PAPI_INST__
 16 | #include <papi.h>
 17 | 
 18 | long long int papi_event_value = 0;
 19 | int eventset = 0;
 20 | const char *env_var_name = "PAPI_EVENT_NAME";
 21 | const char *papi_event_name = getenv(env_var_name);
 22 | int retval = 0;
 23 | 
 24 | void papi_init() {
 25 |   retval = PAPI_library_init(PAPI_VER_CURRENT);
 26 |   if (retval != PAPI_VER_CURRENT) {
 27 |     std::cerr << "Error initializing PAPI! " << PAPI_strerror(retval)
 28 |               << std::endl;
 29 |     exit(1);
 30 |   }
 31 | }
 32 | 
 33 | void create_event_set() {
 34 |   eventset = PAPI_NULL;
 35 |   // papi creating event set
 36 |   retval = PAPI_create_eventset(&eventset);
 37 |   if (retval != PAPI_OK) {
 38 |     std::cerr << "Error creating eventset! " << PAPI_strerror(retval)
 39 |               << std::endl;
 40 |   }
 41 |   // papi adding event set
 42 |   retval = PAPI_add_named_event(eventset, papi_event_name);
 43 |   if (retval != PAPI_OK) {
 44 |     std::cerr << "Error adding " << papi_event_name << ": "
 45 |               << PAPI_strerror(retval) << std::endl;
 46 |   }
 47 | }
 48 | 
 49 | void papi_start() {
 50 |   PAPI_reset(eventset);
 51 |   retval = PAPI_start(eventset);
 52 |   if (retval != PAPI_OK) {
 53 |     std::cerr << "Error PAPI: " << PAPI_strerror(retval) << std::endl;
 54 |   }
 55 | }
 56 | 
 57 | void papi_stop() {
 58 |   retval = PAPI_stop(eventset, &papi_event_value);
 59 |   if (retval != PAPI_OK) {
 60 |     std::cerr << "Error stopping: " << PAPI_strerror(retval) << std::endl;
 61 |   }
 62 | }
 63 | 
 64 | void print_papi() {
 65 | #ifdef HUMAN_READABLE
 66 |   fprintf(stderr, "Measured %s event %lld times\n", papi_event_name,
 67 |           papi_event_value);
 68 | #endif
 69 |   fprintf(stdout, "%lld", papi_event_value);
 70 | }
 71 | 
 72 | #undef start_instrumentaion
 73 | #undef stop_instrumentation
 74 | #undef print_instruments
 75 | 
 76 | #define start_instrumentaion                                                   \
 77 |   papi_init();                                                                 \
 78 |   create_event_set();                                                          \
 79 |   papi_start();
 80 | 
 81 | #define stop_instrumentation papi_stop();
 82 | 
 83 | #define print_instruments print_papi();
 84 | 
 85 | #endif
 86 | 
 87 | #ifdef TIME_INST__
 88 | 
 89 | double time_reading;
 90 | double time_reading_ns;
 91 | std::chrono::time_point<std::chrono::high_resolution_clock> start_time_counter;
 92 | std::chrono::time_point<std::chrono::high_resolution_clock> end_time_counter;
 93 | 
 94 | void start_time() {
 95 |   start_time_counter = std::chrono::high_resolution_clock::now();
 96 | }
 97 | 
 98 | void end_time() {
 99 |   end_time_counter = std::chrono::high_resolution_clock::now();
100 | }
101 | 
102 | void print_time() {
103 |   std::chrono::duration<double> timing = end_time_counter - start_time_counter;
104 |   time_reading_ns =
105 |       std::chrono::duration_cast<std::chrono::nanoseconds>(timing).count();
106 |   time_reading = time_reading_ns / 1000000000;
107 | #ifdef HUMAN_READABLE
108 |   std::cerr << "Measured Time : " << time_reading << " seconds" << std::endl;
109 | #endif
110 |   std::cerr << time_reading << std::endl;
111 | }
112 | 
113 | #undef start_instrumentaion
114 | #undef stop_instrumentation
115 | #undef print_instruments
116 | 
117 | #define start_instrumentaion start_time();
118 | 
119 | #define stop_instrumentation end_time();
120 | 
121 | #define print_instruments print_time();
122 | 
123 | #endif
124 | 
125 | #endif /* PAPI_TIME_INST */


--------------------------------------------------------------------------------
/build_llvm.sh:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env bash
 2 | 
 3 | BUILD_SYSTEM=Ninja
 4 | BUILD_TAG=ninja
 5 | THIRDPARTY_LLVM_DIR=$PWD/External/llvm-project
 6 | BUILD_DIR=$THIRDPARTY_LLVM_DIR/build
 7 | INSTALL_DIR=$THIRDPARTY_LLVM_DIR/install
 8 | 
 9 | # rm -rf $BUILD_DIR
10 | mkdir -p $BUILD_DIR
11 | mkdir -p $INSTALL_DIR
12 | 
13 | pushd $BUILD_DIR
14 | 
15 | cmake ../llvm -G $BUILD_SYSTEM \
16 |       -DCMAKE_CXX_COMPILER=clang++ \
17 |       -DCMAKE_C_COMPILER=clang \
18 |       -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR \
19 |       -DLLVM_LOCAL_RPATH=$INSTALL_DIR/lib \
20 |       -DLLVM_PARALLEL_COMPILE_JOBS=12 \
21 |       -DLLVM_PARALLEL_LINK_JOBS=6 \
22 |       -DLLVM_BUILD_EXAMPLES=OFF \
23 |       -DLLVM_INSTALL_UTILS=ON \
24 |       -DCMAKE_OSX_ARCHITECTURES="$(uname -m)" \
25 |       -DCMAKE_BUILD_TYPE=Release \
26 |       -DLLVM_ENABLE_ASSERTIONS=ON \
27 |       -DLLVM_CCACHE_BUILD=ON \
28 |       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
29 |       -DLLVM_ENABLE_PROJECTS='mlir' \
30 | 
31 | 
32 | cmake --build . --target check-mlir
33 | cmake --build . --target mlir-libraries
34 | cmake --build . --target llc
35 | 
36 | popd


--------------------------------------------------------------------------------
/build_mlir.sh:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env bash
 2 | 
 3 | BUILD_SYSTEM="Ninja"
 4 | BUILD_DIR=./build-`echo ${BUILD_SYSTEM}| tr '[:upper:]' '[:lower:]'`
 5 | echo "build dir ${BUILD_DIR}"
 6 | rm -rf $BUILD_DIR
 7 | mkdir $BUILD_DIR
 8 | pushd $BUILD_DIR
 9 | 
10 | LLVM_BUILD_DIR=External/llvm-project/build
11 | cmake -G $BUILD_SYSTEM .. \
12 |     -DLLVM_DIR="$LLVM_BUILD_DIR/lib/cmake/llvm" \
13 |     -DMLIR_DIR="$LLVM_BUILD_DIR/lib/cmake/mlir" \
14 |     -DBUILD_DEPS="ON" \
15 |     -DBUILD_SHARED_LIBS="OFF" \
16 |     -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
17 |     -DCMAKE_BUILD_TYPE=Debug
18 | 
19 | popd
20 | 
21 | cmake --build $BUILD_DIR --target project-opt
22 | 


--------------------------------------------------------------------------------
/build_obj.sh:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env bash
 2 | if [ $# -ne 1 ]; then
 3 |     echo "Error: Missing arguments."
 4 |     echo "Usage: $0 [CMAKE_SOURCE_DIR]"
 5 |     exit 1
 6 | fi
 7 | CMAKE_SOURCE_DIR=$1
 8 | cd $CMAKE_SOURCE_DIR
 9 | python3 ${CMAKE_SOURCE_DIR}/make_MLIR_obj.py ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/ ${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/ &> /dev/null
10 | echo "Compiled the oracle MLIR files to object files"
11 | 


--------------------------------------------------------------------------------
/example_output.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mvvsmk/OptML/32b88924bfd333a235c718495935883b2a638094/example_output.png


--------------------------------------------------------------------------------
/include/Transform/Affine/Affine64Unroll.h:
--------------------------------------------------------------------------------
 1 | #ifndef LIB_TRANSFORM_AFFINE_AFFINEFULLUNROLL_H_
 2 | #define LIB_TRANSFORM_AFFINE_AFFINEFULLUNROLL_H_
 3 | 
 4 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 5 | #include "mlir/Dialect/Func/IR/FuncOps.h"
 6 | #include "mlir/include/mlir/Pass/Pass.h"
 7 | 
 8 | namespace mlir {
 9 | namespace project {
10 | 
11 | class Affine64UnrollPass
12 |     : public PassWrapper<Affine64UnrollPass,
13 |                          OperationPass<mlir::func::FuncOp>> {
14 | private:
15 |   void runOnOperation() override;
16 | 
17 |   StringRef getArgument() const final { return "affine-64-unroll"; }
18 | 
19 |   StringRef getDescription() const final {
20 |     return "Unroll loop if it is at a loop depth of 3 or more with a factor of "
21 |            "64";
22 |   }
23 | };
24 | 
25 | } // namespace project
26 | } // namespace mlir
27 | 
28 | #endif // LIB_TRANSFORM_AFFINE_AFFINEFULLUNROLL_H_


--------------------------------------------------------------------------------
/include/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.h:
--------------------------------------------------------------------------------
 1 | #ifndef LIB_TRANSFORM_MAKE_RUN_ABLE_RFFAAR_H_
 2 | #define LIB_TRANSFORM_MAKE_RUN_ABLE_RFFAAR_H_
 3 | 
 4 | #include "include/Transform/MakeRunAble/RemoveGlobalConstants.h"
 5 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 6 | #include "mlir/Dialect/Func/IR/FuncOps.h"
 7 | #include "mlir/Dialect/MemRef/IR/MemRef.h"
 8 | #include "mlir/IR/BuiltinOps.h"
 9 | #include "mlir/include/mlir/Pass/Pass.h"
10 | 
11 | namespace mlir {
12 | namespace project {
13 | 
14 | class RemoveForwardFuncArgsAndReturn
15 |     : public PassWrapper<RemoveForwardFuncArgsAndReturn,
16 |                          OperationPass<mlir::func::FuncOp>> {
17 | private:
18 |   void runOnOperation() override;
19 | 
20 |   StringRef getArgument() const final {
21 |     return "rem-forward-func-args-and-return-run-mlir";
22 |   }
23 | 
24 |   StringRef getDescription() const final {
25 |     return "Removes forward function's arguments and return type so that both "
26 |            "of them are void";
27 |   }
28 | };
29 | 
30 | } // namespace project
31 | } // namespace mlir
32 | 
33 | #endif // LIB_TRANSFORM_MAKE_RUN_ABLE_RFFAAR_H_


--------------------------------------------------------------------------------
/include/Transform/MakeRunAble/RemoveGlobalConstants.h:
--------------------------------------------------------------------------------
 1 | #ifndef LIB_TRANSFORM_MAKE_RUN_ABLE_RGC_H_
 2 | #define LIB_TRANSFORM_MAKE_RUN_ABLE_RGC_H_
 3 | 
 4 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 5 | #include "mlir/Dialect/Func/IR/FuncOps.h"
 6 | #include "mlir/Dialect/MemRef/IR/MemRef.h"
 7 | #include "mlir/IR/BuiltinOps.h"
 8 | #include "mlir/include/mlir/Pass/Pass.h"
 9 | 
10 | namespace mlir {
11 | namespace project {
12 | 
13 | class RemoveGlobalConstants
14 |     : public PassWrapper<RemoveGlobalConstants, OperationPass<mlir::ModuleOp>> {
15 | private:
16 |   void runOnOperation() override;
17 | 
18 |   StringRef getArgument() const final {
19 |     return "rem-global-constants-run-mlir";
20 |   }
21 | 
22 |   StringRef getDescription() const final {
23 |     return "Removes global constants and moves them to the forward function";
24 |   }
25 | };
26 | 
27 | } // namespace project
28 | } // namespace mlir
29 | 
30 | #endif // LIB_TRANSFORM_MAKE_RUN_ABLE_RGC_H_


--------------------------------------------------------------------------------
/include/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.h:
--------------------------------------------------------------------------------
 1 | #ifndef LIB_TRANSFORM_MAKE_RUN_ABLE_ZIRFFAAR_H_
 2 | #define LIB_TRANSFORM_MAKE_RUN_ABLE_ZIRFFAAR_H_
 3 | 
 4 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 5 | #include "mlir/Dialect/Func/IR/FuncOps.h"
 6 | #include "mlir/Dialect/MemRef/IR/MemRef.h"
 7 | #include "mlir/IR/BuiltinOps.h"
 8 | #include "mlir/include/mlir/Pass/Pass.h"
 9 | 
10 | namespace mlir {
11 | namespace project {
12 | 
13 | class ZeroInitRemoveForwardFuncArgsAndReturn
14 |     : public PassWrapper<ZeroInitRemoveForwardFuncArgsAndReturn,
15 |                          OperationPass<mlir::func::FuncOp>> {
16 | private:
17 |   void runOnOperation() override;
18 | 
19 |   StringRef getArgument() const final {
20 |     return "rem-forward-func-args-and-return-run-mlir-zero-init";
21 |   }
22 | 
23 |   StringRef getDescription() const final {
24 |     return "Removes forward function's arguments and return type so that both "
25 |            "of them are void";
26 |   }
27 | };
28 | 
29 | } // namespace project
30 | } // namespace mlir
31 | 
32 | #endif // LIB_TRANSFORM_MAKE_RUN_ABLE_ZIRFFAAR_H_


--------------------------------------------------------------------------------
/lib/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # add_subdirectory(Conversion)
2 | # add_subdirectory(Dialect)
3 | # add_subdirectory(Analysis)
4 | add_subdirectory(Transform)
5 | 


--------------------------------------------------------------------------------
/lib/Transform/Affine/Affine64Unroll.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/Transform/Affine/Affine64Unroll.h"
 2 | #include "mlir/Dialect/Affine/Analysis/Utils.h"
 3 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 4 | #include "mlir/Dialect/Affine/LoopFusionUtils.h"
 5 | #include "mlir/Dialect/Affine/LoopUtils.h"
 6 | #include "mlir/IR/PatternMatch.h"
 7 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 8 | #include "llvm/Support/LogicalResult.h"
 9 | 
10 | namespace mlir {
11 | namespace project {
12 | 
13 | void Affine64UnrollPass::runOnOperation() {
14 |   getOperation()->walk([&](affine::AffineForOp op) {
15 |     affine::LoopNestStats *stats;
16 |     if (affine::getNestingDepth(op) > 2)
17 |       if (llvm::failed(affine::loopUnrollUpToFactor(op, 64))) {
18 |         op->emitError("Unrooling failed");
19 |         signalPassFailure();
20 |       }
21 |   });
22 | }
23 | 
24 | } // namespace project
25 | } // namespace mlir


--------------------------------------------------------------------------------
/lib/Transform/Affine/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_mlir_library(Affine64Unroll
 2 |     Affine64Unroll.cpp
 3 |     # AffineFullUnrollPatternRewrite.cpp
 4 | 
 5 |     ${PROJECT_SOURCE_DIR}/include/Transform/Affine/
 6 |     ADDITIONAL_HEADER_DIRS
 7 | 
 8 |     LINK_LIBS PUBLIC
 9 | )
10 | 
11 | # set(LLVM_TARGET_DEFINITIONS Passes.td)
12 | # mlir_tablegen(Passes.h.inc -gen-pass-decls -name Affine)
13 | # add_public_tablegen_target(MLIRAffineFullUnrollPasses)
14 | # add_mlir_doc(Passes AffinePasses ./ -gen-pass-doc)
15 | 


--------------------------------------------------------------------------------
/lib/Transform/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_subdirectory(Affine)
2 | add_subdirectory(MakeRunAble)
3 | # add_subdirectory(Arith)
4 | # add_subdirectory(Noisy)
5 | 


--------------------------------------------------------------------------------
/lib/Transform/MakeRunAble/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | add_mlir_library(MakeRunAble
 2 |     RemoveGlobalConstants.cpp
 3 |     ZeroInitRemoveForwardFuncArgsAndReturn.cpp
 4 |     RemoveForwardFuncArgsAndReturn.cpp
 5 |     ${PROJECT_SOURCE_DIR}/include/Transform/MakeRunAble/
 6 |     ADDITIONAL_HEADER_DIRS
 7 | 
 8 |     LINK_LIBS PUBLIC
 9 | )
10 | 


--------------------------------------------------------------------------------
/lib/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.h"
 2 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 3 | #include "mlir/Dialect/Affine/LoopUtils.h"
 4 | #include "mlir/Dialect/Func/IR/FuncOps.h"
 5 | #include "mlir/Dialect/MemRef/IR/MemRef.h"
 6 | #include "mlir/IR/Attributes.h"
 7 | #include "mlir/IR/Builders.h"
 8 | #include "mlir/IR/BuiltinAttributes.h"
 9 | #include "mlir/IR/BuiltinOps.h"
10 | #include "mlir/IR/BuiltinTypes.h"
11 | #include "mlir/IR/MLIRContext.h"
12 | #include "mlir/IR/PatternMatch.h"
13 | #include "mlir/IR/Value.h"
14 | #include "mlir/Support/LLVM.h"
15 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
16 | #include "llvm/ADT/SmallVector.h"
17 | #include "llvm/ADT/StringRef.h"
18 | #include "llvm/Support/Error.h"
19 | #include "llvm/Support/LogicalResult.h"
20 | #include "llvm/Support/raw_ostream.h"
21 | #include "llvm/Transforms/IPO/Attributor.h"
22 | 
23 | namespace mlir {
24 | namespace project {
25 | 
26 | void RemoveForwardFuncArgsAndReturn::runOnOperation() {
27 |   func::FuncOp func = getOperation();
28 |   mlir::MLIRContext *ctx = func->getContext();
29 |   mlir::OpBuilder builder(ctx);
30 |   auto fuctionblock = &func.getBody().front();
31 |   builder.setInsertionPointToStart(fuctionblock);
32 |   // llvm::errs() << "\n" << func->getName() << "\n";
33 |   mlir::SmallVector<BlockArgument, 5> argsvector;
34 |   auto args = func.getArguments();
35 | 
36 |   mlir::DenseMap<unsigned int, Value> argMap;
37 |   for (auto arg : args) {
38 |     auto memrefType = mlir::cast<MemRefType>(arg.getType());
39 |     // llvm::errs() << "-> " << arg.getArgNumber() << " moved \n";
40 |     auto allocOp = builder.create<memref::AllocOp>(arg.getLoc(), memrefType);
41 |     argMap[arg.getArgNumber()] = allocOp;
42 |     arg.replaceAllUsesWith(allocOp);
43 |     fuctionblock->eraseArgument(arg.getArgNumber());
44 |   }
45 |   auto newFuncType = FunctionType::get(ctx, {}, {});
46 |   func.setType(newFuncType);
47 |   func.walk([&](func::ReturnOp ret) {
48 |     OpBuilder retbuilder(ret);
49 |     retbuilder.create<func::ReturnOp>(ret->getLoc());
50 |     ret->erase();
51 |   });
52 | 
53 |   return;
54 | }
55 | 
56 | } // namespace project
57 | } // namespace mlir


--------------------------------------------------------------------------------
/lib/Transform/MakeRunAble/RemoveGlobalConstants.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/Transform/MakeRunAble/RemoveGlobalConstants.h"
 2 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 3 | #include "mlir/Dialect/Affine/LoopUtils.h"
 4 | #include "mlir/Dialect/Func/IR/FuncOps.h"
 5 | #include "mlir/Dialect/MemRef/IR/MemRef.h"
 6 | #include "mlir/IR/Attributes.h"
 7 | #include "mlir/IR/Builders.h"
 8 | #include "mlir/IR/BuiltinAttributes.h"
 9 | #include "mlir/IR/BuiltinOps.h"
10 | #include "mlir/IR/BuiltinTypes.h"
11 | #include "mlir/IR/MLIRContext.h"
12 | #include "mlir/IR/PatternMatch.h"
13 | #include "mlir/IR/Value.h"
14 | #include "mlir/Support/LLVM.h"
15 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
16 | #include "llvm/ADT/StringRef.h"
17 | #include "llvm/Support/Error.h"
18 | #include "llvm/Support/LogicalResult.h"
19 | #include "llvm/Support/raw_ostream.h"
20 | 
21 | namespace mlir {
22 | namespace project {
23 | 
24 | void RemoveGlobalConstants::runOnOperation() {
25 |   ModuleOp module = getOperation();
26 |   mlir::SmallVector<memref::GlobalOp, 5> global_constants;
27 |   mlir::MLIRContext *ctx = module->getContext();
28 |   mlir::OpBuilder builder(ctx);
29 |   module->walk([&](mlir::memref::GlobalOp globalop) {
30 |     global_constants.push_back(globalop);
31 |   });
32 |   if (global_constants.empty()) {
33 |     return;
34 |   }
35 |   func::FuncOp forwardFunctionDef =
36 |       module.lookupSymbol<mlir::func::FuncOp>("forward");
37 |   if (forwardFunctionDef) {
38 |     auto entryblock = &forwardFunctionDef.getBody().front();
39 |     builder.setInsertionPointToStart(entryblock);
40 |   }
41 |   mlir::DenseMap<llvm::StringRef, Value> globalMap;
42 | 
43 |   for (auto global : global_constants) {
44 |     auto memrefType = mlir::cast<MemRefType>(global.getType());
45 |     // llvm::errs() << "->" << global.getSymName() << "\n";
46 |     auto allocOp =
47 |         builder.create<memref::AllocOp>(global->getLoc(), memrefType);
48 |     globalMap[global.getSymName()] = allocOp;
49 |     global->erase();
50 |   }
51 | 
52 |   module->walk([&](memref::GetGlobalOp getGlobalOp) {
53 |     auto globalName = getGlobalOp.getName();
54 |     if (globalMap.count(globalName)) {
55 |       getGlobalOp.replaceAllUsesWith(globalMap[globalName]);
56 |       getGlobalOp.erase();
57 |     }
58 |   });
59 |   // llvm::errs() << " Replacements done "
60 |   //              << "\n";
61 |   return;
62 | }
63 | 
64 | } // namespace project
65 | } // namespace mlir


--------------------------------------------------------------------------------
/lib/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.h"
 2 | #include "mlir/Dialect/Affine/IR/AffineOps.h"
 3 | #include "mlir/Dialect/Affine/LoopUtils.h"
 4 | #include "mlir/Dialect/Arith/IR/Arith.h"
 5 | #include "mlir/Dialect/Func/IR/FuncOps.h"
 6 | #include "mlir/Dialect/MemRef/IR/MemRef.h"
 7 | #include "mlir/IR/Attributes.h"
 8 | #include "mlir/IR/Builders.h"
 9 | #include "mlir/IR/BuiltinAttributes.h"
10 | #include "mlir/IR/BuiltinOps.h"
11 | #include "mlir/IR/BuiltinTypes.h"
12 | #include "mlir/IR/Location.h"
13 | #include "mlir/IR/MLIRContext.h"
14 | #include "mlir/IR/PatternMatch.h"
15 | #include "mlir/IR/Value.h"
16 | #include "mlir/IR/ValueRange.h"
17 | #include "mlir/Support/LLVM.h"
18 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
19 | #include "llvm/ADT/ArrayRef.h"
20 | #include "llvm/ADT/SmallVector.h"
21 | #include "llvm/ADT/StringRef.h"
22 | #include "llvm/Support/Error.h"
23 | #include "llvm/Support/LogicalResult.h"
24 | #include "llvm/Support/raw_ostream.h"
25 | #include "llvm/Transforms/IPO/Attributor.h"
26 | #include <cstdint>
27 | 
28 | namespace mlir {
29 | namespace project {
30 | 
31 | void ZeroInitRemoveForwardFuncArgsAndReturn::runOnOperation() {
32 |   func::FuncOp func = getOperation();
33 |   mlir::MLIRContext *ctx = func->getContext();
34 |   mlir::OpBuilder builder(ctx);
35 |   auto functionBlock = &func.getBody().front();
36 |   builder.setInsertionPointToStart(functionBlock);
37 | 
38 |   auto args = func.getArguments();
39 |   mlir::DenseMap<unsigned int, Value> argMap;
40 | 
41 |   // Helper to create a constant index value
42 |   auto createConstantIndex = [&](int64_t val, mlir::Location argloc) -> Value {
43 |     return builder.create<arith::ConstantIndexOp>(argloc, val);
44 |   };
45 | 
46 |   for (auto arg : args) {
47 |     auto memrefType = mlir::cast<MemRefType>(arg.getType());
48 | 
49 |     // Allocate memory
50 |     auto allocOp = builder.create<memref::AllocOp>(arg.getLoc(), memrefType);
51 |     argMap[arg.getArgNumber()] = allocOp;
52 | 
53 |     // Get shape of memref to iterate over
54 |     auto shape = memrefType.getShape();
55 |     unsigned rank = memrefType.getRank();
56 |     Value zeroVal = builder.create<arith::ConstantOp>(
57 |         arg.getLoc(), builder.getZeroAttr(memrefType.getElementType()));
58 |     auto loc = allocOp->getLoc();
59 | 
60 |     SmallVector<int64_t, 4> lowerBounds(memrefType.getRank(), /*Value=*/0);
61 |     SmallVector<int64_t, 4> steps(memrefType.getRank(), /*Value=*/1);
62 |     affine::buildAffineLoopNest(
63 |         builder, loc, lowerBounds, memrefType.getShape(), steps,
64 |         [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) {
65 |           nestedBuilder.create<affine::AffineStoreOp>(loc, zeroVal, allocOp,
66 |                                                       ivs);
67 |         });
68 | 
69 |     // Replace the original argument with the allocated memory
70 |     arg.replaceAllUsesWith(allocOp);
71 |     functionBlock->eraseArgument(arg.getArgNumber());
72 |   }
73 | 
74 |   // Update the function signature to have no arguments and no return type
75 |   auto newFuncType = FunctionType::get(ctx, {}, {});
76 |   func.setType(newFuncType);
77 | 
78 |   // Replace all return operations with empty returns
79 |   func.walk([&](func::ReturnOp ret) {
80 |     OpBuilder retbuilder(ret);
81 |     retbuilder.create<func::ReturnOp>(ret->getLoc());
82 |     ret->erase();
83 |   });
84 | }
85 | 
86 | } // namespace project
87 | } // namespace mlir


--------------------------------------------------------------------------------
/make_MLIR_obj.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python3
 2 | import argparse
 3 | import os
 4 | import subprocess
 5 | import shutil
 6 | import re
 7 | from concurrent.futures import ThreadPoolExecutor
 8 | 
 9 | def convert_to_ll(source_file, 
10 |                   output_file,
11 |                   project_opt_flags = '--rem-forward-func-args-and-return-run-mlir --rem-global-constants-run-mlir',
12 |                   mlir_opt_path = os.path.abspath("./External/llvm-project/build/bin/mlir-opt"),
13 |                   project_opt_path = os.path.abspath("./build-ninja/tools/project-opt"),
14 |                   mlir_translate_path = os.path.abspath("./External/llvm-project/build/bin/mlir-translate")):
15 |     
16 |     # Run the mlir-opt command to convert the input file to LLVM IR
17 |     
18 |     # project_opt_flags = '--rem-forward-func-args-and-return-run-mlir --rem-global-constants-run-mlir'
19 |     """
20 |     ./mlir-opt --lower-affine --expand-strided-metadata  --convert-scf-to-cf --convert-cf-to-llvm --llvm-request-c-wrappers  --convert-func-to-llvm --normalize-memrefs --memref-expand --finalize-memref-to-llvm --reconcile-unrealized-casts --llvm-legalize-for-export ../../squeezenet1_0.mlir | /home/intern24005/code/compiler_builds/mlir_test_build/bin/mlir-translate --mlir-to-llvmir > squeezenet1_0.ll
21 |     """
22 |     mlir_flags = '--lower-affine --expand-strided-metadata  --convert-scf-to-cf --convert-math-to-llvm --convert-cf-to-llvm --llvm-request-c-wrappers  --convert-func-to-llvm --normalize-memrefs --memref-expand --finalize-memref-to-llvm --reconcile-unrealized-casts --llvm-legalize-for-export'
23 |     command = f"{project_opt_path} {project_opt_flags} {source_file} | {mlir_opt_path} {mlir_flags} | {mlir_translate_path} --mlir-to-llvmir > {output_file}"
24 |     print(command)
25 |     try:
26 |         subprocess.run(command, shell=True, check=True)
27 |     except subprocess.CalledProcessError as e:
28 |         print(f"Erorr: {e} while running command: {command}")
29 |     return output_file
30 | 
31 | def convert_to_obj(source_file,
32 |                    output_file,
33 |                    llc_path = os.path.abspath("./External/llvm-project/build/bin/llc")):
34 |     # Run the llc command to convert the input file to an object file
35 |     """
36 |     ./llc -filetype=obj -march=x86_64-linux-gnu squeezenet1_0.ll -o squeezenet1_0.o
37 |     """
38 |     llc_flags = '-filetype=obj --relocation-model=pic'
39 |     command = f"{llc_path} {llc_flags} {source_file} -o {output_file}"
40 |     try :
41 |         subprocess.run(command, shell=True, check=True)
42 |         if os.path.exists(source_file):
43 |             os.remove(source_file)
44 |     except subprocess.CalledProcessError as e:
45 |         print(f"Erorr: {e} while running command: {command}")
46 |     return output_file
47 | 
48 | def compile_multiple_files(path_to_file, path_to_ll_file, path_to_obj_file, mlir_flags):
49 |     #convert the file to llvm ir
50 |     ll_file = convert_to_ll(path_to_file, path_to_ll_file ,project_opt_flags=mlir_flags)
51 |     convert_to_obj(ll_file, path_to_obj_file)
52 |     print(f"Compiled {path_to_file} to {path_to_obj_file}")
53 | 
54 | def compile_to_object_from_mlir(path_to_folder_with_mlir_files,path_to_obj_folder,mlir_flags):
55 |     if os.path.isdir(path_to_folder_with_mlir_files):
56 |         with ThreadPoolExecutor(max_workers=6) as executor:
57 |             for file in os.listdir(path_to_folder_with_mlir_files):
58 |                 if file.endswith(".mlir"):
59 |                     file_name = os.path.basename(file).split(".")[0]
60 |                     
61 |                     output_file = os.path.join(path_to_obj_folder, file_name + ".o")
62 |                     ll_file = os.path.join(path_to_obj_folder, file_name + ".ll")
63 |                     executor.submit(compile_multiple_files, os.path.join(path_to_folder_with_mlir_files, file), ll_file, output_file, mlir_flags)
64 |     elif os.path.isfile(path_to_folder_with_mlir_files) :
65 |         file_name = os.path.basename(path_to_folder_with_mlir_files).split(".")[0]
66 |         output_file = os.path.join(path_to_obj_folder, file_name + ".o")
67 |         ll_file = os.path.join(path_to_obj_folder,file_name + ".ll")
68 |         compile_multiple_files(os.path.join(path_to_folder_with_mlir_files),ll_file,output_file,mlir_flags)
69 | 
70 |             
71 | if __name__ == "__main__":
72 |     parser = argparse.ArgumentParser(description='Compile MLIR files to object files')
73 |     parser.add_argument('input_folder', type=str, help='Absolute path to the folder with MLIR files or the only MLIR file')
74 |     parser.add_argument('output_folder', type=str, help='Absolute path to the folder where object files will be stored')
75 |     parser.add_argument('--mlir-flags', required=False,default="--rem-forward-func-args-and-return-run-mlir-zero-init --rem-global-constants-run-mlir" ,type=str, help='Flags to be passed to project-opt')
76 |     args = parser.parse_args()
77 |     compile_to_object_from_mlir(args.input_folder, args.output_folder,args.mlir_flags)
78 | 


--------------------------------------------------------------------------------
/tests/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # someting too add here.


--------------------------------------------------------------------------------
/tools/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
 2 | get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
 3 | 
 4 | set (LIBS
 5 |     ${dialect_libs}
 6 |     ${conversion_libs}
 7 |     Affine64Unroll
 8 |     MakeRunAble
 9 |     MLIROptLib
10 |     MLIRPass
11 | )
12 | 
13 | add_llvm_executable(project-opt project-opt.cpp)
14 | 
15 | llvm_update_compile_flags(project-opt)
16 | target_link_libraries(project-opt PRIVATE ${LIBS})
17 | 
18 | mlir_check_all_link_libraries(project-opt)
19 | 


--------------------------------------------------------------------------------
/tools/project-opt.cpp:
--------------------------------------------------------------------------------
 1 | #include "include/Transform/Affine/Affine64Unroll.h"
 2 | #include "include/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.h"
 3 | #include "include/Transform/MakeRunAble/RemoveGlobalConstants.h"
 4 | #include "include/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.h"
 5 | #include "mlir/include/mlir/InitAllDialects.h"
 6 | #include "mlir/include/mlir/Pass/PassManager.h"
 7 | #include "mlir/include/mlir/Pass/PassRegistry.h"
 8 | #include "mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h"
 9 | 
10 | int main(int argc, char **argv) {
11 |   mlir::DialectRegistry registry;
12 |   mlir::registerAllDialects(registry);
13 | 
14 |   mlir::PassRegistration<mlir::project::Affine64UnrollPass>();
15 |   mlir::PassRegistration<mlir::project::RemoveGlobalConstants>();
16 |   mlir::PassRegistration<mlir::project::RemoveForwardFuncArgsAndReturn>();
17 |   mlir::PassRegistration<
18 |       mlir::project::ZeroInitRemoveForwardFuncArgsAndReturn>();
19 |   return mlir::asMainReturnCode(
20 |       mlir::MlirOptMain(argc, argv, "Pass Driver", registry));
21 | }


--------------------------------------------------------------------------------