├── .gitignore ├── .gitmodules ├── CMakeLists.txt ├── README.md ├── Torch_MLIR ├── modified │ └── modified.mlir └── pytorch │ ├── alexnet.mlir │ ├── resnet152.mlir │ └── vgg11.mlir ├── benchmark.sh ├── benchmarks ├── CMakeLists.txt ├── GoogleBenchmarks │ ├── CMakeLists.txt │ └── run_bench.cpp └── Hardware_Counters_or_Time │ ├── CMakeLists.txt │ ├── run_bench.cpp │ └── run_bench.h ├── build_llvm.sh ├── build_mlir.sh ├── build_obj.sh ├── example_output.png ├── include └── Transform │ ├── Affine │ └── Affine64Unroll.h │ └── MakeRunAble │ ├── RemoveForwardFuncArgsAndReturn.h │ ├── RemoveGlobalConstants.h │ └── ZeroInitRemoveForwardFuncArgsAndReturn.h ├── lib ├── CMakeLists.txt └── Transform │ ├── Affine │ ├── Affine64Unroll.cpp │ └── CMakeLists.txt │ ├── CMakeLists.txt │ └── MakeRunAble │ ├── CMakeLists.txt │ ├── RemoveForwardFuncArgsAndReturn.cpp │ ├── RemoveGlobalConstants.cpp │ └── ZeroInitRemoveForwardFuncArgsAndReturn.cpp ├── make_MLIR_obj.py ├── tests └── CMakeLists.txt └── tools ├── CMakeLists.txt └── project-opt.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | build* 2 | .cache* 3 | .vscode* 4 | temp.mlir 5 | generate_MLIR.py 6 | Torch_MLIR/* 7 | /benchmarks/mlir_obj/* 8 | Onnx_models/* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "External/llvm-project"] 2 | path = External/llvm-project 3 | url = https://github.com/llvm/llvm-project.git 4 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.20.0) 2 | 3 | project(OptML LANGUAGES CXX C) 4 | 5 | set(CMAKE_CXX_STANDARD 17 CACHE STRING "C++ standard to conform to") 6 | set(CMAKE_POSITION_INDEPENDENT_CODE ON) 7 | set(BUILD_DEPS ON) 8 | 9 | set(MLIR_DIR External/llvm-project/build/lib/cmake/mlir) 10 | 11 | find_package(MLIR REQUIRED CONFIG) 12 | 13 | message(STATUS "Using MLIRConfig.cmake in: ${MLIR_DIR}") 14 | message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") 15 | 16 | set(MLIR_BINARY_DIR ${CMAKE_BINARY_DIR}) 17 | 18 | include(AddLLVM) 19 | include(TableGen) 20 | 21 | list(APPEND CMAKE_MODULE_PATH "${MLIR_CMAKE_DIR}") 22 | include(AddMLIR) 23 | include_directories(${LLVM_INCLUDE_DIRS}) 24 | include_directories(${MLIR_INCLUDE_DIRS}) 25 | include_directories(${PROJECT_SOURCE_DIR}) 26 | include_directories(${PROJECT_SOURCE_DIR}/External/llvm-project) 27 | include_directories(${PROJECT_BINARY_DIR}) 28 | 29 | add_custom_command( 30 | OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o" 31 | OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o" 32 | OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o" 33 | OUTPUT "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o" 34 | COMMAND ${CMAKE_COMMAND} -E echo "Compiling oracle object files" 35 | COMMAND ${CMAKE_COMMAND} -E env bash ${CMAKE_SOURCE_DIR}/build_obj.sh ${CMAKE_SOURCE_DIR} 36 | DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/alexnet.mlir 37 | DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/resnet152.mlir 38 | DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/vgg11.mlir 39 | DEPENDS ${CMAKE_SOURCE_DIR}/Torch_MLIR/modified/modified.mlir 40 | COMMENT "Compiling oracle object files" 41 | ) 42 | add_custom_target(build_alexnet_obj ALL 43 | DEPENDS ${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o 44 | ) 45 | 46 | 47 | message(STATUS "Fetching Google Benchmarks..") 48 | include(FetchContent) 49 | set(BENCHMARK_ENABLE_TESTING NO) 50 | set(CMAKE_BUILD_TYPE Release) # Ensure the build type is Release 51 | FetchContent_Declare( 52 | googlebenchmark 53 | GIT_REPOSITORY https://github.com/google/benchmark.git 54 | GIT_TAG origin/main 55 | ) 56 | # FetchContent_MakeAvailable(or-tools googlebenchmark) 57 | FetchContent_MakeAvailable(googlebenchmark) 58 | set(BENCHMARK_ENABLE_LTO ON) # Enable Link Time Optimization for better performance 59 | 60 | # add_subdirectory(tests) 61 | add_subdirectory(tools) 62 | add_subdirectory(lib) 63 | add_subdirectory(benchmarks) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OptML 2 | 3 | Welcome to OptML! This repository is designed for those new to MLIR and machine learning-based optimizations. As a compiler enthusiast, I wanted to create a platform for hobbyists like myself to experiment with and benchmark new optimizations on real ML models in an out-of-tree manner. This project is heavily inspired by [mlir-tutorial](https://github.com/j2kun/mlir-tutorial), which laid the foundation for my learning and development. 4 | 5 | ## Table of Contents 6 | 7 | 1. [Vision Models](#vision-models) 8 | 2. [Benchmarking Options](#benchmarking-options) 9 | 3. [Build Instructions](#build-instructions) 10 | 4. [Usage Guide](#usage-guide) 11 | 5. [Benchmarking Process](#benchmarking-process) 12 | 6. [Files of Interest](#files-of-interest) 13 | 14 | 15 | 16 | ## Vision Models 17 | 18 | The repository includes three vision models generated from TorchScript: 19 | 20 | 1. AlexNet 21 | 2. VGG11 22 | 3. ResNet152 23 | 24 | ## Benchmarking Options 25 | 26 | OptML supports multiple benchmarking methodologies: 27 | 28 | 1. **Google Benchmarks** 29 | 2. **Hardware Counters (PAPI)** 30 | 3. **C++ Chrono library** 31 | 32 | ## Build Instructions 33 | 34 | ### Prerequisites 35 | 36 | Before building and running OptML, make sure you have the following installed: 37 | 38 | 1. **CMake** (version 3.20 or higher) 39 | 2. **PAPI** (for hardware counter support) 40 | 3. **Python 3.x** (for script execution) 41 | 4. **C/C++ compiler** (clang 17 /gcc 11 or higher) 42 | 43 | Ensure that these dependencies are installed and configured correctly before proceeding with the build instructions. 44 | 45 | ### 1) Clone the Repository 46 | ```bash 47 | git clone https://github.com/mvvsmk/OptML.git 48 | cd OptML 49 | git submodule update --init --recursive 50 | ``` 51 | 52 | ### 2) Run the Build Script 53 | ```bash 54 | ./build_llvm.sh # Builds the LLVM submodule 55 | # please note while doing check-mlir build might fail but it doesn't affect the project. 56 | ./build_mlir.sh # Builds the project-opt tool with out-of-tree optimizations 57 | ``` 58 | 59 | ### 3) Run a Pre-included Benchmark to Verify the Setup 60 | ```bash 61 | ./benchmark.sh chrono Alexnet --affine-64-unroll 62 | ``` 63 | 64 | ## Usage Guide 65 | 66 | Let's walk through how to use this repository, specifically using the `Affine64Unroll` pass I implemented. 67 | 68 | 1. **Headers**: Located in `$rootdir/include/Transform/Affine/` 69 | 2. **Implementation**: Located under `$rootdir/lib/Transform/Affine/` 70 | 3. **CMake File**: The CMake file in the implementation folder is straightforward; ensure you include the necessary libraries for your use case. 71 | 4. **Register the Pass**: Register your pass with the `project-opt` tool. 72 | 73 | Now, you're all set! 74 | 75 | To run your own benchmarks, use the command: 76 | ```bash 77 | $rootdir/benchmark.sh [benchmark_type] [ML_model] [benchmark_flag] [PAPI_event_name] 78 | ``` 79 | ![sample_output](https://github.com/mvvsmk/OptML/blob/main/example_output.png?raw=true) 80 | 81 | > [!WARNING] 82 | > Before interpreting your benchmark results, it's important to understand how the benchmarking process works. 83 | > This looks like a 29% increase but what you miss, is the object file size increase from 16KB to 2.5MB. XD . This was measured on a Intel(R) Core(TM) Ultra 9 185H, in a single threaded manner, this measurement also includes array initilization to all zeros. 84 | 85 | 86 | 87 | ## Benchmarking Process 88 | 89 | When the object file is compiled, two important passes are run: 90 | 91 | 1. **`--rem-forward-func-args-and-return-run-mlir-zero-init`**: 92 | - This pass removes the arguments and return values of the `forward` function to make all functions uniform. 93 | - You can choose between two variants: one where the arguments are zero-initialized, and another where they remain uninitialized (resulting in undefined behavior). 94 | - Uninitialized one takes less memory to compile as adding the initialization ends up creating a lot of instructions which takes up a lot of ram. 95 | The default behaviour is to zero init the argument (usually the picture for the model) but if you are feeling lucky and want to experiment with undefined behviour change the following in make_MLIR_obj.py : 96 | ```diff 97 | if __name__ == "__main__": 98 | parser = argparse.ArgumentParser(description='Compile MLIR files to object files') 99 | parser.add_argument('input_folder', type=str, help='Absolute path to the folder with MLIR files or the only MLIR file') 100 | parser.add_argument('output_folder', type=str, help='Absolute path to the folder where object files will be stored') 101 | - parser.add_argument('--mlir-flags', required=False,default="--rem-forward-func-args-and-return-run-mlir-zero-init --rem-global-constants-run-mlir" ,type=str, help='Flags to be passed to project-opt') 102 | + parser.add_argument('--mlir-flags', required=False,default="--rem-forward-func-args-and-return-run-mlir --rem-global-constants-run-mlir" ,type=str, help='Flags to be passed to project-opt') 103 | 104 | ``` 105 | 106 | 2. **`--rem-global-constants-run-mlir`**: 107 | - This pass removes global constants and places them inside the main function.This is to prevent some ```:5:3: error: resource does not exist``` errors 108 | 109 | After the modified MLIR is generated, it is compiled into an object file without any optimizations (creating either the original or oracle version). For benchmarking, the object file is linked against an empty C++ file that benchmarks the function: 110 | ```cpp 111 | extern "C" void forward(); 112 | ``` 113 | 114 | When you run a pass using the `benchmark.sh` script, it generates a `Modified.mlir` file, which is then processed through the same pipeline and linked for benchmarking. 115 | 116 | All these benchmarks are run in a single threaded manner without sudo taskset -c . 117 | 118 | You can use the *.cpp and *.h files present in the `benchmarks/Hardware_Counters_or_Time` folder and add any custom parameters you want to measure. 119 | 120 | ### Files of Interest 121 | 122 | Do go through these files if you want to lean more on how the benchmarks actually compile and execute. 123 | 124 | - **`benchmark.sh`**: Executes your pass and compares the results against the original. 125 | - **`make_MLIR_obj.py`**: Converts the MLIR file to an object file for benchmarking. 126 | -------------------------------------------------------------------------------- /Torch_MLIR/pytorch/alexnet.mlir: -------------------------------------------------------------------------------- 1 | module attributes {torch.debug_module_name = "AlexNet"} { 2 | memref.global "private" constant @__constant_64x3x11x11xf32 : memref<64x3x11x11xf32> = dense_resource<__elided__> {alignment = 64 : i64} 3 | memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense_resource<__elided__> {alignment = 64 : i64} 4 | memref.global "private" constant @__constant_192x64x5x5xf32 : memref<192x64x5x5xf32> = dense_resource<__elided__> {alignment = 64 : i64} 5 | memref.global "private" constant @__constant_192xf32 : memref<192xf32> = dense_resource<__elided__> {alignment = 64 : i64} 6 | memref.global "private" constant @__constant_384x192x3x3xf32 : memref<384x192x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 7 | memref.global "private" constant @__constant_384xf32 : memref<384xf32> = dense_resource<__elided__> {alignment = 64 : i64} 8 | memref.global "private" constant @__constant_256x384x3x3xf32 : memref<256x384x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 9 | memref.global "private" constant @__constant_256xf32_0 : memref<256xf32> = dense_resource<__elided__> {alignment = 64 : i64} 10 | memref.global "private" constant @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 11 | memref.global "private" constant @__constant_256xf32 : memref<256xf32> = dense_resource<__elided__> {alignment = 64 : i64} 12 | memref.global "private" constant @__constant_4096x9216xf32 : memref<4096x9216xf32> = dense_resource<__elided__> {alignment = 64 : i64} 13 | memref.global "private" constant @__constant_4096xf32_0 : memref<4096xf32> = dense_resource<__elided__> {alignment = 64 : i64} 14 | memref.global "private" constant @__constant_4096x4096xf32 : memref<4096x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64} 15 | memref.global "private" constant @__constant_4096xf32 : memref<4096xf32> = dense_resource<__elided__> {alignment = 64 : i64} 16 | memref.global "private" constant @__constant_1000x4096xf32 : memref<1000x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64} 17 | memref.global "private" constant @__constant_1000xf32 : memref<1000xf32> = dense_resource<__elided__> {alignment = 64 : i64} 18 | func.func @forward(%arg0: memref<64x3x224x224xf32>) -> memref<64x1000xf32> { 19 | %cst = arith.constant -3.40282347E+38 : f32 20 | %cst_0 = arith.constant 0.000000e+00 : f32 21 | %0 = memref.get_global @__constant_1000xf32 : memref<1000xf32> 22 | %1 = memref.get_global @__constant_1000x4096xf32 : memref<1000x4096xf32> 23 | %2 = memref.get_global @__constant_4096xf32 : memref<4096xf32> 24 | %3 = memref.get_global @__constant_4096x4096xf32 : memref<4096x4096xf32> 25 | %4 = memref.get_global @__constant_4096xf32_0 : memref<4096xf32> 26 | %5 = memref.get_global @__constant_4096x9216xf32 : memref<4096x9216xf32> 27 | %6 = memref.get_global @__constant_256xf32 : memref<256xf32> 28 | %7 = memref.get_global @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32> 29 | %8 = memref.get_global @__constant_256xf32_0 : memref<256xf32> 30 | %9 = memref.get_global @__constant_256x384x3x3xf32 : memref<256x384x3x3xf32> 31 | %10 = memref.get_global @__constant_384xf32 : memref<384xf32> 32 | %11 = memref.get_global @__constant_384x192x3x3xf32 : memref<384x192x3x3xf32> 33 | %12 = memref.get_global @__constant_192xf32 : memref<192xf32> 34 | %13 = memref.get_global @__constant_192x64x5x5xf32 : memref<192x64x5x5xf32> 35 | %14 = memref.get_global @__constant_64xf32 : memref<64xf32> 36 | %15 = memref.get_global @__constant_64x3x11x11xf32 : memref<64x3x11x11xf32> 37 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x3x228x228xf32> 38 | affine.for %arg1 = 0 to 64 { 39 | affine.for %arg2 = 0 to 3 { 40 | affine.for %arg3 = 0 to 228 { 41 | affine.for %arg4 = 0 to 228 { 42 | affine.store %cst_0, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<64x3x228x228xf32> 43 | } 44 | } 45 | } 46 | } 47 | %subview = memref.subview %alloc[0, 0, 2, 2] [64, 3, 224, 224] [1, 1, 1, 1] : memref<64x3x228x228xf32> to memref<64x3x224x224xf32, strided<[155952, 51984, 228, 1], offset: 458>> 48 | memref.copy %arg0, %subview : memref<64x3x224x224xf32> to memref<64x3x224x224xf32, strided<[155952, 51984, 228, 1], offset: 458>> 49 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<64x64x55x55xf32> 50 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<64x64x55x55xf32> 51 | affine.for %arg1 = 0 to 64 { 52 | affine.for %arg2 = 0 to 64 { 53 | affine.for %arg3 = 0 to 55 { 54 | affine.for %arg4 = 0 to 55 { 55 | %16 = affine.load %14[%arg2] : memref<64xf32> 56 | affine.store %16, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32> 57 | } 58 | } 59 | } 60 | } 61 | affine.for %arg1 = 0 to 64 { 62 | affine.for %arg2 = 0 to 64 { 63 | affine.for %arg3 = 0 to 55 { 64 | affine.for %arg4 = 0 to 55 { 65 | affine.for %arg5 = 0 to 3 { 66 | affine.for %arg6 = 0 to 11 { 67 | affine.for %arg7 = 0 to 11 { 68 | %16 = affine.load %alloc[%arg1, %arg5, %arg3 * 4 + %arg6, %arg4 * 4 + %arg7] : memref<64x3x228x228xf32> 69 | %17 = affine.load %15[%arg2, %arg5, %arg6, %arg7] : memref<64x3x11x11xf32> 70 | %18 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32> 71 | %19 = arith.mulf %16, %17 : f32 72 | %20 = arith.addf %18, %19 : f32 73 | affine.store %20, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32> 74 | } 75 | } 76 | } 77 | } 78 | } 79 | } 80 | } 81 | affine.for %arg1 = 0 to 64 { 82 | affine.for %arg2 = 0 to 64 { 83 | affine.for %arg3 = 0 to 55 { 84 | affine.for %arg4 = 0 to 55 { 85 | %16 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32> 86 | %17 = arith.cmpf ugt, %16, %cst_0 : f32 87 | %18 = arith.select %17, %16, %cst_0 : f32 88 | affine.store %18, %alloc_1[%arg1, %arg2, %arg3, %arg4] : memref<64x64x55x55xf32> 89 | } 90 | } 91 | } 92 | } 93 | %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<64x64x27x27xf32> 94 | affine.for %arg1 = 0 to 64 { 95 | affine.for %arg2 = 0 to 64 { 96 | affine.for %arg3 = 0 to 27 { 97 | affine.for %arg4 = 0 to 27 { 98 | affine.store %cst, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x27x27xf32> 99 | } 100 | } 101 | } 102 | } 103 | affine.for %arg1 = 0 to 64 { 104 | affine.for %arg2 = 0 to 64 { 105 | affine.for %arg3 = 0 to 27 { 106 | affine.for %arg4 = 0 to 27 { 107 | affine.for %arg5 = 0 to 3 { 108 | affine.for %arg6 = 0 to 3 { 109 | %16 = affine.load %alloc_1[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x64x55x55xf32> 110 | %17 = affine.load %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x27x27xf32> 111 | %18 = arith.maximumf %17, %16 : f32 112 | affine.store %18, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x27x27xf32> 113 | } 114 | } 115 | } 116 | } 117 | } 118 | } 119 | %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<64x64x31x31xf32> 120 | affine.for %arg1 = 0 to 64 { 121 | affine.for %arg2 = 0 to 64 { 122 | affine.for %arg3 = 0 to 31 { 123 | affine.for %arg4 = 0 to 31 { 124 | affine.store %cst_0, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<64x64x31x31xf32> 125 | } 126 | } 127 | } 128 | } 129 | %subview_5 = memref.subview %alloc_4[0, 0, 2, 2] [64, 64, 27, 27] [1, 1, 1, 1] : memref<64x64x31x31xf32> to memref<64x64x27x27xf32, strided<[61504, 961, 31, 1], offset: 64>> 130 | memref.copy %alloc_3, %subview_5 : memref<64x64x27x27xf32> to memref<64x64x27x27xf32, strided<[61504, 961, 31, 1], offset: 64>> 131 | %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<64x192x27x27xf32> 132 | %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<64x192x27x27xf32> 133 | affine.for %arg1 = 0 to 64 { 134 | affine.for %arg2 = 0 to 192 { 135 | affine.for %arg3 = 0 to 27 { 136 | affine.for %arg4 = 0 to 27 { 137 | %16 = affine.load %12[%arg2] : memref<192xf32> 138 | affine.store %16, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32> 139 | } 140 | } 141 | } 142 | } 143 | affine.for %arg1 = 0 to 64 { 144 | affine.for %arg2 = 0 to 192 { 145 | affine.for %arg3 = 0 to 27 { 146 | affine.for %arg4 = 0 to 27 { 147 | affine.for %arg5 = 0 to 64 { 148 | affine.for %arg6 = 0 to 5 { 149 | affine.for %arg7 = 0 to 5 { 150 | %16 = affine.load %alloc_4[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x64x31x31xf32> 151 | %17 = affine.load %13[%arg2, %arg5, %arg6, %arg7] : memref<192x64x5x5xf32> 152 | %18 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32> 153 | %19 = arith.mulf %16, %17 : f32 154 | %20 = arith.addf %18, %19 : f32 155 | affine.store %20, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32> 156 | } 157 | } 158 | } 159 | } 160 | } 161 | } 162 | } 163 | affine.for %arg1 = 0 to 64 { 164 | affine.for %arg2 = 0 to 192 { 165 | affine.for %arg3 = 0 to 27 { 166 | affine.for %arg4 = 0 to 27 { 167 | %16 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32> 168 | %17 = arith.cmpf ugt, %16, %cst_0 : f32 169 | %18 = arith.select %17, %16, %cst_0 : f32 170 | affine.store %18, %alloc_6[%arg1, %arg2, %arg3, %arg4] : memref<64x192x27x27xf32> 171 | } 172 | } 173 | } 174 | } 175 | %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<64x192x13x13xf32> 176 | affine.for %arg1 = 0 to 64 { 177 | affine.for %arg2 = 0 to 192 { 178 | affine.for %arg3 = 0 to 13 { 179 | affine.for %arg4 = 0 to 13 { 180 | affine.store %cst, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x192x13x13xf32> 181 | } 182 | } 183 | } 184 | } 185 | affine.for %arg1 = 0 to 64 { 186 | affine.for %arg2 = 0 to 192 { 187 | affine.for %arg3 = 0 to 13 { 188 | affine.for %arg4 = 0 to 13 { 189 | affine.for %arg5 = 0 to 3 { 190 | affine.for %arg6 = 0 to 3 { 191 | %16 = affine.load %alloc_6[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x192x27x27xf32> 192 | %17 = affine.load %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x192x13x13xf32> 193 | %18 = arith.maximumf %17, %16 : f32 194 | affine.store %18, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x192x13x13xf32> 195 | } 196 | } 197 | } 198 | } 199 | } 200 | } 201 | %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<64x192x15x15xf32> 202 | affine.for %arg1 = 0 to 64 { 203 | affine.for %arg2 = 0 to 192 { 204 | affine.for %arg3 = 0 to 15 { 205 | affine.for %arg4 = 0 to 15 { 206 | affine.store %cst_0, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<64x192x15x15xf32> 207 | } 208 | } 209 | } 210 | } 211 | %subview_10 = memref.subview %alloc_9[0, 0, 1, 1] [64, 192, 13, 13] [1, 1, 1, 1] : memref<64x192x15x15xf32> to memref<64x192x13x13xf32, strided<[43200, 225, 15, 1], offset: 16>> 212 | memref.copy %alloc_8, %subview_10 : memref<64x192x13x13xf32> to memref<64x192x13x13xf32, strided<[43200, 225, 15, 1], offset: 16>> 213 | %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<64x384x13x13xf32> 214 | %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<64x384x13x13xf32> 215 | affine.for %arg1 = 0 to 64 { 216 | affine.for %arg2 = 0 to 384 { 217 | affine.for %arg3 = 0 to 13 { 218 | affine.for %arg4 = 0 to 13 { 219 | %16 = affine.load %10[%arg2] : memref<384xf32> 220 | affine.store %16, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32> 221 | } 222 | } 223 | } 224 | } 225 | affine.for %arg1 = 0 to 64 { 226 | affine.for %arg2 = 0 to 384 { 227 | affine.for %arg3 = 0 to 13 { 228 | affine.for %arg4 = 0 to 13 { 229 | affine.for %arg5 = 0 to 192 { 230 | affine.for %arg6 = 0 to 3 { 231 | affine.for %arg7 = 0 to 3 { 232 | %16 = affine.load %alloc_9[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x192x15x15xf32> 233 | %17 = affine.load %11[%arg2, %arg5, %arg6, %arg7] : memref<384x192x3x3xf32> 234 | %18 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32> 235 | %19 = arith.mulf %16, %17 : f32 236 | %20 = arith.addf %18, %19 : f32 237 | affine.store %20, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32> 238 | } 239 | } 240 | } 241 | } 242 | } 243 | } 244 | } 245 | affine.for %arg1 = 0 to 64 { 246 | affine.for %arg2 = 0 to 384 { 247 | affine.for %arg3 = 0 to 13 { 248 | affine.for %arg4 = 0 to 13 { 249 | %16 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32> 250 | %17 = arith.cmpf ugt, %16, %cst_0 : f32 251 | %18 = arith.select %17, %16, %cst_0 : f32 252 | affine.store %18, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<64x384x13x13xf32> 253 | } 254 | } 255 | } 256 | } 257 | %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<64x384x15x15xf32> 258 | affine.for %arg1 = 0 to 64 { 259 | affine.for %arg2 = 0 to 384 { 260 | affine.for %arg3 = 0 to 15 { 261 | affine.for %arg4 = 0 to 15 { 262 | affine.store %cst_0, %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x384x15x15xf32> 263 | } 264 | } 265 | } 266 | } 267 | %subview_14 = memref.subview %alloc_13[0, 0, 1, 1] [64, 384, 13, 13] [1, 1, 1, 1] : memref<64x384x15x15xf32> to memref<64x384x13x13xf32, strided<[86400, 225, 15, 1], offset: 16>> 268 | memref.copy %alloc_11, %subview_14 : memref<64x384x13x13xf32> to memref<64x384x13x13xf32, strided<[86400, 225, 15, 1], offset: 16>> 269 | %alloc_15 = memref.alloc() {alignment = 64 : i64} : memref<64x256x13x13xf32> 270 | %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<64x256x13x13xf32> 271 | affine.for %arg1 = 0 to 64 { 272 | affine.for %arg2 = 0 to 256 { 273 | affine.for %arg3 = 0 to 13 { 274 | affine.for %arg4 = 0 to 13 { 275 | %16 = affine.load %8[%arg2] : memref<256xf32> 276 | affine.store %16, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 277 | } 278 | } 279 | } 280 | } 281 | affine.for %arg1 = 0 to 64 { 282 | affine.for %arg2 = 0 to 256 { 283 | affine.for %arg3 = 0 to 13 { 284 | affine.for %arg4 = 0 to 13 { 285 | affine.for %arg5 = 0 to 384 { 286 | affine.for %arg6 = 0 to 3 { 287 | affine.for %arg7 = 0 to 3 { 288 | %16 = affine.load %alloc_13[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x384x15x15xf32> 289 | %17 = affine.load %9[%arg2, %arg5, %arg6, %arg7] : memref<256x384x3x3xf32> 290 | %18 = affine.load %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 291 | %19 = arith.mulf %16, %17 : f32 292 | %20 = arith.addf %18, %19 : f32 293 | affine.store %20, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 294 | } 295 | } 296 | } 297 | } 298 | } 299 | } 300 | } 301 | affine.for %arg1 = 0 to 64 { 302 | affine.for %arg2 = 0 to 256 { 303 | affine.for %arg3 = 0 to 13 { 304 | affine.for %arg4 = 0 to 13 { 305 | %16 = affine.load %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 306 | %17 = arith.cmpf ugt, %16, %cst_0 : f32 307 | %18 = arith.select %17, %16, %cst_0 : f32 308 | affine.store %18, %alloc_15[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 309 | } 310 | } 311 | } 312 | } 313 | %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<64x256x15x15xf32> 314 | affine.for %arg1 = 0 to 64 { 315 | affine.for %arg2 = 0 to 256 { 316 | affine.for %arg3 = 0 to 15 { 317 | affine.for %arg4 = 0 to 15 { 318 | affine.store %cst_0, %alloc_17[%arg1, %arg2, %arg3, %arg4] : memref<64x256x15x15xf32> 319 | } 320 | } 321 | } 322 | } 323 | %subview_18 = memref.subview %alloc_17[0, 0, 1, 1] [64, 256, 13, 13] [1, 1, 1, 1] : memref<64x256x15x15xf32> to memref<64x256x13x13xf32, strided<[57600, 225, 15, 1], offset: 16>> 324 | memref.copy %alloc_15, %subview_18 : memref<64x256x13x13xf32> to memref<64x256x13x13xf32, strided<[57600, 225, 15, 1], offset: 16>> 325 | %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<64x256x13x13xf32> 326 | affine.for %arg1 = 0 to 64 { 327 | affine.for %arg2 = 0 to 256 { 328 | affine.for %arg3 = 0 to 13 { 329 | affine.for %arg4 = 0 to 13 { 330 | %16 = affine.load %6[%arg2] : memref<256xf32> 331 | affine.store %16, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 332 | } 333 | } 334 | } 335 | } 336 | affine.for %arg1 = 0 to 64 { 337 | affine.for %arg2 = 0 to 256 { 338 | affine.for %arg3 = 0 to 13 { 339 | affine.for %arg4 = 0 to 13 { 340 | affine.for %arg5 = 0 to 256 { 341 | affine.for %arg6 = 0 to 3 { 342 | affine.for %arg7 = 0 to 3 { 343 | %16 = affine.load %alloc_17[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x256x15x15xf32> 344 | %17 = affine.load %7[%arg2, %arg5, %arg6, %arg7] : memref<256x256x3x3xf32> 345 | %18 = affine.load %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 346 | %19 = arith.mulf %16, %17 : f32 347 | %20 = arith.addf %18, %19 : f32 348 | affine.store %20, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 349 | } 350 | } 351 | } 352 | } 353 | } 354 | } 355 | } 356 | affine.for %arg1 = 0 to 64 { 357 | affine.for %arg2 = 0 to 256 { 358 | affine.for %arg3 = 0 to 13 { 359 | affine.for %arg4 = 0 to 13 { 360 | %16 = affine.load %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 361 | %17 = arith.cmpf ugt, %16, %cst_0 : f32 362 | %18 = arith.select %17, %16, %cst_0 : f32 363 | affine.store %18, %alloc_15[%arg1, %arg2, %arg3, %arg4] : memref<64x256x13x13xf32> 364 | } 365 | } 366 | } 367 | } 368 | %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<64x256x6x6xf32> 369 | %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<64x256x6x6xf32> 370 | affine.for %arg1 = 0 to 64 { 371 | affine.for %arg2 = 0 to 256 { 372 | affine.for %arg3 = 0 to 6 { 373 | affine.for %arg4 = 0 to 6 { 374 | affine.store %cst, %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32> 375 | } 376 | } 377 | } 378 | } 379 | affine.for %arg1 = 0 to 64 { 380 | affine.for %arg2 = 0 to 256 { 381 | affine.for %arg3 = 0 to 6 { 382 | affine.for %arg4 = 0 to 6 { 383 | affine.for %arg5 = 0 to 3 { 384 | affine.for %arg6 = 0 to 3 { 385 | %16 = affine.load %alloc_15[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x256x13x13xf32> 386 | %17 = affine.load %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32> 387 | %18 = arith.maximumf %17, %16 : f32 388 | affine.store %18, %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32> 389 | } 390 | } 391 | } 392 | } 393 | } 394 | } 395 | affine.for %arg1 = 0 to 64 { 396 | affine.for %arg2 = 0 to 256 { 397 | affine.for %arg3 = 0 to 6 { 398 | affine.for %arg4 = 0 to 6 { 399 | affine.store %cst_0, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32> 400 | } 401 | } 402 | } 403 | } 404 | affine.for %arg1 = 0 to 64 { 405 | affine.for %arg2 = 0 to 256 { 406 | affine.for %arg3 = 0 to 6 { 407 | affine.for %arg4 = 0 to 6 { 408 | affine.for %arg5 = 0 to 1 { 409 | affine.for %arg6 = 0 to 1 { 410 | %16 = affine.load %alloc_21[%arg1, %arg2, %arg3 + %arg5, %arg4 + %arg6] : memref<64x256x6x6xf32> 411 | %17 = affine.load %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32> 412 | %18 = arith.addf %17, %16 : f32 413 | affine.store %18, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x256x6x6xf32> 414 | } 415 | } 416 | } 417 | } 418 | } 419 | } 420 | %collapse_shape = memref.collapse_shape %alloc_20 [[0], [1, 2, 3]] : memref<64x256x6x6xf32> into memref<64x9216xf32> 421 | %alloc_22 = memref.alloc() {alignment = 64 : i64} : memref<9216x4096xf32> 422 | affine.for %arg1 = 0 to 4096 { 423 | affine.for %arg2 = 0 to 9216 { 424 | %16 = affine.load %5[%arg1, %arg2] : memref<4096x9216xf32> 425 | affine.store %16, %alloc_22[%arg2, %arg1] : memref<9216x4096xf32> 426 | } 427 | } 428 | %alloc_23 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 429 | %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 430 | affine.for %arg1 = 0 to 64 { 431 | affine.for %arg2 = 0 to 4096 { 432 | affine.store %cst_0, %alloc_24[%arg1, %arg2] : memref<64x4096xf32> 433 | } 434 | } 435 | %alloc_25 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 436 | memref.copy %alloc_24, %alloc_25 : memref<64x4096xf32> to memref<64x4096xf32> 437 | affine.for %arg1 = 0 to 64 { 438 | affine.for %arg2 = 0 to 4096 { 439 | affine.for %arg3 = 0 to 9216 { 440 | %16 = affine.load %collapse_shape[%arg1, %arg3] : memref<64x9216xf32> 441 | %17 = affine.load %alloc_22[%arg3, %arg2] : memref<9216x4096xf32> 442 | %18 = affine.load %alloc_25[%arg1, %arg2] : memref<64x4096xf32> 443 | %19 = arith.mulf %16, %17 : f32 444 | %20 = arith.addf %18, %19 : f32 445 | affine.store %20, %alloc_25[%arg1, %arg2] : memref<64x4096xf32> 446 | } 447 | } 448 | } 449 | %alloc_26 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 450 | affine.for %arg1 = 0 to 64 { 451 | affine.for %arg2 = 0 to 4096 { 452 | %16 = affine.load %alloc_25[%arg1, %arg2] : memref<64x4096xf32> 453 | %17 = affine.load %4[%arg2] : memref<4096xf32> 454 | %18 = arith.addf %16, %17 : f32 455 | affine.store %18, %alloc_26[%arg1, %arg2] : memref<64x4096xf32> 456 | } 457 | } 458 | affine.for %arg1 = 0 to 64 { 459 | affine.for %arg2 = 0 to 4096 { 460 | %16 = affine.load %alloc_26[%arg1, %arg2] : memref<64x4096xf32> 461 | %17 = arith.cmpf ugt, %16, %cst_0 : f32 462 | %18 = arith.select %17, %16, %cst_0 : f32 463 | affine.store %18, %alloc_23[%arg1, %arg2] : memref<64x4096xf32> 464 | } 465 | } 466 | %alloc_27 = memref.alloc() {alignment = 64 : i64} : memref<4096x4096xf32> 467 | affine.for %arg1 = 0 to 4096 { 468 | affine.for %arg2 = 0 to 4096 { 469 | %16 = affine.load %3[%arg1, %arg2] : memref<4096x4096xf32> 470 | affine.store %16, %alloc_27[%arg2, %arg1] : memref<4096x4096xf32> 471 | } 472 | } 473 | affine.for %arg1 = 0 to 64 { 474 | affine.for %arg2 = 0 to 4096 { 475 | affine.for %arg3 = 0 to 4096 { 476 | %16 = affine.load %alloc_23[%arg1, %arg3] : memref<64x4096xf32> 477 | %17 = affine.load %alloc_27[%arg3, %arg2] : memref<4096x4096xf32> 478 | %18 = affine.load %alloc_24[%arg1, %arg2] : memref<64x4096xf32> 479 | %19 = arith.mulf %16, %17 : f32 480 | %20 = arith.addf %18, %19 : f32 481 | affine.store %20, %alloc_24[%arg1, %arg2] : memref<64x4096xf32> 482 | } 483 | } 484 | } 485 | %alloc_28 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 486 | affine.for %arg1 = 0 to 64 { 487 | affine.for %arg2 = 0 to 4096 { 488 | %16 = affine.load %alloc_24[%arg1, %arg2] : memref<64x4096xf32> 489 | %17 = affine.load %2[%arg2] : memref<4096xf32> 490 | %18 = arith.addf %16, %17 : f32 491 | affine.store %18, %alloc_28[%arg1, %arg2] : memref<64x4096xf32> 492 | } 493 | } 494 | affine.for %arg1 = 0 to 64 { 495 | affine.for %arg2 = 0 to 4096 { 496 | %16 = affine.load %alloc_28[%arg1, %arg2] : memref<64x4096xf32> 497 | %17 = arith.cmpf ugt, %16, %cst_0 : f32 498 | %18 = arith.select %17, %16, %cst_0 : f32 499 | affine.store %18, %alloc_23[%arg1, %arg2] : memref<64x4096xf32> 500 | } 501 | } 502 | %alloc_29 = memref.alloc() {alignment = 64 : i64} : memref<4096x1000xf32> 503 | affine.for %arg1 = 0 to 1000 { 504 | affine.for %arg2 = 0 to 4096 { 505 | %16 = affine.load %1[%arg1, %arg2] : memref<1000x4096xf32> 506 | affine.store %16, %alloc_29[%arg2, %arg1] : memref<4096x1000xf32> 507 | } 508 | } 509 | %alloc_30 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32> 510 | %alloc_31 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32> 511 | affine.for %arg1 = 0 to 64 { 512 | affine.for %arg2 = 0 to 1000 { 513 | affine.store %cst_0, %alloc_31[%arg1, %arg2] : memref<64x1000xf32> 514 | } 515 | } 516 | affine.for %arg1 = 0 to 64 { 517 | affine.for %arg2 = 0 to 1000 { 518 | affine.for %arg3 = 0 to 4096 { 519 | %16 = affine.load %alloc_23[%arg1, %arg3] : memref<64x4096xf32> 520 | %17 = affine.load %alloc_29[%arg3, %arg2] : memref<4096x1000xf32> 521 | %18 = affine.load %alloc_31[%arg1, %arg2] : memref<64x1000xf32> 522 | %19 = arith.mulf %16, %17 : f32 523 | %20 = arith.addf %18, %19 : f32 524 | affine.store %20, %alloc_31[%arg1, %arg2] : memref<64x1000xf32> 525 | } 526 | } 527 | } 528 | affine.for %arg1 = 0 to 64 { 529 | affine.for %arg2 = 0 to 1000 { 530 | %16 = affine.load %alloc_31[%arg1, %arg2] : memref<64x1000xf32> 531 | %17 = affine.load %0[%arg2] : memref<1000xf32> 532 | %18 = arith.addf %16, %17 : f32 533 | affine.store %18, %alloc_30[%arg1, %arg2] : memref<64x1000xf32> 534 | } 535 | } 536 | memref.dealloc %alloc_1 : memref<64x64x55x55xf32> 537 | memref.dealloc %alloc_2 : memref<64x64x55x55xf32> 538 | memref.dealloc %alloc_3 : memref<64x64x27x27xf32> 539 | memref.dealloc %alloc_6 : memref<64x192x27x27xf32> 540 | memref.dealloc %alloc_7 : memref<64x192x27x27xf32> 541 | memref.dealloc %alloc_8 : memref<64x192x13x13xf32> 542 | memref.dealloc %alloc_11 : memref<64x384x13x13xf32> 543 | memref.dealloc %alloc_12 : memref<64x384x13x13xf32> 544 | memref.dealloc %alloc_15 : memref<64x256x13x13xf32> 545 | memref.dealloc %alloc_16 : memref<64x256x13x13xf32> 546 | memref.dealloc %alloc_19 : memref<64x256x13x13xf32> 547 | memref.dealloc %alloc_20 : memref<64x256x6x6xf32> 548 | memref.dealloc %alloc_21 : memref<64x256x6x6xf32> 549 | memref.dealloc %alloc_22 : memref<9216x4096xf32> 550 | memref.dealloc %alloc_23 : memref<64x4096xf32> 551 | memref.dealloc %alloc_24 : memref<64x4096xf32> 552 | memref.dealloc %alloc_25 : memref<64x4096xf32> 553 | memref.dealloc %alloc_26 : memref<64x4096xf32> 554 | memref.dealloc %alloc_27 : memref<4096x4096xf32> 555 | memref.dealloc %alloc_28 : memref<64x4096xf32> 556 | memref.dealloc %alloc_29 : memref<4096x1000xf32> 557 | memref.dealloc %alloc_30 : memref<64x1000xf32> 558 | memref.dealloc %alloc_31 : memref<64x1000xf32> 559 | memref.dealloc %alloc : memref<64x3x228x228xf32> 560 | memref.dealloc %alloc_4 : memref<64x64x31x31xf32> 561 | memref.dealloc %alloc_9 : memref<64x192x15x15xf32> 562 | memref.dealloc %alloc_13 : memref<64x384x15x15xf32> 563 | memref.dealloc %alloc_17 : memref<64x256x15x15xf32> 564 | return %alloc_30 : memref<64x1000xf32> 565 | } 566 | } 567 | -------------------------------------------------------------------------------- /Torch_MLIR/pytorch/vgg11.mlir: -------------------------------------------------------------------------------- 1 | module attributes {torch.debug_module_name = "VGG"} { 2 | memref.global "private" constant @__constant_64xf32 : memref<64xf32> = dense<0.000000e+00> {alignment = 64 : i64} 3 | memref.global "private" constant @__constant_64x3x3x3xf32 : memref<64x3x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 4 | memref.global "private" constant @__constant_128xf32 : memref<128xf32> = dense<0.000000e+00> {alignment = 64 : i64} 5 | memref.global "private" constant @__constant_128x64x3x3xf32 : memref<128x64x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 6 | memref.global "private" constant @__constant_256xf32 : memref<256xf32> = dense<0.000000e+00> {alignment = 64 : i64} 7 | memref.global "private" constant @__constant_256x128x3x3xf32 : memref<256x128x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 8 | memref.global "private" constant @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 9 | memref.global "private" constant @__constant_512xf32 : memref<512xf32> = dense<0.000000e+00> {alignment = 64 : i64} 10 | memref.global "private" constant @__constant_512x256x3x3xf32 : memref<512x256x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 11 | memref.global "private" constant @__constant_512x512x3x3xf32_1 : memref<512x512x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 12 | memref.global "private" constant @__constant_512x512x3x3xf32_0 : memref<512x512x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 13 | memref.global "private" constant @__constant_512x512x3x3xf32 : memref<512x512x3x3xf32> = dense_resource<__elided__> {alignment = 64 : i64} 14 | memref.global "private" constant @__constant_4096xf32 : memref<4096xf32> = dense<0.000000e+00> {alignment = 64 : i64} 15 | memref.global "private" constant @__constant_4096x25088xf32 : memref<4096x25088xf32> = dense_resource<__elided__> {alignment = 64 : i64} 16 | memref.global "private" constant @__constant_4096x4096xf32 : memref<4096x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64} 17 | memref.global "private" constant @__constant_1000xf32 : memref<1000xf32> = dense<0.000000e+00> {alignment = 64 : i64} 18 | memref.global "private" constant @__constant_1000x4096xf32 : memref<1000x4096xf32> = dense_resource<__elided__> {alignment = 64 : i64} 19 | 20 | func.func @forward(%arg0: memref<64x3x224x224xf32>) -> memref<64x1000xf32> { 21 | %cst = arith.constant -3.40282347E+38 : f32 22 | %cst_0 = arith.constant 0.000000e+00 : f32 23 | %0 = memref.get_global @__constant_1000x4096xf32 : memref<1000x4096xf32> 24 | %1 = memref.get_global @__constant_4096x4096xf32 : memref<4096x4096xf32> 25 | %2 = memref.get_global @__constant_4096x25088xf32 : memref<4096x25088xf32> 26 | %3 = memref.get_global @__constant_512x512x3x3xf32 : memref<512x512x3x3xf32> 27 | %4 = memref.get_global @__constant_512x512x3x3xf32_0 : memref<512x512x3x3xf32> 28 | %5 = memref.get_global @__constant_512x512x3x3xf32_1 : memref<512x512x3x3xf32> 29 | %6 = memref.get_global @__constant_512x256x3x3xf32 : memref<512x256x3x3xf32> 30 | %7 = memref.get_global @__constant_256x256x3x3xf32 : memref<256x256x3x3xf32> 31 | %8 = memref.get_global @__constant_256x128x3x3xf32 : memref<256x128x3x3xf32> 32 | %9 = memref.get_global @__constant_128x64x3x3xf32 : memref<128x64x3x3xf32> 33 | %10 = memref.get_global @__constant_64x3x3x3xf32 : memref<64x3x3x3xf32> 34 | %alloc = memref.alloc() {alignment = 64 : i64} : memref<64x3x226x226xf32> 35 | affine.for %arg1 = 0 to 64 { 36 | affine.for %arg2 = 0 to 3 { 37 | affine.for %arg3 = 0 to 226 { 38 | affine.for %arg4 = 0 to 226 { 39 | affine.store %cst_0, %alloc[%arg1, %arg2, %arg3, %arg4] : memref<64x3x226x226xf32> 40 | } 41 | } 42 | } 43 | } 44 | %subview = memref.subview %alloc[0, 0, 1, 1] [64, 3, 224, 224] [1, 1, 1, 1] : memref<64x3x226x226xf32> to memref<64x3x224x224xf32, strided<[153228, 51076, 226, 1], offset: 227>> 45 | memref.copy %arg0, %subview : memref<64x3x224x224xf32> to memref<64x3x224x224xf32, strided<[153228, 51076, 226, 1], offset: 227>> 46 | %alloc_1 = memref.alloc() {alignment = 64 : i64} : memref<64x64x224x224xf32> 47 | %alloc_2 = memref.alloc() {alignment = 64 : i64} : memref<64x64x224x224xf32> 48 | affine.for %arg1 = 0 to 64 { 49 | affine.for %arg2 = 0 to 64 { 50 | affine.for %arg3 = 0 to 224 { 51 | affine.for %arg4 = 0 to 224 { 52 | affine.store %cst_0, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32> 53 | } 54 | } 55 | } 56 | } 57 | affine.for %arg1 = 0 to 64 { 58 | affine.for %arg2 = 0 to 64 { 59 | affine.for %arg3 = 0 to 224 { 60 | affine.for %arg4 = 0 to 224 { 61 | affine.for %arg5 = 0 to 3 { 62 | affine.for %arg6 = 0 to 3 { 63 | affine.for %arg7 = 0 to 3 { 64 | %11 = affine.load %alloc[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x3x226x226xf32> 65 | %12 = affine.load %10[%arg2, %arg5, %arg6, %arg7] : memref<64x3x3x3xf32> 66 | %13 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32> 67 | %14 = arith.mulf %11, %12 : f32 68 | %15 = arith.addf %13, %14 : f32 69 | affine.store %15, %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32> 70 | } 71 | } 72 | } 73 | } 74 | } 75 | } 76 | } 77 | affine.for %arg1 = 0 to 64 { 78 | affine.for %arg2 = 0 to 64 { 79 | affine.for %arg3 = 0 to 224 { 80 | affine.for %arg4 = 0 to 224 { 81 | %11 = affine.load %alloc_2[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32> 82 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 83 | %13 = arith.select %12, %11, %cst_0 : f32 84 | affine.store %13, %alloc_1[%arg1, %arg2, %arg3, %arg4] : memref<64x64x224x224xf32> 85 | } 86 | } 87 | } 88 | } 89 | %alloc_3 = memref.alloc() {alignment = 64 : i64} : memref<64x64x112x112xf32> 90 | affine.for %arg1 = 0 to 64 { 91 | affine.for %arg2 = 0 to 64 { 92 | affine.for %arg3 = 0 to 112 { 93 | affine.for %arg4 = 0 to 112 { 94 | affine.store %cst, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x112x112xf32> 95 | } 96 | } 97 | } 98 | } 99 | affine.for %arg1 = 0 to 64 { 100 | affine.for %arg2 = 0 to 64 { 101 | affine.for %arg3 = 0 to 112 { 102 | affine.for %arg4 = 0 to 112 { 103 | affine.for %arg5 = 0 to 2 { 104 | affine.for %arg6 = 0 to 2 { 105 | %11 = affine.load %alloc_1[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x64x224x224xf32> 106 | %12 = affine.load %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x112x112xf32> 107 | %13 = arith.maximumf %12, %11 : f32 108 | affine.store %13, %alloc_3[%arg1, %arg2, %arg3, %arg4] : memref<64x64x112x112xf32> 109 | } 110 | } 111 | } 112 | } 113 | } 114 | } 115 | %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<64x64x114x114xf32> 116 | affine.for %arg1 = 0 to 64 { 117 | affine.for %arg2 = 0 to 64 { 118 | affine.for %arg3 = 0 to 114 { 119 | affine.for %arg4 = 0 to 114 { 120 | affine.store %cst_0, %alloc_4[%arg1, %arg2, %arg3, %arg4] : memref<64x64x114x114xf32> 121 | } 122 | } 123 | } 124 | } 125 | %subview_5 = memref.subview %alloc_4[0, 0, 1, 1] [64, 64, 112, 112] [1, 1, 1, 1] : memref<64x64x114x114xf32> to memref<64x64x112x112xf32, strided<[831744, 12996, 114, 1], offset: 115>> 126 | memref.copy %alloc_3, %subview_5 : memref<64x64x112x112xf32> to memref<64x64x112x112xf32, strided<[831744, 12996, 114, 1], offset: 115>> 127 | %alloc_6 = memref.alloc() {alignment = 64 : i64} : memref<64x128x112x112xf32> 128 | %alloc_7 = memref.alloc() {alignment = 64 : i64} : memref<64x128x112x112xf32> 129 | affine.for %arg1 = 0 to 64 { 130 | affine.for %arg2 = 0 to 128 { 131 | affine.for %arg3 = 0 to 112 { 132 | affine.for %arg4 = 0 to 112 { 133 | affine.store %cst_0, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32> 134 | } 135 | } 136 | } 137 | } 138 | affine.for %arg1 = 0 to 64 { 139 | affine.for %arg2 = 0 to 128 { 140 | affine.for %arg3 = 0 to 112 { 141 | affine.for %arg4 = 0 to 112 { 142 | affine.for %arg5 = 0 to 64 { 143 | affine.for %arg6 = 0 to 3 { 144 | affine.for %arg7 = 0 to 3 { 145 | %11 = affine.load %alloc_4[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x64x114x114xf32> 146 | %12 = affine.load %9[%arg2, %arg5, %arg6, %arg7] : memref<128x64x3x3xf32> 147 | %13 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32> 148 | %14 = arith.mulf %11, %12 : f32 149 | %15 = arith.addf %13, %14 : f32 150 | affine.store %15, %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32> 151 | } 152 | } 153 | } 154 | } 155 | } 156 | } 157 | } 158 | affine.for %arg1 = 0 to 64 { 159 | affine.for %arg2 = 0 to 128 { 160 | affine.for %arg3 = 0 to 112 { 161 | affine.for %arg4 = 0 to 112 { 162 | %11 = affine.load %alloc_7[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32> 163 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 164 | %13 = arith.select %12, %11, %cst_0 : f32 165 | affine.store %13, %alloc_6[%arg1, %arg2, %arg3, %arg4] : memref<64x128x112x112xf32> 166 | } 167 | } 168 | } 169 | } 170 | %alloc_8 = memref.alloc() {alignment = 64 : i64} : memref<64x128x56x56xf32> 171 | affine.for %arg1 = 0 to 64 { 172 | affine.for %arg2 = 0 to 128 { 173 | affine.for %arg3 = 0 to 56 { 174 | affine.for %arg4 = 0 to 56 { 175 | affine.store %cst, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x128x56x56xf32> 176 | } 177 | } 178 | } 179 | } 180 | affine.for %arg1 = 0 to 64 { 181 | affine.for %arg2 = 0 to 128 { 182 | affine.for %arg3 = 0 to 56 { 183 | affine.for %arg4 = 0 to 56 { 184 | affine.for %arg5 = 0 to 2 { 185 | affine.for %arg6 = 0 to 2 { 186 | %11 = affine.load %alloc_6[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x128x112x112xf32> 187 | %12 = affine.load %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x128x56x56xf32> 188 | %13 = arith.maximumf %12, %11 : f32 189 | affine.store %13, %alloc_8[%arg1, %arg2, %arg3, %arg4] : memref<64x128x56x56xf32> 190 | } 191 | } 192 | } 193 | } 194 | } 195 | } 196 | %alloc_9 = memref.alloc() {alignment = 64 : i64} : memref<64x128x58x58xf32> 197 | affine.for %arg1 = 0 to 64 { 198 | affine.for %arg2 = 0 to 128 { 199 | affine.for %arg3 = 0 to 58 { 200 | affine.for %arg4 = 0 to 58 { 201 | affine.store %cst_0, %alloc_9[%arg1, %arg2, %arg3, %arg4] : memref<64x128x58x58xf32> 202 | } 203 | } 204 | } 205 | } 206 | %subview_10 = memref.subview %alloc_9[0, 0, 1, 1] [64, 128, 56, 56] [1, 1, 1, 1] : memref<64x128x58x58xf32> to memref<64x128x56x56xf32, strided<[430592, 3364, 58, 1], offset: 59>> 207 | memref.copy %alloc_8, %subview_10 : memref<64x128x56x56xf32> to memref<64x128x56x56xf32, strided<[430592, 3364, 58, 1], offset: 59>> 208 | %alloc_11 = memref.alloc() {alignment = 64 : i64} : memref<64x256x56x56xf32> 209 | %alloc_12 = memref.alloc() {alignment = 64 : i64} : memref<64x256x56x56xf32> 210 | affine.for %arg1 = 0 to 64 { 211 | affine.for %arg2 = 0 to 256 { 212 | affine.for %arg3 = 0 to 56 { 213 | affine.for %arg4 = 0 to 56 { 214 | affine.store %cst_0, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 215 | } 216 | } 217 | } 218 | } 219 | %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<64x256x56x56xf32> 220 | memref.copy %alloc_12, %alloc_13 : memref<64x256x56x56xf32> to memref<64x256x56x56xf32> 221 | affine.for %arg1 = 0 to 64 { 222 | affine.for %arg2 = 0 to 256 { 223 | affine.for %arg3 = 0 to 56 { 224 | affine.for %arg4 = 0 to 56 { 225 | affine.for %arg5 = 0 to 128 { 226 | affine.for %arg6 = 0 to 3 { 227 | affine.for %arg7 = 0 to 3 { 228 | %11 = affine.load %alloc_9[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x128x58x58xf32> 229 | %12 = affine.load %8[%arg2, %arg5, %arg6, %arg7] : memref<256x128x3x3xf32> 230 | %13 = affine.load %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 231 | %14 = arith.mulf %11, %12 : f32 232 | %15 = arith.addf %13, %14 : f32 233 | affine.store %15, %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 234 | } 235 | } 236 | } 237 | } 238 | } 239 | } 240 | } 241 | affine.for %arg1 = 0 to 64 { 242 | affine.for %arg2 = 0 to 256 { 243 | affine.for %arg3 = 0 to 56 { 244 | affine.for %arg4 = 0 to 56 { 245 | %11 = affine.load %alloc_13[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 246 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 247 | %13 = arith.select %12, %11, %cst_0 : f32 248 | affine.store %13, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 249 | } 250 | } 251 | } 252 | } 253 | %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<64x256x58x58xf32> 254 | affine.for %arg1 = 0 to 64 { 255 | affine.for %arg2 = 0 to 256 { 256 | affine.for %arg3 = 0 to 58 { 257 | affine.for %arg4 = 0 to 58 { 258 | affine.store %cst_0, %alloc_14[%arg1, %arg2, %arg3, %arg4] : memref<64x256x58x58xf32> 259 | } 260 | } 261 | } 262 | } 263 | %subview_15 = memref.subview %alloc_14[0, 0, 1, 1] [64, 256, 56, 56] [1, 1, 1, 1] : memref<64x256x58x58xf32> to memref<64x256x56x56xf32, strided<[861184, 3364, 58, 1], offset: 59>> 264 | memref.copy %alloc_11, %subview_15 : memref<64x256x56x56xf32> to memref<64x256x56x56xf32, strided<[861184, 3364, 58, 1], offset: 59>> 265 | affine.for %arg1 = 0 to 64 { 266 | affine.for %arg2 = 0 to 256 { 267 | affine.for %arg3 = 0 to 56 { 268 | affine.for %arg4 = 0 to 56 { 269 | affine.for %arg5 = 0 to 256 { 270 | affine.for %arg6 = 0 to 3 { 271 | affine.for %arg7 = 0 to 3 { 272 | %11 = affine.load %alloc_14[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x256x58x58xf32> 273 | %12 = affine.load %7[%arg2, %arg5, %arg6, %arg7] : memref<256x256x3x3xf32> 274 | %13 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 275 | %14 = arith.mulf %11, %12 : f32 276 | %15 = arith.addf %13, %14 : f32 277 | affine.store %15, %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 278 | } 279 | } 280 | } 281 | } 282 | } 283 | } 284 | } 285 | affine.for %arg1 = 0 to 64 { 286 | affine.for %arg2 = 0 to 256 { 287 | affine.for %arg3 = 0 to 56 { 288 | affine.for %arg4 = 0 to 56 { 289 | %11 = affine.load %alloc_12[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 290 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 291 | %13 = arith.select %12, %11, %cst_0 : f32 292 | affine.store %13, %alloc_11[%arg1, %arg2, %arg3, %arg4] : memref<64x256x56x56xf32> 293 | } 294 | } 295 | } 296 | } 297 | %alloc_16 = memref.alloc() {alignment = 64 : i64} : memref<64x256x28x28xf32> 298 | affine.for %arg1 = 0 to 64 { 299 | affine.for %arg2 = 0 to 256 { 300 | affine.for %arg3 = 0 to 28 { 301 | affine.for %arg4 = 0 to 28 { 302 | affine.store %cst, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x28x28xf32> 303 | } 304 | } 305 | } 306 | } 307 | affine.for %arg1 = 0 to 64 { 308 | affine.for %arg2 = 0 to 256 { 309 | affine.for %arg3 = 0 to 28 { 310 | affine.for %arg4 = 0 to 28 { 311 | affine.for %arg5 = 0 to 2 { 312 | affine.for %arg6 = 0 to 2 { 313 | %11 = affine.load %alloc_11[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x256x56x56xf32> 314 | %12 = affine.load %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x28x28xf32> 315 | %13 = arith.maximumf %12, %11 : f32 316 | affine.store %13, %alloc_16[%arg1, %arg2, %arg3, %arg4] : memref<64x256x28x28xf32> 317 | } 318 | } 319 | } 320 | } 321 | } 322 | } 323 | %alloc_17 = memref.alloc() {alignment = 64 : i64} : memref<64x256x30x30xf32> 324 | affine.for %arg1 = 0 to 64 { 325 | affine.for %arg2 = 0 to 256 { 326 | affine.for %arg3 = 0 to 30 { 327 | affine.for %arg4 = 0 to 30 { 328 | affine.store %cst_0, %alloc_17[%arg1, %arg2, %arg3, %arg4] : memref<64x256x30x30xf32> 329 | } 330 | } 331 | } 332 | } 333 | %subview_18 = memref.subview %alloc_17[0, 0, 1, 1] [64, 256, 28, 28] [1, 1, 1, 1] : memref<64x256x30x30xf32> to memref<64x256x28x28xf32, strided<[230400, 900, 30, 1], offset: 31>> 334 | memref.copy %alloc_16, %subview_18 : memref<64x256x28x28xf32> to memref<64x256x28x28xf32, strided<[230400, 900, 30, 1], offset: 31>> 335 | %alloc_19 = memref.alloc() {alignment = 64 : i64} : memref<64x512x28x28xf32> 336 | %alloc_20 = memref.alloc() {alignment = 64 : i64} : memref<64x512x28x28xf32> 337 | affine.for %arg1 = 0 to 64 { 338 | affine.for %arg2 = 0 to 512 { 339 | affine.for %arg3 = 0 to 28 { 340 | affine.for %arg4 = 0 to 28 { 341 | affine.store %cst_0, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 342 | } 343 | } 344 | } 345 | } 346 | %alloc_21 = memref.alloc() {alignment = 64 : i64} : memref<64x512x28x28xf32> 347 | memref.copy %alloc_20, %alloc_21 : memref<64x512x28x28xf32> to memref<64x512x28x28xf32> 348 | affine.for %arg1 = 0 to 64 { 349 | affine.for %arg2 = 0 to 512 { 350 | affine.for %arg3 = 0 to 28 { 351 | affine.for %arg4 = 0 to 28 { 352 | affine.for %arg5 = 0 to 256 { 353 | affine.for %arg6 = 0 to 3 { 354 | affine.for %arg7 = 0 to 3 { 355 | %11 = affine.load %alloc_17[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x256x30x30xf32> 356 | %12 = affine.load %6[%arg2, %arg5, %arg6, %arg7] : memref<512x256x3x3xf32> 357 | %13 = affine.load %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 358 | %14 = arith.mulf %11, %12 : f32 359 | %15 = arith.addf %13, %14 : f32 360 | affine.store %15, %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 361 | } 362 | } 363 | } 364 | } 365 | } 366 | } 367 | } 368 | affine.for %arg1 = 0 to 64 { 369 | affine.for %arg2 = 0 to 512 { 370 | affine.for %arg3 = 0 to 28 { 371 | affine.for %arg4 = 0 to 28 { 372 | %11 = affine.load %alloc_21[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 373 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 374 | %13 = arith.select %12, %11, %cst_0 : f32 375 | affine.store %13, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 376 | } 377 | } 378 | } 379 | } 380 | %alloc_22 = memref.alloc() {alignment = 64 : i64} : memref<64x512x30x30xf32> 381 | affine.for %arg1 = 0 to 64 { 382 | affine.for %arg2 = 0 to 512 { 383 | affine.for %arg3 = 0 to 30 { 384 | affine.for %arg4 = 0 to 30 { 385 | affine.store %cst_0, %alloc_22[%arg1, %arg2, %arg3, %arg4] : memref<64x512x30x30xf32> 386 | } 387 | } 388 | } 389 | } 390 | %subview_23 = memref.subview %alloc_22[0, 0, 1, 1] [64, 512, 28, 28] [1, 1, 1, 1] : memref<64x512x30x30xf32> to memref<64x512x28x28xf32, strided<[460800, 900, 30, 1], offset: 31>> 391 | memref.copy %alloc_19, %subview_23 : memref<64x512x28x28xf32> to memref<64x512x28x28xf32, strided<[460800, 900, 30, 1], offset: 31>> 392 | affine.for %arg1 = 0 to 64 { 393 | affine.for %arg2 = 0 to 512 { 394 | affine.for %arg3 = 0 to 28 { 395 | affine.for %arg4 = 0 to 28 { 396 | affine.for %arg5 = 0 to 512 { 397 | affine.for %arg6 = 0 to 3 { 398 | affine.for %arg7 = 0 to 3 { 399 | %11 = affine.load %alloc_22[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x512x30x30xf32> 400 | %12 = affine.load %5[%arg2, %arg5, %arg6, %arg7] : memref<512x512x3x3xf32> 401 | %13 = affine.load %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 402 | %14 = arith.mulf %11, %12 : f32 403 | %15 = arith.addf %13, %14 : f32 404 | affine.store %15, %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 405 | } 406 | } 407 | } 408 | } 409 | } 410 | } 411 | } 412 | affine.for %arg1 = 0 to 64 { 413 | affine.for %arg2 = 0 to 512 { 414 | affine.for %arg3 = 0 to 28 { 415 | affine.for %arg4 = 0 to 28 { 416 | %11 = affine.load %alloc_20[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 417 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 418 | %13 = arith.select %12, %11, %cst_0 : f32 419 | affine.store %13, %alloc_19[%arg1, %arg2, %arg3, %arg4] : memref<64x512x28x28xf32> 420 | } 421 | } 422 | } 423 | } 424 | %alloc_24 = memref.alloc() {alignment = 64 : i64} : memref<64x512x14x14xf32> 425 | affine.for %arg1 = 0 to 64 { 426 | affine.for %arg2 = 0 to 512 { 427 | affine.for %arg3 = 0 to 14 { 428 | affine.for %arg4 = 0 to 14 { 429 | affine.store %cst, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 430 | } 431 | } 432 | } 433 | } 434 | affine.for %arg1 = 0 to 64 { 435 | affine.for %arg2 = 0 to 512 { 436 | affine.for %arg3 = 0 to 14 { 437 | affine.for %arg4 = 0 to 14 { 438 | affine.for %arg5 = 0 to 2 { 439 | affine.for %arg6 = 0 to 2 { 440 | %11 = affine.load %alloc_19[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x512x28x28xf32> 441 | %12 = affine.load %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 442 | %13 = arith.maximumf %12, %11 : f32 443 | affine.store %13, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 444 | } 445 | } 446 | } 447 | } 448 | } 449 | } 450 | %alloc_25 = memref.alloc() {alignment = 64 : i64} : memref<64x512x16x16xf32> 451 | affine.for %arg1 = 0 to 64 { 452 | affine.for %arg2 = 0 to 512 { 453 | affine.for %arg3 = 0 to 16 { 454 | affine.for %arg4 = 0 to 16 { 455 | affine.store %cst_0, %alloc_25[%arg1, %arg2, %arg3, %arg4] : memref<64x512x16x16xf32> 456 | } 457 | } 458 | } 459 | } 460 | %subview_26 = memref.subview %alloc_25[0, 0, 1, 1] [64, 512, 14, 14] [1, 1, 1, 1] : memref<64x512x16x16xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>> 461 | memref.copy %alloc_24, %subview_26 : memref<64x512x14x14xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>> 462 | %alloc_27 = memref.alloc() {alignment = 64 : i64} : memref<64x512x14x14xf32> 463 | affine.for %arg1 = 0 to 64 { 464 | affine.for %arg2 = 0 to 512 { 465 | affine.for %arg3 = 0 to 14 { 466 | affine.for %arg4 = 0 to 14 { 467 | affine.store %cst_0, %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 468 | } 469 | } 470 | } 471 | } 472 | %alloc_28 = memref.alloc() {alignment = 64 : i64} : memref<64x512x14x14xf32> 473 | memref.copy %alloc_27, %alloc_28 : memref<64x512x14x14xf32> to memref<64x512x14x14xf32> 474 | affine.for %arg1 = 0 to 64 { 475 | affine.for %arg2 = 0 to 512 { 476 | affine.for %arg3 = 0 to 14 { 477 | affine.for %arg4 = 0 to 14 { 478 | affine.for %arg5 = 0 to 512 { 479 | affine.for %arg6 = 0 to 3 { 480 | affine.for %arg7 = 0 to 3 { 481 | %11 = affine.load %alloc_25[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x512x16x16xf32> 482 | %12 = affine.load %4[%arg2, %arg5, %arg6, %arg7] : memref<512x512x3x3xf32> 483 | %13 = affine.load %alloc_28[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 484 | %14 = arith.mulf %11, %12 : f32 485 | %15 = arith.addf %13, %14 : f32 486 | affine.store %15, %alloc_28[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 487 | } 488 | } 489 | } 490 | } 491 | } 492 | } 493 | } 494 | affine.for %arg1 = 0 to 64 { 495 | affine.for %arg2 = 0 to 512 { 496 | affine.for %arg3 = 0 to 14 { 497 | affine.for %arg4 = 0 to 14 { 498 | %11 = affine.load %alloc_28[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 499 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 500 | %13 = arith.select %12, %11, %cst_0 : f32 501 | affine.store %13, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 502 | } 503 | } 504 | } 505 | } 506 | %alloc_29 = memref.alloc() {alignment = 64 : i64} : memref<64x512x16x16xf32> 507 | affine.for %arg1 = 0 to 64 { 508 | affine.for %arg2 = 0 to 512 { 509 | affine.for %arg3 = 0 to 16 { 510 | affine.for %arg4 = 0 to 16 { 511 | affine.store %cst_0, %alloc_29[%arg1, %arg2, %arg3, %arg4] : memref<64x512x16x16xf32> 512 | } 513 | } 514 | } 515 | } 516 | %subview_30 = memref.subview %alloc_29[0, 0, 1, 1] [64, 512, 14, 14] [1, 1, 1, 1] : memref<64x512x16x16xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>> 517 | memref.copy %alloc_24, %subview_30 : memref<64x512x14x14xf32> to memref<64x512x14x14xf32, strided<[131072, 256, 16, 1], offset: 17>> 518 | affine.for %arg1 = 0 to 64 { 519 | affine.for %arg2 = 0 to 512 { 520 | affine.for %arg3 = 0 to 14 { 521 | affine.for %arg4 = 0 to 14 { 522 | affine.for %arg5 = 0 to 512 { 523 | affine.for %arg6 = 0 to 3 { 524 | affine.for %arg7 = 0 to 3 { 525 | %11 = affine.load %alloc_29[%arg1, %arg5, %arg3 + %arg6, %arg4 + %arg7] : memref<64x512x16x16xf32> 526 | %12 = affine.load %3[%arg2, %arg5, %arg6, %arg7] : memref<512x512x3x3xf32> 527 | %13 = affine.load %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 528 | %14 = arith.mulf %11, %12 : f32 529 | %15 = arith.addf %13, %14 : f32 530 | affine.store %15, %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 531 | } 532 | } 533 | } 534 | } 535 | } 536 | } 537 | } 538 | affine.for %arg1 = 0 to 64 { 539 | affine.for %arg2 = 0 to 512 { 540 | affine.for %arg3 = 0 to 14 { 541 | affine.for %arg4 = 0 to 14 { 542 | %11 = affine.load %alloc_27[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 543 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 544 | %13 = arith.select %12, %11, %cst_0 : f32 545 | affine.store %13, %alloc_24[%arg1, %arg2, %arg3, %arg4] : memref<64x512x14x14xf32> 546 | } 547 | } 548 | } 549 | } 550 | %alloc_31 = memref.alloc() {alignment = 64 : i64} : memref<64x512x7x7xf32> 551 | %alloc_32 = memref.alloc() {alignment = 64 : i64} : memref<64x512x7x7xf32> 552 | affine.for %arg1 = 0 to 64 { 553 | affine.for %arg2 = 0 to 512 { 554 | affine.for %arg3 = 0 to 7 { 555 | affine.for %arg4 = 0 to 7 { 556 | affine.store %cst, %alloc_32[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32> 557 | } 558 | } 559 | } 560 | } 561 | affine.for %arg1 = 0 to 64 { 562 | affine.for %arg2 = 0 to 512 { 563 | affine.for %arg3 = 0 to 7 { 564 | affine.for %arg4 = 0 to 7 { 565 | affine.for %arg5 = 0 to 2 { 566 | affine.for %arg6 = 0 to 2 { 567 | %11 = affine.load %alloc_24[%arg1, %arg2, %arg3 * 2 + %arg5, %arg4 * 2 + %arg6] : memref<64x512x14x14xf32> 568 | %12 = affine.load %alloc_32[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32> 569 | %13 = arith.maximumf %12, %11 : f32 570 | affine.store %13, %alloc_32[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32> 571 | } 572 | } 573 | } 574 | } 575 | } 576 | } 577 | affine.for %arg1 = 0 to 64 { 578 | affine.for %arg2 = 0 to 512 { 579 | affine.for %arg3 = 0 to 7 { 580 | affine.for %arg4 = 0 to 7 { 581 | affine.store %cst_0, %alloc_31[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32> 582 | } 583 | } 584 | } 585 | } 586 | affine.for %arg1 = 0 to 64 { 587 | affine.for %arg2 = 0 to 512 { 588 | affine.for %arg3 = 0 to 7 { 589 | affine.for %arg4 = 0 to 7 { 590 | affine.for %arg5 = 0 to 1 { 591 | affine.for %arg6 = 0 to 1 { 592 | %11 = affine.load %alloc_32[%arg1, %arg2, %arg3 + %arg5, %arg4 + %arg6] : memref<64x512x7x7xf32> 593 | %12 = affine.load %alloc_31[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32> 594 | %13 = arith.addf %12, %11 : f32 595 | affine.store %13, %alloc_31[%arg1, %arg2, %arg3, %arg4] : memref<64x512x7x7xf32> 596 | } 597 | } 598 | } 599 | } 600 | } 601 | } 602 | %collapse_shape = memref.collapse_shape %alloc_31 [[0], [1, 2, 3]] : memref<64x512x7x7xf32> into memref<64x25088xf32> 603 | %alloc_33 = memref.alloc() {alignment = 64 : i64} : memref<25088x4096xf32> 604 | affine.for %arg1 = 0 to 4096 { 605 | affine.for %arg2 = 0 to 25088 { 606 | %11 = affine.load %2[%arg1, %arg2] : memref<4096x25088xf32> 607 | affine.store %11, %alloc_33[%arg2, %arg1] : memref<25088x4096xf32> 608 | } 609 | } 610 | %alloc_34 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 611 | %alloc_35 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 612 | affine.for %arg1 = 0 to 64 { 613 | affine.for %arg2 = 0 to 4096 { 614 | affine.store %cst_0, %alloc_35[%arg1, %arg2] : memref<64x4096xf32> 615 | } 616 | } 617 | %alloc_36 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 618 | memref.copy %alloc_35, %alloc_36 : memref<64x4096xf32> to memref<64x4096xf32> 619 | affine.for %arg1 = 0 to 64 { 620 | affine.for %arg2 = 0 to 4096 { 621 | affine.for %arg3 = 0 to 25088 { 622 | %11 = affine.load %collapse_shape[%arg1, %arg3] : memref<64x25088xf32> 623 | %12 = affine.load %alloc_33[%arg3, %arg2] : memref<25088x4096xf32> 624 | %13 = affine.load %alloc_36[%arg1, %arg2] : memref<64x4096xf32> 625 | %14 = arith.mulf %11, %12 : f32 626 | %15 = arith.addf %13, %14 : f32 627 | affine.store %15, %alloc_36[%arg1, %arg2] : memref<64x4096xf32> 628 | } 629 | } 630 | } 631 | %alloc_37 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 632 | affine.for %arg1 = 0 to 64 { 633 | affine.for %arg2 = 0 to 4096 { 634 | %11 = affine.load %alloc_36[%arg1, %arg2] : memref<64x4096xf32> 635 | %12 = arith.addf %11, %cst_0 : f32 636 | affine.store %12, %alloc_37[%arg1, %arg2] : memref<64x4096xf32> 637 | } 638 | } 639 | affine.for %arg1 = 0 to 64 { 640 | affine.for %arg2 = 0 to 4096 { 641 | %11 = affine.load %alloc_37[%arg1, %arg2] : memref<64x4096xf32> 642 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 643 | %13 = arith.select %12, %11, %cst_0 : f32 644 | affine.store %13, %alloc_34[%arg1, %arg2] : memref<64x4096xf32> 645 | } 646 | } 647 | %alloc_38 = memref.alloc() {alignment = 64 : i64} : memref<4096x4096xf32> 648 | affine.for %arg1 = 0 to 4096 { 649 | affine.for %arg2 = 0 to 4096 { 650 | %11 = affine.load %1[%arg1, %arg2] : memref<4096x4096xf32> 651 | affine.store %11, %alloc_38[%arg2, %arg1] : memref<4096x4096xf32> 652 | } 653 | } 654 | affine.for %arg1 = 0 to 64 { 655 | affine.for %arg2 = 0 to 4096 { 656 | affine.for %arg3 = 0 to 4096 { 657 | %11 = affine.load %alloc_34[%arg1, %arg3] : memref<64x4096xf32> 658 | %12 = affine.load %alloc_38[%arg3, %arg2] : memref<4096x4096xf32> 659 | %13 = affine.load %alloc_35[%arg1, %arg2] : memref<64x4096xf32> 660 | %14 = arith.mulf %11, %12 : f32 661 | %15 = arith.addf %13, %14 : f32 662 | affine.store %15, %alloc_35[%arg1, %arg2] : memref<64x4096xf32> 663 | } 664 | } 665 | } 666 | %alloc_39 = memref.alloc() {alignment = 64 : i64} : memref<64x4096xf32> 667 | affine.for %arg1 = 0 to 64 { 668 | affine.for %arg2 = 0 to 4096 { 669 | %11 = affine.load %alloc_35[%arg1, %arg2] : memref<64x4096xf32> 670 | %12 = arith.addf %11, %cst_0 : f32 671 | affine.store %12, %alloc_39[%arg1, %arg2] : memref<64x4096xf32> 672 | } 673 | } 674 | affine.for %arg1 = 0 to 64 { 675 | affine.for %arg2 = 0 to 4096 { 676 | %11 = affine.load %alloc_39[%arg1, %arg2] : memref<64x4096xf32> 677 | %12 = arith.cmpf ugt, %11, %cst_0 : f32 678 | %13 = arith.select %12, %11, %cst_0 : f32 679 | affine.store %13, %alloc_34[%arg1, %arg2] : memref<64x4096xf32> 680 | } 681 | } 682 | %alloc_40 = memref.alloc() {alignment = 64 : i64} : memref<4096x1000xf32> 683 | affine.for %arg1 = 0 to 1000 { 684 | affine.for %arg2 = 0 to 4096 { 685 | %11 = affine.load %0[%arg1, %arg2] : memref<1000x4096xf32> 686 | affine.store %11, %alloc_40[%arg2, %arg1] : memref<4096x1000xf32> 687 | } 688 | } 689 | %alloc_41 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32> 690 | %alloc_42 = memref.alloc() {alignment = 64 : i64} : memref<64x1000xf32> 691 | affine.for %arg1 = 0 to 64 { 692 | affine.for %arg2 = 0 to 1000 { 693 | affine.store %cst_0, %alloc_42[%arg1, %arg2] : memref<64x1000xf32> 694 | } 695 | } 696 | affine.for %arg1 = 0 to 64 { 697 | affine.for %arg2 = 0 to 1000 { 698 | affine.for %arg3 = 0 to 4096 { 699 | %11 = affine.load %alloc_34[%arg1, %arg3] : memref<64x4096xf32> 700 | %12 = affine.load %alloc_40[%arg3, %arg2] : memref<4096x1000xf32> 701 | %13 = affine.load %alloc_42[%arg1, %arg2] : memref<64x1000xf32> 702 | %14 = arith.mulf %11, %12 : f32 703 | %15 = arith.addf %13, %14 : f32 704 | affine.store %15, %alloc_42[%arg1, %arg2] : memref<64x1000xf32> 705 | } 706 | } 707 | } 708 | affine.for %arg1 = 0 to 64 { 709 | affine.for %arg2 = 0 to 1000 { 710 | %11 = affine.load %alloc_42[%arg1, %arg2] : memref<64x1000xf32> 711 | %12 = arith.addf %11, %cst_0 : f32 712 | affine.store %12, %alloc_41[%arg1, %arg2] : memref<64x1000xf32> 713 | } 714 | } 715 | memref.dealloc %alloc_1 : memref<64x64x224x224xf32> 716 | memref.dealloc %alloc_2 : memref<64x64x224x224xf32> 717 | memref.dealloc %alloc_3 : memref<64x64x112x112xf32> 718 | memref.dealloc %alloc_6 : memref<64x128x112x112xf32> 719 | memref.dealloc %alloc_7 : memref<64x128x112x112xf32> 720 | memref.dealloc %alloc_8 : memref<64x128x56x56xf32> 721 | memref.dealloc %alloc_11 : memref<64x256x56x56xf32> 722 | memref.dealloc %alloc_12 : memref<64x256x56x56xf32> 723 | memref.dealloc %alloc_13 : memref<64x256x56x56xf32> 724 | memref.dealloc %alloc_16 : memref<64x256x28x28xf32> 725 | memref.dealloc %alloc_19 : memref<64x512x28x28xf32> 726 | memref.dealloc %alloc_20 : memref<64x512x28x28xf32> 727 | memref.dealloc %alloc_21 : memref<64x512x28x28xf32> 728 | memref.dealloc %alloc_24 : memref<64x512x14x14xf32> 729 | memref.dealloc %alloc_27 : memref<64x512x14x14xf32> 730 | memref.dealloc %alloc_28 : memref<64x512x14x14xf32> 731 | memref.dealloc %alloc_31 : memref<64x512x7x7xf32> 732 | memref.dealloc %alloc_32 : memref<64x512x7x7xf32> 733 | memref.dealloc %alloc_33 : memref<25088x4096xf32> 734 | memref.dealloc %alloc_34 : memref<64x4096xf32> 735 | memref.dealloc %alloc_35 : memref<64x4096xf32> 736 | memref.dealloc %alloc_36 : memref<64x4096xf32> 737 | memref.dealloc %alloc_37 : memref<64x4096xf32> 738 | memref.dealloc %alloc_38 : memref<4096x4096xf32> 739 | memref.dealloc %alloc_39 : memref<64x4096xf32> 740 | memref.dealloc %alloc_40 : memref<4096x1000xf32> 741 | memref.dealloc %alloc_41 : memref<64x1000xf32> 742 | memref.dealloc %alloc_42 : memref<64x1000xf32> 743 | memref.dealloc %alloc : memref<64x3x226x226xf32> 744 | memref.dealloc %alloc_4 : memref<64x64x114x114xf32> 745 | memref.dealloc %alloc_9 : memref<64x128x58x58xf32> 746 | memref.dealloc %alloc_14 : memref<64x256x58x58xf32> 747 | memref.dealloc %alloc_17 : memref<64x256x30x30xf32> 748 | memref.dealloc %alloc_22 : memref<64x512x30x30xf32> 749 | memref.dealloc %alloc_25 : memref<64x512x16x16xf32> 750 | memref.dealloc %alloc_29 : memref<64x512x16x16xf32> 751 | return %alloc_41 : memref<64x1000xf32> 752 | } 753 | } 754 | -------------------------------------------------------------------------------- /benchmark.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | # Function to display help menu 4 | display_help() { 5 | echo "Usage: $0 [benchmark_type] [ML_model] [opt_flag] [PAPI_event_name]" 6 | echo 7 | echo "Arguments:" 8 | echo " benchmark_type Type of benchmark to run (e.g., GB for Google Benchmark, PAPI for PAPI-based, chrono for C++ chrono)" 9 | echo " ML_model Name of the machine learning model to benchmark (e.g., Alexnet, ResNet50, etc.)" 10 | echo " opt_flag Flag to run the custom optimization pass (e.g., --affine-unroll for enabling optimization)" 11 | echo " PAPI_event_name Name of the PAPI event to monitor (e.g., PAPI_TOT_CYC for total cycles) only for PAPI-based benchmark" 12 | echo 13 | echo "Example:" 14 | echo " $0 GB Alexnet --affine-64-unroll" 15 | echo 16 | exit 0 17 | } 18 | 19 | # Check if help is requested 20 | if [[ "$1" == "--help" || "$1" == "-h" ]]; then 21 | display_help 22 | fi 23 | 24 | # Check if all three arguments are provided 25 | if [ $# -ne 3 ]; then 26 | echo "Error: Missing arguments." 27 | display_help 28 | fi 29 | 30 | # Check if 4th argument is provided for PAPI-based benchmark 31 | if [ "$1" == "PAPI" ] && [ $# -ne 4 ]; then 32 | echo "Error: Missing PAPI event name." 33 | display_help 34 | fi 35 | 36 | # Assigning arguments to variables for readability 37 | BENCHMARK_TYPE=$1 38 | ML_MODEL=$2 39 | OPT_FLAG=$3 40 | PAPI_EVENT_NAME=$4 41 | PROJECT_OPT="$(pwd)/build-ninja/tools/project-opt" 42 | MODIFIED_MLIR="$(pwd)/Torch_MLIR/modified/modified.mlir" 43 | MLIR_OBJ_PY="$(pwd)/make_MLIR_obj.py" 44 | MODIFIED_MLIR="$(pwd)/Torch_MLIR/modified/modified.mlir" 45 | MODIFIED_OBJ_FOLDER="$(pwd)/benchmarks/mlir_obj/" 46 | ORACLE_MLIR_OBJ_FOLDER="$(pwd)/Torch_MLIR/pytorch/" 47 | ML_MODEL_MLIR="${ORACLE_MLIR_OBJ_FOLDER}$(echo ${ML_MODEL}| tr '[:upper:]' '[:lower:]').mlir" 48 | BUILD_DIR="$(pwd)/build-ninja/" 49 | BENCHMARK_DIR="${BUILD_DIR}/benchmarks/" 50 | GOOGLE_BENCHMARK="${BENCHMARK_DIR}/GoogleBenchmarks/" 51 | HC_BENCHMARK="${BENCHMARK_DIR}/Hardware_Counters_or_Time/" 52 | 53 | echo "Compiling your optimized mlir to an object file mlir_oracle : ${ML_MODEL_MLIR} " 54 | echo "=========== Running your pass ${OPT_FLAG} on ${ML_MODEL_MLIR} ==================" 55 | echo " ${PROJECT_OPT} ${OPT_FLAG} ${ML_MODEL_MLIR} -o ${MODIFIED_MLIR} " 56 | $PROJECT_OPT $OPT_FLAG $ML_MODEL_MLIR -o $MODIFIED_MLIR 57 | #make your modified mlir obj 58 | echo "============= Compiling Required MLIR files ${ML_MODEL_MLIR} ===================" 59 | $MLIR_OBJ_PY $MODIFIED_MLIR $MODIFIED_OBJ_FOLDER 60 | cmake --build $BUILD_DIR --target run_bench_Time_Modified 61 | cmake --build $BUILD_DIR --target run_bench_HC_Modified 62 | cmake --build $BUILD_DIR --target run_bench_GB_Modified 63 | cmake --build $BUILD_DIR --target run_bench_GB_$ML_MODEL 64 | cmake --build $BUILD_DIR --target run_bench_Time_$ML_MODEL 65 | cmake --build $BUILD_DIR --target run_bench_HC_$ML_MODEL 66 | 67 | # Implement logic for each benchmark type 68 | case $BENCHMARK_TYPE in 69 | GB) 70 | echo "Running Google Benchmark for $ML_MODEL with flag: $OPT_FLAG" 71 | echo "========================== Orginal with O0 =====================================" 72 | $GOOGLE_BENCHMARK/run_bench_GB_$ML_MODEL --benchmark_time_unit=s 73 | echo "======================= After Transformation ===================================" 74 | $GOOGLE_BENCHMARK/run_bench_GB_Modified --benchmark_time_unit=s 75 | # Add your command to run Google Benchmark with the specified ML model and opt flag 76 | ;; 77 | PAPI) 78 | echo "Running PAPI Benchmark for $ML_MODEL with flag: $OPT_FLAG" 79 | echo "========================== Orginal with O0 =====================================" 80 | $HC_BENCHMARK/run_bench_HC_$ML_MODEL PAPI_EVENT_NAME=$PAPI_EVENT_NAME 81 | echo "======================= After Transformation ===================================" 82 | $HC_BENCHMARK/run_bench_HC_Modified PAPI_EVENT_NAME=$PAPI_EVENT_NAME 83 | # Add your command to run PAPI-based benchmark with the specified ML model and opt flag 84 | # e.g., ./run_bench_PAPI $ML_MODEL $OPT_FLAG 85 | ;; 86 | chrono) 87 | echo "Running Chrono-based Benchmark for $ML_MODEL with flag: $OPT_FLAG" 88 | echo "========================== Orginal with O0 =====================================" 89 | $HC_BENCHMARK/run_bench_Time_$ML_MODEL 90 | echo "======================= After Transformation ===================================" 91 | $HC_BENCHMARK/run_bench_Time_Modified 92 | # Add your command to run chrono-based benchmark with the specified ML model and opt flag 93 | # e.g., ./run_bench_chrono $ML_MODEL $OPT_FLAG 94 | ;; 95 | *) 96 | echo "Error: Invalid benchmark type." 97 | display_help 98 | ;; 99 | esac 100 | -------------------------------------------------------------------------------- /benchmarks/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(GoogleBenchmarks) 2 | add_subdirectory(Hardware_Counters_or_Time) 3 | -------------------------------------------------------------------------------- /benchmarks/GoogleBenchmarks/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(SUPPORTED_MODELS "Alexnet" "Resnet" "VGG" "Modified") 2 | 3 | set(ML_MODEL "" CACHE STRING "Choose the ml model to link with run_bench_GB") 4 | 5 | if(ML_MODEL STREQUAL "") 6 | set(ML_MODEL "Modified" ) 7 | message(STATUS "No model specified. Defaulting to: MODIFIED") 8 | endif() 9 | 10 | if(NOT ML_MODEL IN_LIST SUPPORTED_MODELS) 11 | message(FATAL_ERROR "Invalid model: ${ML_MODEL}. Supported models are: ${SUPPORTED_MODELS}") 12 | endif() 13 | 14 | # add_executable (run_bench_GB "run_bench.cpp") 15 | add_library(libmlir_c_runner_utils SHARED IMPORTED) 16 | set_target_properties(libmlir_c_runner_utils PROPERTIES IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/External/llvm-project/build/lib/libmlir_c_runner_utils.so") 17 | 18 | 19 | add_executable (run_bench_GB_Alexnet "run_bench.cpp") 20 | target_compile_options(run_bench_GB_Alexnet PRIVATE -O0) 21 | target_sources(run_bench_GB_Alexnet PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o") 22 | target_link_libraries(run_bench_GB_Alexnet benchmark::benchmark libmlir_c_runner_utils) 23 | 24 | add_executable (run_bench_GB_Resnet152 "run_bench.cpp") 25 | target_compile_options(run_bench_GB_Resnet152 PRIVATE -O0) 26 | target_sources(run_bench_GB_Resnet152 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o") 27 | target_link_libraries(run_bench_GB_Resnet152 benchmark::benchmark libmlir_c_runner_utils) 28 | 29 | add_executable (run_bench_GB_Vgg11 "run_bench.cpp") 30 | target_compile_options(run_bench_GB_Vgg11 PRIVATE -O0) 31 | target_sources(run_bench_GB_Vgg11 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o") 32 | target_link_libraries(run_bench_GB_Vgg11 benchmark::benchmark libmlir_c_runner_utils) 33 | 34 | add_executable (run_bench_GB_Modified "run_bench.cpp") 35 | target_compile_options(run_bench_GB_Modified PRIVATE -O0) 36 | target_sources(run_bench_GB_Modified PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o") 37 | target_link_libraries(run_bench_GB_Modified benchmark::benchmark libmlir_c_runner_utils) 38 | 39 | # set_target_properties(run_bench_GB PROPERTIES INTERPROCEDURAL_OPTIMIZATION FALSE) 40 | 41 | message(STATUS "Compiling done for Google Benchmarks") -------------------------------------------------------------------------------- /benchmarks/GoogleBenchmarks/run_bench.cpp: -------------------------------------------------------------------------------- 1 | // run_bench.cpp : Defines the entry point for the application. 2 | // 3 | 4 | #include 5 | extern "C" void forward(); 6 | static void BM_forward(benchmark::State &state) { 7 | for (auto _ : state) { 8 | forward(); 9 | } 10 | } 11 | BENCHMARK(BM_forward); 12 | BENCHMARK_MAIN(); -------------------------------------------------------------------------------- /benchmarks/Hardware_Counters_or_Time/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | set(SUPPORTED_MODELS "Alexnet" "Resnet" "VGG" "Modified") 2 | set(SUPPORTED_INST_TYPE "Time" "PAPI") 3 | 4 | set(ML_MODEL "" CACHE STRING "Choose the ml model to link with run_bench_HC") 5 | 6 | if(ML_MODEL STREQUAL "") 7 | set(ML_MODEL "Modified") 8 | message(STATUS "No model specified. Defaulting to: MODIFIED") 9 | endif() 10 | 11 | if(NOT ML_MODEL IN_LIST SUPPORTED_MODELS) 12 | message(FATAL_ERROR "Invalid model: ${ML_MODEL}. Supported models are: ${SUPPORTED_MODELS}") 13 | endif() 14 | 15 | set(INST_TYPE "" CACHE STRING "Choose choose instrumentation type for run_bench_HC") 16 | if(INST_TYPE STREQUAL "") 17 | set(INST_TYPE "Time") 18 | message(STATUS "No model specified. Defaulting to: Time") 19 | endif() 20 | 21 | if(NOT INST_TYPE IN_LIST SUPPORTED_INST_TYPE) 22 | message(FATAL_ERROR "Invalid model: ${INST_TYPE}. Supported models are: ${SUPPORTED_INST_TYPE}") 23 | endif() 24 | 25 | add_library(libmlir_c_runner_utils SHARED IMPORTED) 26 | set_target_properties(libmlir_c_runner_utils PROPERTIES IMPORTED_LOCATION "${CMAKE_SOURCE_DIR}/External/llvm-project/build/lib/libmlir_c_runner_utils.so") 27 | 28 | # PAPI instrumentation 29 | add_executable (run_bench_HC_Alexnet "run_bench.cpp") 30 | target_compile_definitions(run_bench_HC_Alexnet PRIVATE -DPAPI_INST__ -DHUMAN_READABLE) 31 | target_sources(run_bench_HC_Alexnet PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o") 32 | target_link_libraries(run_bench_HC_Alexnet libmlir_c_runner_utils papi) 33 | 34 | add_executable (run_bench_HC_Resnet152 "run_bench.cpp") 35 | target_compile_definitions(run_bench_HC_Resnet152 PRIVATE -DPAPI_INST__ -DHUMAN_READABLE) 36 | target_sources(run_bench_HC_Resnet152 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o") 37 | target_link_libraries(run_bench_HC_Resnet152 libmlir_c_runner_utils papi) 38 | 39 | add_executable (run_bench_HC_Vgg11 "run_bench.cpp") 40 | target_compile_definitions(run_bench_HC_Vgg11 PRIVATE -DPAPI_INST__ -DHUMAN_READABLE) 41 | target_sources(run_bench_HC_Vgg11 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o") 42 | target_link_libraries(run_bench_HC_Vgg11 libmlir_c_runner_utils papi) 43 | 44 | add_executable (run_bench_HC_Modified "run_bench.cpp") 45 | target_compile_definitions(run_bench_HC_Modified PRIVATE -DPAPI_INST__ -DHUMAN_READABLE) 46 | target_sources(run_bench_HC_Modified PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o") 47 | target_link_libraries(run_bench_HC_Modified libmlir_c_runner_utils papi) 48 | 49 | # Time instrumentation 50 | add_executable (run_bench_Time_Alexnet "run_bench.cpp") 51 | target_compile_definitions(run_bench_Time_Alexnet PRIVATE -DTIME_INST__ -DHUMAN_READABLE) 52 | target_sources(run_bench_Time_Alexnet PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/alexnet.o") 53 | target_link_libraries(run_bench_Time_Alexnet libmlir_c_runner_utils) 54 | 55 | add_executable (run_bench_Time_Resnet152 "run_bench.cpp") 56 | target_compile_definitions(run_bench_Time_Resnet152 PRIVATE -DTIME_INST__ -DHUMAN_READABLE) 57 | target_sources(run_bench_Time_Resnet152 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/resnet152.o") 58 | target_link_libraries(run_bench_Time_Resnet152 libmlir_c_runner_utils) 59 | 60 | add_executable (run_bench_Time_Vgg11 "run_bench.cpp") 61 | target_compile_definitions(run_bench_Time_Vgg11 PRIVATE -DTIME_INST__ -DHUMAN_READABLE) 62 | target_sources(run_bench_Time_Vgg11 PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/vgg11.o") 63 | target_link_libraries(run_bench_Time_Vgg11 libmlir_c_runner_utils) 64 | 65 | add_executable (run_bench_Time_Modified "run_bench.cpp") 66 | target_compile_definitions(run_bench_Time_Modified PRIVATE -DTIME_INST__ -DHUMAN_READABLE) 67 | target_sources(run_bench_Time_Modified PRIVATE "${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/modified.o") 68 | target_link_libraries(run_bench_Time_Modified libmlir_c_runner_utils) 69 | 70 | 71 | message(STATUS "Compiled run_bench_HC_* and run_bench_Tsime_* ") -------------------------------------------------------------------------------- /benchmarks/Hardware_Counters_or_Time/run_bench.cpp: -------------------------------------------------------------------------------- 1 | #include "run_bench.h" 2 | #include 3 | #include 4 | #include 5 | 6 | extern "C" void forward(); 7 | 8 | int main() { 9 | start_instrumentaion; 10 | forward(); 11 | stop_instrumentation; 12 | print_instruments; 13 | return 0; 14 | } -------------------------------------------------------------------------------- /benchmarks/Hardware_Counters_or_Time/run_bench.h: -------------------------------------------------------------------------------- 1 | #ifndef PAPI_TIME_INST 2 | #define PAPI_TIME_INST 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #define start_instrumentaion 12 | #define stop_instrumentation 13 | #define print_instruments 14 | 15 | #ifdef PAPI_INST__ 16 | #include 17 | 18 | long long int papi_event_value = 0; 19 | int eventset = 0; 20 | const char *env_var_name = "PAPI_EVENT_NAME"; 21 | const char *papi_event_name = getenv(env_var_name); 22 | int retval = 0; 23 | 24 | void papi_init() { 25 | retval = PAPI_library_init(PAPI_VER_CURRENT); 26 | if (retval != PAPI_VER_CURRENT) { 27 | std::cerr << "Error initializing PAPI! " << PAPI_strerror(retval) 28 | << std::endl; 29 | exit(1); 30 | } 31 | } 32 | 33 | void create_event_set() { 34 | eventset = PAPI_NULL; 35 | // papi creating event set 36 | retval = PAPI_create_eventset(&eventset); 37 | if (retval != PAPI_OK) { 38 | std::cerr << "Error creating eventset! " << PAPI_strerror(retval) 39 | << std::endl; 40 | } 41 | // papi adding event set 42 | retval = PAPI_add_named_event(eventset, papi_event_name); 43 | if (retval != PAPI_OK) { 44 | std::cerr << "Error adding " << papi_event_name << ": " 45 | << PAPI_strerror(retval) << std::endl; 46 | } 47 | } 48 | 49 | void papi_start() { 50 | PAPI_reset(eventset); 51 | retval = PAPI_start(eventset); 52 | if (retval != PAPI_OK) { 53 | std::cerr << "Error PAPI: " << PAPI_strerror(retval) << std::endl; 54 | } 55 | } 56 | 57 | void papi_stop() { 58 | retval = PAPI_stop(eventset, &papi_event_value); 59 | if (retval != PAPI_OK) { 60 | std::cerr << "Error stopping: " << PAPI_strerror(retval) << std::endl; 61 | } 62 | } 63 | 64 | void print_papi() { 65 | #ifdef HUMAN_READABLE 66 | fprintf(stderr, "Measured %s event %lld times\n", papi_event_name, 67 | papi_event_value); 68 | #endif 69 | fprintf(stdout, "%lld", papi_event_value); 70 | } 71 | 72 | #undef start_instrumentaion 73 | #undef stop_instrumentation 74 | #undef print_instruments 75 | 76 | #define start_instrumentaion \ 77 | papi_init(); \ 78 | create_event_set(); \ 79 | papi_start(); 80 | 81 | #define stop_instrumentation papi_stop(); 82 | 83 | #define print_instruments print_papi(); 84 | 85 | #endif 86 | 87 | #ifdef TIME_INST__ 88 | 89 | double time_reading; 90 | double time_reading_ns; 91 | std::chrono::time_point start_time_counter; 92 | std::chrono::time_point end_time_counter; 93 | 94 | void start_time() { 95 | start_time_counter = std::chrono::high_resolution_clock::now(); 96 | } 97 | 98 | void end_time() { 99 | end_time_counter = std::chrono::high_resolution_clock::now(); 100 | } 101 | 102 | void print_time() { 103 | std::chrono::duration timing = end_time_counter - start_time_counter; 104 | time_reading_ns = 105 | std::chrono::duration_cast(timing).count(); 106 | time_reading = time_reading_ns / 1000000000; 107 | #ifdef HUMAN_READABLE 108 | std::cerr << "Measured Time : " << time_reading << " seconds" << std::endl; 109 | #endif 110 | std::cerr << time_reading << std::endl; 111 | } 112 | 113 | #undef start_instrumentaion 114 | #undef stop_instrumentation 115 | #undef print_instruments 116 | 117 | #define start_instrumentaion start_time(); 118 | 119 | #define stop_instrumentation end_time(); 120 | 121 | #define print_instruments print_time(); 122 | 123 | #endif 124 | 125 | #endif /* PAPI_TIME_INST */ -------------------------------------------------------------------------------- /build_llvm.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | BUILD_SYSTEM=Ninja 4 | BUILD_TAG=ninja 5 | THIRDPARTY_LLVM_DIR=$PWD/External/llvm-project 6 | BUILD_DIR=$THIRDPARTY_LLVM_DIR/build 7 | INSTALL_DIR=$THIRDPARTY_LLVM_DIR/install 8 | 9 | # rm -rf $BUILD_DIR 10 | mkdir -p $BUILD_DIR 11 | mkdir -p $INSTALL_DIR 12 | 13 | pushd $BUILD_DIR 14 | 15 | cmake ../llvm -G $BUILD_SYSTEM \ 16 | -DCMAKE_CXX_COMPILER=clang++ \ 17 | -DCMAKE_C_COMPILER=clang \ 18 | -DCMAKE_INSTALL_PREFIX=$INSTALL_DIR \ 19 | -DLLVM_LOCAL_RPATH=$INSTALL_DIR/lib \ 20 | -DLLVM_PARALLEL_COMPILE_JOBS=12 \ 21 | -DLLVM_PARALLEL_LINK_JOBS=6 \ 22 | -DLLVM_BUILD_EXAMPLES=OFF \ 23 | -DLLVM_INSTALL_UTILS=ON \ 24 | -DCMAKE_OSX_ARCHITECTURES="$(uname -m)" \ 25 | -DCMAKE_BUILD_TYPE=Release \ 26 | -DLLVM_ENABLE_ASSERTIONS=ON \ 27 | -DLLVM_CCACHE_BUILD=ON \ 28 | -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ 29 | -DLLVM_ENABLE_PROJECTS='mlir' \ 30 | 31 | 32 | cmake --build . --target check-mlir 33 | cmake --build . --target mlir-libraries 34 | cmake --build . --target llc 35 | 36 | popd -------------------------------------------------------------------------------- /build_mlir.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | 3 | BUILD_SYSTEM="Ninja" 4 | BUILD_DIR=./build-`echo ${BUILD_SYSTEM}| tr '[:upper:]' '[:lower:]'` 5 | echo "build dir ${BUILD_DIR}" 6 | rm -rf $BUILD_DIR 7 | mkdir $BUILD_DIR 8 | pushd $BUILD_DIR 9 | 10 | LLVM_BUILD_DIR=External/llvm-project/build 11 | cmake -G $BUILD_SYSTEM .. \ 12 | -DLLVM_DIR="$LLVM_BUILD_DIR/lib/cmake/llvm" \ 13 | -DMLIR_DIR="$LLVM_BUILD_DIR/lib/cmake/mlir" \ 14 | -DBUILD_DEPS="ON" \ 15 | -DBUILD_SHARED_LIBS="OFF" \ 16 | -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ 17 | -DCMAKE_BUILD_TYPE=Debug 18 | 19 | popd 20 | 21 | cmake --build $BUILD_DIR --target project-opt 22 | -------------------------------------------------------------------------------- /build_obj.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env bash 2 | if [ $# -ne 1 ]; then 3 | echo "Error: Missing arguments." 4 | echo "Usage: $0 [CMAKE_SOURCE_DIR]" 5 | exit 1 6 | fi 7 | CMAKE_SOURCE_DIR=$1 8 | cd $CMAKE_SOURCE_DIR 9 | python3 ${CMAKE_SOURCE_DIR}/make_MLIR_obj.py ${CMAKE_SOURCE_DIR}/Torch_MLIR/pytorch/ ${CMAKE_SOURCE_DIR}/benchmarks/mlir_obj/ &> /dev/null 10 | echo "Compiled the oracle MLIR files to object files" 11 | -------------------------------------------------------------------------------- /example_output.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mvvsmk/OptML/32b88924bfd333a235c718495935883b2a638094/example_output.png -------------------------------------------------------------------------------- /include/Transform/Affine/Affine64Unroll.h: -------------------------------------------------------------------------------- 1 | #ifndef LIB_TRANSFORM_AFFINE_AFFINEFULLUNROLL_H_ 2 | #define LIB_TRANSFORM_AFFINE_AFFINEFULLUNROLL_H_ 3 | 4 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 5 | #include "mlir/Dialect/Func/IR/FuncOps.h" 6 | #include "mlir/include/mlir/Pass/Pass.h" 7 | 8 | namespace mlir { 9 | namespace project { 10 | 11 | class Affine64UnrollPass 12 | : public PassWrapper> { 14 | private: 15 | void runOnOperation() override; 16 | 17 | StringRef getArgument() const final { return "affine-64-unroll"; } 18 | 19 | StringRef getDescription() const final { 20 | return "Unroll loop if it is at a loop depth of 3 or more with a factor of " 21 | "64"; 22 | } 23 | }; 24 | 25 | } // namespace project 26 | } // namespace mlir 27 | 28 | #endif // LIB_TRANSFORM_AFFINE_AFFINEFULLUNROLL_H_ -------------------------------------------------------------------------------- /include/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.h: -------------------------------------------------------------------------------- 1 | #ifndef LIB_TRANSFORM_MAKE_RUN_ABLE_RFFAAR_H_ 2 | #define LIB_TRANSFORM_MAKE_RUN_ABLE_RFFAAR_H_ 3 | 4 | #include "include/Transform/MakeRunAble/RemoveGlobalConstants.h" 5 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 6 | #include "mlir/Dialect/Func/IR/FuncOps.h" 7 | #include "mlir/Dialect/MemRef/IR/MemRef.h" 8 | #include "mlir/IR/BuiltinOps.h" 9 | #include "mlir/include/mlir/Pass/Pass.h" 10 | 11 | namespace mlir { 12 | namespace project { 13 | 14 | class RemoveForwardFuncArgsAndReturn 15 | : public PassWrapper> { 17 | private: 18 | void runOnOperation() override; 19 | 20 | StringRef getArgument() const final { 21 | return "rem-forward-func-args-and-return-run-mlir"; 22 | } 23 | 24 | StringRef getDescription() const final { 25 | return "Removes forward function's arguments and return type so that both " 26 | "of them are void"; 27 | } 28 | }; 29 | 30 | } // namespace project 31 | } // namespace mlir 32 | 33 | #endif // LIB_TRANSFORM_MAKE_RUN_ABLE_RFFAAR_H_ -------------------------------------------------------------------------------- /include/Transform/MakeRunAble/RemoveGlobalConstants.h: -------------------------------------------------------------------------------- 1 | #ifndef LIB_TRANSFORM_MAKE_RUN_ABLE_RGC_H_ 2 | #define LIB_TRANSFORM_MAKE_RUN_ABLE_RGC_H_ 3 | 4 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 5 | #include "mlir/Dialect/Func/IR/FuncOps.h" 6 | #include "mlir/Dialect/MemRef/IR/MemRef.h" 7 | #include "mlir/IR/BuiltinOps.h" 8 | #include "mlir/include/mlir/Pass/Pass.h" 9 | 10 | namespace mlir { 11 | namespace project { 12 | 13 | class RemoveGlobalConstants 14 | : public PassWrapper> { 15 | private: 16 | void runOnOperation() override; 17 | 18 | StringRef getArgument() const final { 19 | return "rem-global-constants-run-mlir"; 20 | } 21 | 22 | StringRef getDescription() const final { 23 | return "Removes global constants and moves them to the forward function"; 24 | } 25 | }; 26 | 27 | } // namespace project 28 | } // namespace mlir 29 | 30 | #endif // LIB_TRANSFORM_MAKE_RUN_ABLE_RGC_H_ -------------------------------------------------------------------------------- /include/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.h: -------------------------------------------------------------------------------- 1 | #ifndef LIB_TRANSFORM_MAKE_RUN_ABLE_ZIRFFAAR_H_ 2 | #define LIB_TRANSFORM_MAKE_RUN_ABLE_ZIRFFAAR_H_ 3 | 4 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 5 | #include "mlir/Dialect/Func/IR/FuncOps.h" 6 | #include "mlir/Dialect/MemRef/IR/MemRef.h" 7 | #include "mlir/IR/BuiltinOps.h" 8 | #include "mlir/include/mlir/Pass/Pass.h" 9 | 10 | namespace mlir { 11 | namespace project { 12 | 13 | class ZeroInitRemoveForwardFuncArgsAndReturn 14 | : public PassWrapper> { 16 | private: 17 | void runOnOperation() override; 18 | 19 | StringRef getArgument() const final { 20 | return "rem-forward-func-args-and-return-run-mlir-zero-init"; 21 | } 22 | 23 | StringRef getDescription() const final { 24 | return "Removes forward function's arguments and return type so that both " 25 | "of them are void"; 26 | } 27 | }; 28 | 29 | } // namespace project 30 | } // namespace mlir 31 | 32 | #endif // LIB_TRANSFORM_MAKE_RUN_ABLE_ZIRFFAAR_H_ -------------------------------------------------------------------------------- /lib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # add_subdirectory(Conversion) 2 | # add_subdirectory(Dialect) 3 | # add_subdirectory(Analysis) 4 | add_subdirectory(Transform) 5 | -------------------------------------------------------------------------------- /lib/Transform/Affine/Affine64Unroll.cpp: -------------------------------------------------------------------------------- 1 | #include "include/Transform/Affine/Affine64Unroll.h" 2 | #include "mlir/Dialect/Affine/Analysis/Utils.h" 3 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 4 | #include "mlir/Dialect/Affine/LoopFusionUtils.h" 5 | #include "mlir/Dialect/Affine/LoopUtils.h" 6 | #include "mlir/IR/PatternMatch.h" 7 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 8 | #include "llvm/Support/LogicalResult.h" 9 | 10 | namespace mlir { 11 | namespace project { 12 | 13 | void Affine64UnrollPass::runOnOperation() { 14 | getOperation()->walk([&](affine::AffineForOp op) { 15 | affine::LoopNestStats *stats; 16 | if (affine::getNestingDepth(op) > 2) 17 | if (llvm::failed(affine::loopUnrollUpToFactor(op, 64))) { 18 | op->emitError("Unrooling failed"); 19 | signalPassFailure(); 20 | } 21 | }); 22 | } 23 | 24 | } // namespace project 25 | } // namespace mlir -------------------------------------------------------------------------------- /lib/Transform/Affine/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(Affine64Unroll 2 | Affine64Unroll.cpp 3 | # AffineFullUnrollPatternRewrite.cpp 4 | 5 | ${PROJECT_SOURCE_DIR}/include/Transform/Affine/ 6 | ADDITIONAL_HEADER_DIRS 7 | 8 | LINK_LIBS PUBLIC 9 | ) 10 | 11 | # set(LLVM_TARGET_DEFINITIONS Passes.td) 12 | # mlir_tablegen(Passes.h.inc -gen-pass-decls -name Affine) 13 | # add_public_tablegen_target(MLIRAffineFullUnrollPasses) 14 | # add_mlir_doc(Passes AffinePasses ./ -gen-pass-doc) 15 | -------------------------------------------------------------------------------- /lib/Transform/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_subdirectory(Affine) 2 | add_subdirectory(MakeRunAble) 3 | # add_subdirectory(Arith) 4 | # add_subdirectory(Noisy) 5 | -------------------------------------------------------------------------------- /lib/Transform/MakeRunAble/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_mlir_library(MakeRunAble 2 | RemoveGlobalConstants.cpp 3 | ZeroInitRemoveForwardFuncArgsAndReturn.cpp 4 | RemoveForwardFuncArgsAndReturn.cpp 5 | ${PROJECT_SOURCE_DIR}/include/Transform/MakeRunAble/ 6 | ADDITIONAL_HEADER_DIRS 7 | 8 | LINK_LIBS PUBLIC 9 | ) 10 | -------------------------------------------------------------------------------- /lib/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.cpp: -------------------------------------------------------------------------------- 1 | #include "include/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.h" 2 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 3 | #include "mlir/Dialect/Affine/LoopUtils.h" 4 | #include "mlir/Dialect/Func/IR/FuncOps.h" 5 | #include "mlir/Dialect/MemRef/IR/MemRef.h" 6 | #include "mlir/IR/Attributes.h" 7 | #include "mlir/IR/Builders.h" 8 | #include "mlir/IR/BuiltinAttributes.h" 9 | #include "mlir/IR/BuiltinOps.h" 10 | #include "mlir/IR/BuiltinTypes.h" 11 | #include "mlir/IR/MLIRContext.h" 12 | #include "mlir/IR/PatternMatch.h" 13 | #include "mlir/IR/Value.h" 14 | #include "mlir/Support/LLVM.h" 15 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 16 | #include "llvm/ADT/SmallVector.h" 17 | #include "llvm/ADT/StringRef.h" 18 | #include "llvm/Support/Error.h" 19 | #include "llvm/Support/LogicalResult.h" 20 | #include "llvm/Support/raw_ostream.h" 21 | #include "llvm/Transforms/IPO/Attributor.h" 22 | 23 | namespace mlir { 24 | namespace project { 25 | 26 | void RemoveForwardFuncArgsAndReturn::runOnOperation() { 27 | func::FuncOp func = getOperation(); 28 | mlir::MLIRContext *ctx = func->getContext(); 29 | mlir::OpBuilder builder(ctx); 30 | auto fuctionblock = &func.getBody().front(); 31 | builder.setInsertionPointToStart(fuctionblock); 32 | // llvm::errs() << "\n" << func->getName() << "\n"; 33 | mlir::SmallVector argsvector; 34 | auto args = func.getArguments(); 35 | 36 | mlir::DenseMap argMap; 37 | for (auto arg : args) { 38 | auto memrefType = mlir::cast(arg.getType()); 39 | // llvm::errs() << "-> " << arg.getArgNumber() << " moved \n"; 40 | auto allocOp = builder.create(arg.getLoc(), memrefType); 41 | argMap[arg.getArgNumber()] = allocOp; 42 | arg.replaceAllUsesWith(allocOp); 43 | fuctionblock->eraseArgument(arg.getArgNumber()); 44 | } 45 | auto newFuncType = FunctionType::get(ctx, {}, {}); 46 | func.setType(newFuncType); 47 | func.walk([&](func::ReturnOp ret) { 48 | OpBuilder retbuilder(ret); 49 | retbuilder.create(ret->getLoc()); 50 | ret->erase(); 51 | }); 52 | 53 | return; 54 | } 55 | 56 | } // namespace project 57 | } // namespace mlir -------------------------------------------------------------------------------- /lib/Transform/MakeRunAble/RemoveGlobalConstants.cpp: -------------------------------------------------------------------------------- 1 | #include "include/Transform/MakeRunAble/RemoveGlobalConstants.h" 2 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 3 | #include "mlir/Dialect/Affine/LoopUtils.h" 4 | #include "mlir/Dialect/Func/IR/FuncOps.h" 5 | #include "mlir/Dialect/MemRef/IR/MemRef.h" 6 | #include "mlir/IR/Attributes.h" 7 | #include "mlir/IR/Builders.h" 8 | #include "mlir/IR/BuiltinAttributes.h" 9 | #include "mlir/IR/BuiltinOps.h" 10 | #include "mlir/IR/BuiltinTypes.h" 11 | #include "mlir/IR/MLIRContext.h" 12 | #include "mlir/IR/PatternMatch.h" 13 | #include "mlir/IR/Value.h" 14 | #include "mlir/Support/LLVM.h" 15 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 16 | #include "llvm/ADT/StringRef.h" 17 | #include "llvm/Support/Error.h" 18 | #include "llvm/Support/LogicalResult.h" 19 | #include "llvm/Support/raw_ostream.h" 20 | 21 | namespace mlir { 22 | namespace project { 23 | 24 | void RemoveGlobalConstants::runOnOperation() { 25 | ModuleOp module = getOperation(); 26 | mlir::SmallVector global_constants; 27 | mlir::MLIRContext *ctx = module->getContext(); 28 | mlir::OpBuilder builder(ctx); 29 | module->walk([&](mlir::memref::GlobalOp globalop) { 30 | global_constants.push_back(globalop); 31 | }); 32 | if (global_constants.empty()) { 33 | return; 34 | } 35 | func::FuncOp forwardFunctionDef = 36 | module.lookupSymbol("forward"); 37 | if (forwardFunctionDef) { 38 | auto entryblock = &forwardFunctionDef.getBody().front(); 39 | builder.setInsertionPointToStart(entryblock); 40 | } 41 | mlir::DenseMap globalMap; 42 | 43 | for (auto global : global_constants) { 44 | auto memrefType = mlir::cast(global.getType()); 45 | // llvm::errs() << "->" << global.getSymName() << "\n"; 46 | auto allocOp = 47 | builder.create(global->getLoc(), memrefType); 48 | globalMap[global.getSymName()] = allocOp; 49 | global->erase(); 50 | } 51 | 52 | module->walk([&](memref::GetGlobalOp getGlobalOp) { 53 | auto globalName = getGlobalOp.getName(); 54 | if (globalMap.count(globalName)) { 55 | getGlobalOp.replaceAllUsesWith(globalMap[globalName]); 56 | getGlobalOp.erase(); 57 | } 58 | }); 59 | // llvm::errs() << " Replacements done " 60 | // << "\n"; 61 | return; 62 | } 63 | 64 | } // namespace project 65 | } // namespace mlir -------------------------------------------------------------------------------- /lib/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.cpp: -------------------------------------------------------------------------------- 1 | #include "include/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.h" 2 | #include "mlir/Dialect/Affine/IR/AffineOps.h" 3 | #include "mlir/Dialect/Affine/LoopUtils.h" 4 | #include "mlir/Dialect/Arith/IR/Arith.h" 5 | #include "mlir/Dialect/Func/IR/FuncOps.h" 6 | #include "mlir/Dialect/MemRef/IR/MemRef.h" 7 | #include "mlir/IR/Attributes.h" 8 | #include "mlir/IR/Builders.h" 9 | #include "mlir/IR/BuiltinAttributes.h" 10 | #include "mlir/IR/BuiltinOps.h" 11 | #include "mlir/IR/BuiltinTypes.h" 12 | #include "mlir/IR/Location.h" 13 | #include "mlir/IR/MLIRContext.h" 14 | #include "mlir/IR/PatternMatch.h" 15 | #include "mlir/IR/Value.h" 16 | #include "mlir/IR/ValueRange.h" 17 | #include "mlir/Support/LLVM.h" 18 | #include "mlir/Transforms/GreedyPatternRewriteDriver.h" 19 | #include "llvm/ADT/ArrayRef.h" 20 | #include "llvm/ADT/SmallVector.h" 21 | #include "llvm/ADT/StringRef.h" 22 | #include "llvm/Support/Error.h" 23 | #include "llvm/Support/LogicalResult.h" 24 | #include "llvm/Support/raw_ostream.h" 25 | #include "llvm/Transforms/IPO/Attributor.h" 26 | #include 27 | 28 | namespace mlir { 29 | namespace project { 30 | 31 | void ZeroInitRemoveForwardFuncArgsAndReturn::runOnOperation() { 32 | func::FuncOp func = getOperation(); 33 | mlir::MLIRContext *ctx = func->getContext(); 34 | mlir::OpBuilder builder(ctx); 35 | auto functionBlock = &func.getBody().front(); 36 | builder.setInsertionPointToStart(functionBlock); 37 | 38 | auto args = func.getArguments(); 39 | mlir::DenseMap argMap; 40 | 41 | // Helper to create a constant index value 42 | auto createConstantIndex = [&](int64_t val, mlir::Location argloc) -> Value { 43 | return builder.create(argloc, val); 44 | }; 45 | 46 | for (auto arg : args) { 47 | auto memrefType = mlir::cast(arg.getType()); 48 | 49 | // Allocate memory 50 | auto allocOp = builder.create(arg.getLoc(), memrefType); 51 | argMap[arg.getArgNumber()] = allocOp; 52 | 53 | // Get shape of memref to iterate over 54 | auto shape = memrefType.getShape(); 55 | unsigned rank = memrefType.getRank(); 56 | Value zeroVal = builder.create( 57 | arg.getLoc(), builder.getZeroAttr(memrefType.getElementType())); 58 | auto loc = allocOp->getLoc(); 59 | 60 | SmallVector lowerBounds(memrefType.getRank(), /*Value=*/0); 61 | SmallVector steps(memrefType.getRank(), /*Value=*/1); 62 | affine::buildAffineLoopNest( 63 | builder, loc, lowerBounds, memrefType.getShape(), steps, 64 | [&](OpBuilder &nestedBuilder, Location loc, ValueRange ivs) { 65 | nestedBuilder.create(loc, zeroVal, allocOp, 66 | ivs); 67 | }); 68 | 69 | // Replace the original argument with the allocated memory 70 | arg.replaceAllUsesWith(allocOp); 71 | functionBlock->eraseArgument(arg.getArgNumber()); 72 | } 73 | 74 | // Update the function signature to have no arguments and no return type 75 | auto newFuncType = FunctionType::get(ctx, {}, {}); 76 | func.setType(newFuncType); 77 | 78 | // Replace all return operations with empty returns 79 | func.walk([&](func::ReturnOp ret) { 80 | OpBuilder retbuilder(ret); 81 | retbuilder.create(ret->getLoc()); 82 | ret->erase(); 83 | }); 84 | } 85 | 86 | } // namespace project 87 | } // namespace mlir -------------------------------------------------------------------------------- /make_MLIR_obj.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | import argparse 3 | import os 4 | import subprocess 5 | import shutil 6 | import re 7 | from concurrent.futures import ThreadPoolExecutor 8 | 9 | def convert_to_ll(source_file, 10 | output_file, 11 | project_opt_flags = '--rem-forward-func-args-and-return-run-mlir --rem-global-constants-run-mlir', 12 | mlir_opt_path = os.path.abspath("./External/llvm-project/build/bin/mlir-opt"), 13 | project_opt_path = os.path.abspath("./build-ninja/tools/project-opt"), 14 | mlir_translate_path = os.path.abspath("./External/llvm-project/build/bin/mlir-translate")): 15 | 16 | # Run the mlir-opt command to convert the input file to LLVM IR 17 | 18 | # project_opt_flags = '--rem-forward-func-args-and-return-run-mlir --rem-global-constants-run-mlir' 19 | """ 20 | ./mlir-opt --lower-affine --expand-strided-metadata --convert-scf-to-cf --convert-cf-to-llvm --llvm-request-c-wrappers --convert-func-to-llvm --normalize-memrefs --memref-expand --finalize-memref-to-llvm --reconcile-unrealized-casts --llvm-legalize-for-export ../../squeezenet1_0.mlir | /home/intern24005/code/compiler_builds/mlir_test_build/bin/mlir-translate --mlir-to-llvmir > squeezenet1_0.ll 21 | """ 22 | mlir_flags = '--lower-affine --expand-strided-metadata --convert-scf-to-cf --convert-math-to-llvm --convert-cf-to-llvm --llvm-request-c-wrappers --convert-func-to-llvm --normalize-memrefs --memref-expand --finalize-memref-to-llvm --reconcile-unrealized-casts --llvm-legalize-for-export' 23 | command = f"{project_opt_path} {project_opt_flags} {source_file} | {mlir_opt_path} {mlir_flags} | {mlir_translate_path} --mlir-to-llvmir > {output_file}" 24 | print(command) 25 | try: 26 | subprocess.run(command, shell=True, check=True) 27 | except subprocess.CalledProcessError as e: 28 | print(f"Erorr: {e} while running command: {command}") 29 | return output_file 30 | 31 | def convert_to_obj(source_file, 32 | output_file, 33 | llc_path = os.path.abspath("./External/llvm-project/build/bin/llc")): 34 | # Run the llc command to convert the input file to an object file 35 | """ 36 | ./llc -filetype=obj -march=x86_64-linux-gnu squeezenet1_0.ll -o squeezenet1_0.o 37 | """ 38 | llc_flags = '-filetype=obj --relocation-model=pic' 39 | command = f"{llc_path} {llc_flags} {source_file} -o {output_file}" 40 | try : 41 | subprocess.run(command, shell=True, check=True) 42 | if os.path.exists(source_file): 43 | os.remove(source_file) 44 | except subprocess.CalledProcessError as e: 45 | print(f"Erorr: {e} while running command: {command}") 46 | return output_file 47 | 48 | def compile_multiple_files(path_to_file, path_to_ll_file, path_to_obj_file, mlir_flags): 49 | #convert the file to llvm ir 50 | ll_file = convert_to_ll(path_to_file, path_to_ll_file ,project_opt_flags=mlir_flags) 51 | convert_to_obj(ll_file, path_to_obj_file) 52 | print(f"Compiled {path_to_file} to {path_to_obj_file}") 53 | 54 | def compile_to_object_from_mlir(path_to_folder_with_mlir_files,path_to_obj_folder,mlir_flags): 55 | if os.path.isdir(path_to_folder_with_mlir_files): 56 | with ThreadPoolExecutor(max_workers=6) as executor: 57 | for file in os.listdir(path_to_folder_with_mlir_files): 58 | if file.endswith(".mlir"): 59 | file_name = os.path.basename(file).split(".")[0] 60 | 61 | output_file = os.path.join(path_to_obj_folder, file_name + ".o") 62 | ll_file = os.path.join(path_to_obj_folder, file_name + ".ll") 63 | executor.submit(compile_multiple_files, os.path.join(path_to_folder_with_mlir_files, file), ll_file, output_file, mlir_flags) 64 | elif os.path.isfile(path_to_folder_with_mlir_files) : 65 | file_name = os.path.basename(path_to_folder_with_mlir_files).split(".")[0] 66 | output_file = os.path.join(path_to_obj_folder, file_name + ".o") 67 | ll_file = os.path.join(path_to_obj_folder,file_name + ".ll") 68 | compile_multiple_files(os.path.join(path_to_folder_with_mlir_files),ll_file,output_file,mlir_flags) 69 | 70 | 71 | if __name__ == "__main__": 72 | parser = argparse.ArgumentParser(description='Compile MLIR files to object files') 73 | parser.add_argument('input_folder', type=str, help='Absolute path to the folder with MLIR files or the only MLIR file') 74 | parser.add_argument('output_folder', type=str, help='Absolute path to the folder where object files will be stored') 75 | parser.add_argument('--mlir-flags', required=False,default="--rem-forward-func-args-and-return-run-mlir-zero-init --rem-global-constants-run-mlir" ,type=str, help='Flags to be passed to project-opt') 76 | args = parser.parse_args() 77 | compile_to_object_from_mlir(args.input_folder, args.output_folder,args.mlir_flags) 78 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # someting too add here. -------------------------------------------------------------------------------- /tools/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS) 2 | get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS) 3 | 4 | set (LIBS 5 | ${dialect_libs} 6 | ${conversion_libs} 7 | Affine64Unroll 8 | MakeRunAble 9 | MLIROptLib 10 | MLIRPass 11 | ) 12 | 13 | add_llvm_executable(project-opt project-opt.cpp) 14 | 15 | llvm_update_compile_flags(project-opt) 16 | target_link_libraries(project-opt PRIVATE ${LIBS}) 17 | 18 | mlir_check_all_link_libraries(project-opt) 19 | -------------------------------------------------------------------------------- /tools/project-opt.cpp: -------------------------------------------------------------------------------- 1 | #include "include/Transform/Affine/Affine64Unroll.h" 2 | #include "include/Transform/MakeRunAble/RemoveForwardFuncArgsAndReturn.h" 3 | #include "include/Transform/MakeRunAble/RemoveGlobalConstants.h" 4 | #include "include/Transform/MakeRunAble/ZeroInitRemoveForwardFuncArgsAndReturn.h" 5 | #include "mlir/include/mlir/InitAllDialects.h" 6 | #include "mlir/include/mlir/Pass/PassManager.h" 7 | #include "mlir/include/mlir/Pass/PassRegistry.h" 8 | #include "mlir/include/mlir/Tools/mlir-opt/MlirOptMain.h" 9 | 10 | int main(int argc, char **argv) { 11 | mlir::DialectRegistry registry; 12 | mlir::registerAllDialects(registry); 13 | 14 | mlir::PassRegistration(); 15 | mlir::PassRegistration(); 16 | mlir::PassRegistration(); 17 | mlir::PassRegistration< 18 | mlir::project::ZeroInitRemoveForwardFuncArgsAndReturn>(); 19 | return mlir::asMainReturnCode( 20 | mlir::MlirOptMain(argc, argv, "Pass Driver", registry)); 21 | } --------------------------------------------------------------------------------