├── .gitignore
├── .gitmodules
├── README.md
├── iree
    ├── generate.sh
    ├── mlir
    │   ├── linear.py.mlir
    │   └── minilm.py.mlir
    ├── models
    │   ├── driver
    │   │   └── __init__.py
    │   ├── linear.py
    │   ├── minilm.py
    │   └── resnet.py
    ├── prepare.sh
    └── scripts
    │   └── dump_linalg.sh
├── pytorch
    ├── generate.sh
    ├── lib
    │   └── torch_mlir_compile.py
    ├── prepare.sh
    ├── torch-dynamo
    │   ├── mlir
    │   │   ├── bert.mlir
    │   │   ├── conv.mlir
    │   │   ├── linear.mlir
    │   │   ├── mnist.mlir
    │   │   └── resnet18.mlir
    │   └── models
    │   │   ├── bert.py
    │   │   ├── conv.py
    │   │   ├── linear.py
    │   │   ├── mnist.py
    │   │   └── resnet18.py
    └── torch-script
    │   ├── mlir
    │       ├── conv.mlir
    │       ├── linear.mlir
    │       └── resnet18.mlir
    │   └── models
    │       ├── conv.py
    │       ├── linear.py
    │       └── resnet18.py
└── tensorflow
    ├── mlir
        ├── conv.mlir
        └── linear.mlir
    ├── models
        ├── conv.py
        └── linear.py
    └── prepare.sh


/.gitignore:
--------------------------------------------------------------------------------
  1 | # TF Saved Models
  2 | .saved
  3 | 
  4 | # Byte-compiled / optimized / DLL files
  5 | __pycache__/
  6 | *.py[cod]
  7 | *$py.class
  8 | 
  9 | # C extensions
 10 | *.so
 11 | 
 12 | # Distribution / packaging
 13 | .Python
 14 | build/
 15 | develop-eggs/
 16 | dist/
 17 | downloads/
 18 | eggs/
 19 | .eggs/
 20 | lib64/
 21 | parts/
 22 | sdist/
 23 | var/
 24 | wheels/
 25 | pip-wheel-metadata/
 26 | share/python-wheels/
 27 | *.egg-info/
 28 | .installed.cfg
 29 | *.egg
 30 | MANIFEST
 31 | 
 32 | # PyInstaller
 33 | #  Usually these files are written by a python script from a template
 34 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 35 | *.manifest
 36 | *.spec
 37 | 
 38 | # Installer logs
 39 | pip-log.txt
 40 | pip-delete-this-directory.txt
 41 | 
 42 | # Unit test / coverage reports
 43 | htmlcov/
 44 | .tox/
 45 | .nox/
 46 | .coverage
 47 | .coverage.*
 48 | .cache
 49 | nosetests.xml
 50 | coverage.xml
 51 | *.cover
 52 | *.py,cover
 53 | .hypothesis/
 54 | .pytest_cache/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | target/
 78 | 
 79 | # Jupyter Notebook
 80 | .ipynb_checkpoints
 81 | 
 82 | # IPython
 83 | profile_default/
 84 | ipython_config.py
 85 | 
 86 | # pyenv
 87 | .python-version
 88 | 
 89 | # pipenv
 90 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 91 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 92 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 93 | #   install all needed dependencies.
 94 | #Pipfile.lock
 95 | 
 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 97 | __pypackages__/
 98 | 
 99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 | 
103 | # SageMath parsed files
104 | *.sage.py
105 | 
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 | mlir_venv
115 | 
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 | 
120 | # Rope project settings
121 | .ropeproject
122 | 
123 | # mkdocs documentation
124 | /site
125 | 
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 | 
131 | # Pyre type checker
132 | .pyre/
133 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/plaidml/mlir-generator/4457cbf8207f595fdc049b09954c5c515acb28bf/.gitmodules


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # MLIR Generator
 2 | 
 3 | ## Rationale
 4 | 
 5 | This repository is meant to help MLIR pass writers to generate MLIR files
 6 | that often come from ML/HPC sources, to allow testing and developing their
 7 | passes (as well as creating unit tests) with the current state of the known
 8 | front-ends.
 9 | 
10 | ## How To Use
11 | 
12 | _Warning: This is very much work-in-progress, don't expect half of it to work._
13 | 
14 | Each front-end has a prepare script, for example:
15 | ```
16 | cd torch-script
17 | ./prepare.sh
18 | ```
19 | 
 20 | Each front-end also has models ready to use (once it has been prepared):
21 | ```
22 | source venv/bin/activate
23 | python models/linear.py
24 | ```
25 | 
26 | Results are in `<front-end-name>/mlir`.
27 | 
28 | ## Development
29 | 
30 | ### Adding a new front-end
31 | 
 32 | If you want to add a new front-end, you need to:
33 | 1. Add a new directory for the front-end
34 | 2. Add a `prepare.sh` script that installs it and prepares the virtualenv
35 | 3. Add a `generate.sh` script that generates all the MLIR files
36 | 4. Add models as Python files in `models`, run the export, and save them as MLIR files in `mlir`
37 | 


--------------------------------------------------------------------------------
/iree/generate.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Generate MLIR files from the IREE model scripts.
 4 | # Must be run from inside the repository (the root is located through git).
 5 | 
 6 | ROOT=$(git rev-parse --show-toplevel)
 7 | if [ ! -d "$ROOT" ]; then
 8 |     echo "Cannot find repository root"
 9 |     exit 1
10 | fi
11 | 
12 | # The IREE front-end lives in <root>/iree, like <root>/pytorch does
13 | MODELS_DIR="$ROOT/iree/models"
14 | 
15 | # Stop if the directory is missing, rather than looping in the wrong place
16 | pushd "$MODELS_DIR" || exit 1
17 | 
18 | # For each top-level Python script, generate an MLIR file with the same
19 | # name and a different extension. The glob (instead of `find`) keeps file
20 | # names with spaces intact and does not descend into the `driver` helper
21 | # package, which is not a model.
22 | for MODEL in ./*.py; do
23 |     # An unmatched glob stays literal: skip it
24 |     [ -f "$MODEL" ] || continue
25 |     OUT="${MODEL%.py}.mlir"
26 |     python "$MODEL" > "../mlir/$OUT"
27 | done
28 | popd
29 | 


--------------------------------------------------------------------------------
/iree/mlir/linear.py.mlir:
--------------------------------------------------------------------------------
  1 | func.func @predict(%arg0: !iree_input.list<!iree_input.variant>) -> !iree_input.buffer_view attributes {iree.abi = "{\22a\22:[[\22slist\22,[\22ndarray\22,\22f32\22,2,1,128]]],\22r\22:[[\22ndarray\22,\22f32\22,3,1,1,10]],\22v\22:1}"} {
  2 |   %c0 = arith.constant 0 : index
  3 |   %0 = iree_input.list.get %arg0[%c0] : !iree_input.list<!iree_input.variant> -> !iree_input.buffer_view
  4 |   %1 = iree_input.cast.buffer_view_to_tensor %0 : !iree_input.buffer_view -> tensor<1x128xf32>
  5 |   %2 = call @__inference_predict_670(%1) : (tensor<1x128xf32>) -> tensor<1x1x10xf32>
  6 |   %3 = iree_input.cast.tensor_to_buffer_view %2 : tensor<1x1x10xf32> -> !iree_input.buffer_view
  7 |   return %3 : !iree_input.buffer_view
  8 | }
  9 | 
 10 | func.func private @__inference_predict_670(%arg0: tensor<1x128xf32> {tf._user_specified_name = "x"}) -> tensor<1x1x10xf32> attributes {tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<1x128>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>], tf.signature.is_stateful} {
 11 |   %cst = arith.constant dense<-0.000000e+00> : tensor<f32>
 12 |   %cst_0 = arith.constant dense<0xFF800000> : tensor<f32>
 13 |   %cst_1 = arith.constant dense<0.000000e+00> : tensor<f32>
 14 |   %0 = ml_program.global_load @__sm_node14__layers.0.b : tensor<256xf32>
 15 |   %1 = ml_program.global_load @__sm_node10__output_layer.b : tensor<10xf32>
 16 |   %2 = ml_program.global_load @__sm_node7__input_layer.b : tensor<256xf32>
 17 |   %3 = ml_program.global_load @__sm_node13__layers.0.w : tensor<256x256xf32>
 18 |   %4 = ml_program.global_load @__sm_node9__output_layer.w : tensor<256x10xf32>
 19 |   %5 = ml_program.global_load @__sm_node6__input_layer.w : tensor<128x256xf32>
 20 |   %6 = tensor.expand_shape %arg0 [[0], [1, 2]] : tensor<1x128xf32> into tensor<1x1x128xf32>
 21 |   %7 = tensor.expand_shape %5 [[0, 1], [2]] : tensor<128x256xf32> into tensor<1x128x256xf32>
 22 |   %8 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 23 |   %cst_2 = arith.constant 0.000000e+00 : f32
 24 |   %9 = linalg.fill ins(%cst_2 : f32) outs(%8 : tensor<1x1x256xf32>) -> tensor<1x1x256xf32>
 25 |   %10 = linalg.batch_matmul ins(%6, %7 : tensor<1x1x128xf32>, tensor<1x128x256xf32>) outs(%9 : tensor<1x1x256xf32>) -> tensor<1x1x256xf32>
 26 |   %11 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 27 |   %12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%2 : tensor<256xf32>) outs(%11 : tensor<1x1x256xf32>) {
 28 |   ^bb0(%arg1: f32):
 29 |     linalg.yield %arg1 : f32
 30 |   } -> tensor<1x1x256xf32>
 31 |   %13 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 32 |   %14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%10, %12 : tensor<1x1x256xf32>, tensor<1x1x256xf32>) outs(%13 : tensor<1x1x256xf32>) {
 33 |   ^bb0(%arg1: f32):
 34 |     %61 = arith.addf %arg1, %arg2 : f32
 35 |     linalg.yield %61 : f32
 36 |   } -> tensor<1x1x256xf32>
 37 |   %15 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 38 |   %16 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_1 : tensor<f32>) outs(%15 : tensor<1x1x256xf32>) {
 39 |   ^bb0(%arg1: f32):
 40 |     linalg.yield %arg1 : f32
 41 |   } -> tensor<1x1x256xf32>
 42 |   %17 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 43 |   %18 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%14, %16 : tensor<1x1x256xf32>, tensor<1x1x256xf32>) outs(%17 : tensor<1x1x256xf32>) {
 44 |   ^bb0(%arg1: f32):
 45 |     %61 = arith.maxf %arg1, %arg2 : f32
 46 |     linalg.yield %61 : f32
 47 |   } -> tensor<1x1x256xf32>
 48 |   %19 = tensor.expand_shape %3 [[0, 1], [2]] : tensor<256x256xf32> into tensor<1x256x256xf32>
 49 |   %20 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 50 |   %cst_3 = arith.constant 0.000000e+00 : f32
 51 |   %21 = linalg.fill ins(%cst_3 : f32) outs(%20 : tensor<1x1x256xf32>) -> tensor<1x1x256xf32>
 52 |   %22 = linalg.batch_matmul ins(%18, %19 : tensor<1x1x256xf32>, tensor<1x256x256xf32>) outs(%21 : tensor<1x1x256xf32>) -> tensor<1x1x256xf32>
 53 |   %23 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 54 |   %24 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%0 : tensor<256xf32>) outs(%23 : tensor<1x1x256xf32>) {
 55 |   ^bb0(%arg1: f32):
 56 |     linalg.yield %arg1 : f32
 57 |   } -> tensor<1x1x256xf32>
 58 |   %25 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 59 |   %26 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%22, %24 : tensor<1x1x256xf32>, tensor<1x1x256xf32>) outs(%25 : tensor<1x1x256xf32>) {
 60 |   ^bb0(%arg1: f32):
 61 |     %61 = arith.addf %arg1, %arg2 : f32
 62 |     linalg.yield %61 : f32
 63 |   } -> tensor<1x1x256xf32>
 64 |   %27 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 65 |   %28 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_1 : tensor<f32>) outs(%27 : tensor<1x1x256xf32>) {
 66 |   ^bb0(%arg1: f32):
 67 |     linalg.yield %arg1 : f32
 68 |   } -> tensor<1x1x256xf32>
 69 |   %29 = linalg.init_tensor [1, 1, 256] : tensor<1x1x256xf32>
 70 |   %30 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%26, %28 : tensor<1x1x256xf32>, tensor<1x1x256xf32>) outs(%29 : tensor<1x1x256xf32>) {
 71 |   ^bb0(%arg1: f32):
 72 |     %61 = arith.maxf %arg1, %arg2 : f32
 73 |     linalg.yield %61 : f32
 74 |   } -> tensor<1x1x256xf32>
 75 |   %31 = tensor.expand_shape %4 [[0, 1], [2]] : tensor<256x10xf32> into tensor<1x256x10xf32>
 76 |   %32 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
 77 |   %cst_4 = arith.constant 0.000000e+00 : f32
 78 |   %33 = linalg.fill ins(%cst_4 : f32) outs(%32 : tensor<1x1x10xf32>) -> tensor<1x1x10xf32>
 79 |   %34 = linalg.batch_matmul ins(%30, %31 : tensor<1x1x256xf32>, tensor<1x256x10xf32>) outs(%33 : tensor<1x1x10xf32>) -> tensor<1x1x10xf32>
 80 |   %35 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
 81 |   %36 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%1 : tensor<10xf32>) outs(%35 : tensor<1x1x10xf32>) {
 82 |   ^bb0(%arg1: f32):
 83 |     linalg.yield %arg1 : f32
 84 |   } -> tensor<1x1x10xf32>
 85 |   %37 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
 86 |   %38 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%34, %36 : tensor<1x1x10xf32>, tensor<1x1x10xf32>) outs(%37 : tensor<1x1x10xf32>) {
 87 |   ^bb0(%arg1: f32):
 88 |     %61 = arith.addf %arg1, %arg2 : f32
 89 |     linalg.yield %61 : f32
 90 |   } -> tensor<1x1x10xf32>
 91 |   %39 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
 92 |   %40 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> ()>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%cst_1 : tensor<f32>) outs(%39 : tensor<1x1x10xf32>) {
 93 |   ^bb0(%arg1: f32):
 94 |     linalg.yield %arg1 : f32
 95 |   } -> tensor<1x1x10xf32>
 96 |   %41 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
 97 |   %42 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%38, %40 : tensor<1x1x10xf32>, tensor<1x1x10xf32>) outs(%41 : tensor<1x1x10xf32>) {
 98 |   ^bb0(%arg1: f32):
 99 |     %61 = arith.maxf %arg1, %arg2 : f32
100 |     linalg.yield %61 : f32
101 |   } -> tensor<1x1x10xf32>
102 |   %cst_5 = arith.constant 0xFF800000 : f32
103 |   %43 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
104 |   %44 = linalg.fill ins(%cst_5 : f32) outs(%43 : tensor<1x1xf32>) -> tensor<1x1xf32>
105 |   %45 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%42 : tensor<1x1x10xf32>) outs(%44 : tensor<1x1xf32>) {
106 |   ^bb0(%arg1: f32, %arg2: f32 loc(unknown)):
107 |     %61 = tensor.from_elements %arg2 : tensor<f32>
108 |     %62 = tensor.from_elements %arg1 : tensor<f32>
109 |     %63 = tensor.extract %61[] : tensor<f32>
110 |     %64 = tensor.extract %62[] : tensor<f32>
111 |     %65 = arith.maxf %63, %64 : f32
112 |     %66 = tensor.from_elements %65 : tensor<f32>
113 |     %67 = tensor.extract %66[] : tensor<f32>
114 |     linalg.yield %67 : f32
115 |   } -> tensor<1x1xf32>
116 |   %46 = tensor.expand_shape %45 [[0], [1, 2]] : tensor<1x1xf32> into tensor<1x1x1xf32>
117 |   %47 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
118 |   %48 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%46 : tensor<1x1x1xf32>) outs(%47 : tensor<1x1x10xf32>) {
119 |   ^bb0(%arg1: f32):
120 |     linalg.yield %arg1 : f32
121 |   } -> tensor<1x1x10xf32>
122 |   %49 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
123 |   %50 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%42, %48 : tensor<1x1x10xf32>, tensor<1x1x10xf32>) outs(%49 : tensor<1x1x10xf32>) {
124 |   ^bb0(%arg1: f32):
125 |     %61 = arith.subf %arg1, %arg2 : f32
126 |     linalg.yield %61 : f32
127 |   } -> tensor<1x1x10xf32>
128 |   %51 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
129 |   %52 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%50 : tensor<1x1x10xf32>) outs(%51 : tensor<1x1x10xf32>) {
130 |   ^bb0(%arg1: f32):
131 |     %61 = math.exp %arg1 : f32
132 |     linalg.yield %61 : f32
133 |   } -> tensor<1x1x10xf32>
134 |   %cst_6 = arith.constant -0.000000e+00 : f32
135 |   %53 = linalg.init_tensor [1, 1] : tensor<1x1xf32>
136 |   %54 = linalg.fill ins(%cst_6 : f32) outs(%53 : tensor<1x1xf32>) -> tensor<1x1xf32>
137 |   %55 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%52 : tensor<1x1x10xf32>) outs(%54 : tensor<1x1xf32>) {
138 |   ^bb0(%arg1: f32, %arg2: f32 loc(unknown)):
139 |     %61 = tensor.from_elements %arg2 : tensor<f32>
140 |     %62 = tensor.from_elements %arg1 : tensor<f32>
141 |     %63 = tensor.extract %61[] : tensor<f32>
142 |     %64 = tensor.extract %62[] : tensor<f32>
143 |     %65 = arith.addf %63, %64 : f32
144 |     %66 = tensor.from_elements %65 : tensor<f32>
145 |     %67 = tensor.extract %66[] : tensor<f32>
146 |     linalg.yield %67 : f32
147 |   } -> tensor<1x1xf32>
148 |   %56 = tensor.expand_shape %55 [[0], [1, 2]] : tensor<1x1xf32> into tensor<1x1x1xf32>
149 |   %57 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
150 |   %58 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, 0)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%56 : tensor<1x1x1xf32>) outs(%57 : tensor<1x1x10xf32>) {
151 |   ^bb0(%arg1: f32):
152 |     linalg.yield %arg1 : f32
153 |   } -> tensor<1x1x10xf32>
154 |   %59 = linalg.init_tensor [1, 1, 10] : tensor<1x1x10xf32>
155 |   %60 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%52, %58 : tensor<1x1x10xf32>, tensor<1x1x10xf32>) outs(%59 : tensor<1x1x10xf32>) {
156 |   ^bb0(%arg1: f32):
157 |     %61 = arith.divf %arg1, %arg2 : f32
158 |     linalg.yield %61 : f32
159 |   } -> tensor<1x1x10xf32>
160 |   return %60 : tensor<1x1x10xf32>
161 | }
162 | 
163 | 


--------------------------------------------------------------------------------
/iree/models/driver/__init__.py:
--------------------------------------------------------------------------------
 1 | # Stolen from https://github.com/iree-org/iree-samples/blob/main/ModelCompiler/nlp_models/bert_small_run.py
 2 | from iree.compiler import tf as tfc
 3 | from iree.compiler import compile_str
 4 | 
 5 | def build_module(model, exports):
 6 |     # Compile the model using IREE
 7 |     compiler_module = tfc.compile_module(model, exported_names = exports, import_only=True)
 8 |     backend = "llvm-cpu"
 9 |     args = ["--iree-llvm-target-cpu-features=host",
10 |             "--iree-mhlo-demote-i64-to-i32=false",
11 |             "--iree-flow-demote-i64-to-i32",
12 |             "--mlir-print-ir-after=iree-mhlo-to-linalg-on-tensors"]
13 |     backend_config = "local-task"
14 |     flatbuffer_blob = compile_str(compiler_module, target_backends=[backend], extra_args=args, input_type="mhlo")
15 | 


--------------------------------------------------------------------------------
/iree/models/linear.py:
--------------------------------------------------------------------------------
 1 | # Ref: https://www.tensorflow.org/guide/intro_to_modules
 2 | import driver
 3 | 
 4 | import tensorflow as tf
 5 | 
 6 | # TODO: Make these into command-line arguments
 7 | BATCH_SIZE = 1
 8 | INPUT_LEN = 128
 9 | HIDDEN_LEN = 256
10 | OUTPUT_LEN = 10
11 | NUM_LAYERS = 1
12 | 
13 | class Dense(tf.Module):
14 |   def __init__(self, in_features, out_features, name=None):
15 |     super().__init__(name=name)
16 |     self.w = tf.Variable(
17 |       tf.random.normal([in_features, out_features]), name='w')
18 |     self.b = tf.Variable(tf.zeros([out_features]), name='b')
19 |   def __call__(self, x):
20 |     y = tf.matmul(x, self.w) + self.b
21 |     return tf.nn.relu(y)
22 | 
23 | class SequentialModule(tf.Module):
24 |     def __init__(self, name=None):
25 |         super().__init__(name=name)
26 |         self.input_layer = Dense(in_features=INPUT_LEN, out_features=HIDDEN_LEN)
27 |         self.layers = []
28 |         for layer in range(NUM_LAYERS):
29 |             self.layers.append(Dense(in_features=HIDDEN_LEN, out_features=HIDDEN_LEN))
30 |         self.output_layer = Dense(in_features=HIDDEN_LEN, out_features=OUTPUT_LEN)
31 | 
32 |     @tf.function(input_signature=[[tf.TensorSpec(shape=[BATCH_SIZE,INPUT_LEN],dtype=tf.float32)]])
33 |     def predict(self, x):
34 |         x = self.input_layer(x)
35 |         for layer in range(NUM_LAYERS):
36 |             x = self.layers[layer](x)
37 |         x = self.output_layer(x)
38 |         return tf.nn.softmax(x)
39 | 
40 | if __name__ == "__main__":
41 |     driver.build_module(SequentialModule(), ["predict"])
42 | 


--------------------------------------------------------------------------------
/iree/models/minilm.py:
--------------------------------------------------------------------------------
 1 | # Ref: https://github.com/iree-org/iree-samples/blob/main/ModelCompiler/nlp_models/huggingface_MiniLM_run.py
 2 | import driver
 3 | 
 4 | import tensorflow as tf
 5 | 
 6 | from transformers import TFBertModel
 7 | 
 8 | # TODO: Make these into command-line arguments
 9 | MAX_SEQUENCE_LENGTH = 512
10 | BATCH_SIZE = 1
11 | 
12 | # Create a set of 2-dimensional inputs
13 | bert_input = [tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH],dtype=tf.int32),
14 |             tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH], dtype=tf.int32),
15 |             tf.TensorSpec(shape=[BATCH_SIZE,MAX_SEQUENCE_LENGTH], dtype=tf.int32)]
16 | 
17 | class BertModule(tf.Module):
18 |     def __init__(self):
19 |         super(BertModule, self).__init__()
20 |         # Create a BERT trainer with the created network.
21 |         self.m = TFBertModel.from_pretrained("microsoft/MiniLM-L12-H384-uncased", from_pt=True)
22 | 
23 |         # Invoke the trainer model on the inputs. This causes the layer to be built.
24 |         self.m.predict = lambda x,y,z: self.m.call(input_ids=x, attention_mask=y, token_type_ids=z, training=False)
25 | 
26 |     @tf.function(input_signature=bert_input)
27 |     def predict(self, input_ids, attention_mask, token_type_ids):
28 |         return self.m.predict(input_ids, attention_mask, token_type_ids)
29 | 
30 | if __name__ == "__main__":
31 |     driver.build_module(BertModule(), ["predict"])
32 | 


--------------------------------------------------------------------------------
/iree/models/resnet.py:
--------------------------------------------------------------------------------
 1 | # Ref: https://github.com/iree-org/iree-samples/blob/main/ModelCompiler/nlp_models/huggingface_MiniLM_run.py
 2 | import driver
 3 | 
 4 | import tensorflow as tf
 5 | from transformers import ResNetForImageClassification
 6 | 
 7 | # TODO: Make these into command-line arguments
 8 | BATCH_SIZE = 1
 9 | H = 224
10 | W = 224
11 | C = 3
12 | 
13 | # Create a single 4-dimensional input of shape [BATCH_SIZE, H, W, C]
14 | resnet_input = [tf.TensorSpec(shape=[BATCH_SIZE,H,W,C],dtype=tf.int32)]
15 | 
16 | class ResnetModule(tf.Module):
17 |     def __init__(self):
18 |         super(ResnetModule, self).__init__()
19 |         # Load the pretrained "microsoft/resnet-18" image classification model.
20 |         self.m = ResNetForImageClassification.from_pretrained("microsoft/resnet-18")
21 | 
22 |         # Replace the model's `predict` attribute with a one-argument call that has training disabled.
23 |         self.m.predict = lambda x: self.m(input=x, training=False)
24 | 
25 |     # FIXME: This isn't working. NOTE(review): `*input` unpacks the tensor along its first axis, and the class imported above has no `TF` prefix (presumably the PyTorch implementation) -- confirm.
26 |     @tf.function(input_signature=resnet_input)
27 |     def predict(self, input):
28 |         return self.m.predict(*input)
29 | 
30 | if __name__ == "__main__":
31 |     driver.build_module(ResnetModule(), ["predict"])
32 | 


--------------------------------------------------------------------------------
/iree/prepare.sh:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # Builds IREE according to the following documentation:
  4 | # https://iree-org.github.io/iree/building-from-source/getting-started/
  5 | # https://iree-org.github.io/iree/building-from-source/python-bindings-and-importers/
  6 | # TPP build as in https://github.com/iree-org/iree/commit/63e12cafdf0bba3263bf2b5f74d7b2381f43af65
  7 | 
  8 | # Options are positional: -tpp (if any) first, then the build type
  9 | SYNTAX="prepare.sh [-tpp] [-d|-rd]"
 10 | 
 11 | set -eu
 12 | 
 13 | BUILD_WITH_TPP=False
 14 | if [ $# -ge 1 ] && [ "$1" == "-tpp" ]; then
 15 |   BUILD_WITH_TPP=True
 16 |   shift
 17 | fi
 18 | 
 19 | # Build type handed to CMake below (Release unless -d or -rd is given)
 20 | BUILD_TYPE=Release
 21 | if [ $# -ge 1 ] && [ "$1" == "-d" ]; then
 22 |   echo "Building debug version"
 23 |   BUILD_TYPE=Debug
 24 | elif [ $# -ge 1 ] && [ "$1" == "-rd" ]; then
 25 |   echo "Building rel+debug version"
 26 |   BUILD_TYPE=RelWithDebInfo
 27 | fi
 28 | 
 29 | # Run on container/remote directly, need to check
 30 | PROJECT="mlir-generator"
 31 | if [ -d "$PROJECT" ]; then
 32 |   cd "$PROJECT"
 33 | fi
 34 | # Make sure the repo is in a good shape
 35 | echo " + Updating submodules"
 36 | git submodule update --init --recursive
 37 | 
 38 | # Go into iree subrepo
 39 | ROOT="$(git rev-parse --show-toplevel)/external/iree"
 40 | if [ ! -d "$ROOT" ]; then
 41 |     echo "Cannot find repository root"
 42 |     exit 1
 43 | fi
 44 | pushd "$ROOT"
 45 | 
 46 | # Each flavour (main/tpp) gets its own build directory and branch
 47 | BLD_DIR="$ROOT/build"
 48 | git fetch
 49 | if [ "$BUILD_WITH_TPP" == "True" ]; then
 50 |   BLD_DIR="$BLD_DIR/tpp"
 51 |   git reset --hard origin/tpp
 52 | else
 53 |   BLD_DIR="$BLD_DIR/main"
 54 |   git reset --hard origin/main
 55 | fi
 56 | mkdir -p "$BLD_DIR"
 57 | VENV_DIR="$BLD_DIR/venv"
 58 | 
 59 | # Always grab a fresh env environment
 60 | echo " + Creating a fresh venv"
 61 | rm -rf "$VENV_DIR"
 62 | python -m venv "$VENV_DIR"
 63 | # Make the built tools and Python bindings visible once the venv is active
 64 | echo "export PATH=\$PATH:$BLD_DIR/tools" >> "$VENV_DIR/bin/activate"
 65 | echo "export PYTHONPATH=$BLD_DIR/compiler/bindings/python:$BLD_DIR/runtime/bindings/python" >> "$VENV_DIR/bin/activate"
 66 | source "$VENV_DIR/bin/activate"
 67 | 
 68 | # Install Python dependencies
 69 | echo " + Install Python dependencies"
 70 | python -m pip install --upgrade pip
 71 | python -m pip install -r "$ROOT/runtime/bindings/python/iree/runtime/build_requirements.txt"
 72 | python -m pip install tensorflow iree-tools-tf keras transformers torch datasets
 73 | 
 74 | # Checkout iree repos too
 75 | echo " + Updating submodules"
 76 | git submodule update --init --recursive
 77 | 
 78 | # Build iree with LLVM in-tree
 79 | echo " + Build iree in-tree"
 80 | EXTRA_CMAKE_FLAGS=""
 81 | if [ "$BUILD_WITH_TPP" == "True" ]; then
 82 |   echo " + Adding TPP options"
 83 |   EXTRA_CMAKE_FLAGS="-DIREE_USE_TPP=ON \
 84 |                      -DCMAKE_C_FLAGS=-DIREE_HAL_EXECUTABLE_IMPORT_PROVIDER_DEFAULT_FN=iree_samples_tpp_import_provider \
 85 |                      -DIREE_HAL_EXECUTABLE_LOADER_EXTRA_DEPS=iree_samples_tpp_import"
 86 | fi
 87 | # $EXTRA_CMAKE_FLAGS is left unquoted on purpose: it holds several flags
 88 | cmake -GNinja -B"$BLD_DIR" -S "$ROOT" \
 89 |     -DCMAKE_BUILD_TYPE="$BUILD_TYPE" \
 90 |     -DCMAKE_C_COMPILER_LAUNCHER=ccache \
 91 |     -DCMAKE_CXX_COMPILER_LAUNCHER=ccache \
 92 |     -DCMAKE_C_COMPILER=clang \
 93 |     -DCMAKE_CXX_COMPILER=clang++ \
 94 |     -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
 95 |     -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
 96 |     -DLLVM_ENABLE_ASSERTIONS=ON \
 97 |     -DIREE_ENABLE_ASSERTIONS=ON \
 98 |     -DIREE_BUILD_PYTHON_BINDINGS=ON \
 99 |     -DIREE_ENABLE_LLD=ON \
100 |      $EXTRA_CMAKE_FLAGS \
101 |     -DPython3_EXECUTABLE="$(which python)"
102 | 
103 | # Not everything works with TPP
104 | if [ "$BUILD_WITH_TPP" == "True" ]; then
105 |   ninja -C "$BLD_DIR" iree-opt iree-compile iree-run-module iree-benchmark-module iree-lld
106 |   ninja -C "$BLD_DIR" compiler/bindings/python/iree/compiler/tflite.py
107 |   ninja -C "$BLD_DIR" compiler/bindings/python/iree/compiler/tools/tflite.py
108 |   ninja -C "$BLD_DIR" runtime/package
109 | else
110 |   ninja -C "$BLD_DIR"
111 | fi
112 | 
113 | # Python bindings test
114 | echo " + Checking IREE Python bindings"
115 | python -c "import iree.compiler"
116 | python -c "import iree.runtime"
117 | 
118 | deactivate
119 | popd
120 | 


--------------------------------------------------------------------------------
/iree/scripts/dump_linalg.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Dumps Linalg-on-tensors MLIR from Python module
 4 | SYNTAX="dump_linalg.sh [-tpp] model.py [args]"
 5 | 
 6 | set -eu
 7 | 
 8 | # Use main build or TPP build
 9 | BUILD=main
10 | if [ $# -ge 1 ] && [ "$1" == "-tpp" ]; then
11 |   BUILD=tpp
12 |   shift
13 | fi
14 | 
15 | # Python Module
16 | if [ $# -ge 1 ] && [ -f "$1" ]; then
17 |   SCRIPT="$1"
18 |   shift
19 | else
20 |   echo "Syntax: $SYNTAX"
21 |   exit 1
22 | fi
23 | # Whatever is left in "$@" is passed to the Python script, one word per argument
24 | 
25 | ROOT="$(git rev-parse --show-toplevel)"
26 | VENV="$ROOT/external/iree/build/$BUILD/venv"
27 | if [ ! -d "$VENV" ]; then
28 |     echo "Cannot find the venv in $VENV (run prepare.sh first)"
29 |     exit 1
30 | fi
31 | source "$VENV"/bin/activate
32 | 
33 | # This works on Linux and Darwin
34 | TEMP_DIR=$(mktemp -d 2>/dev/null || mktemp -d -t 'iree')
35 | 
36 | # Run iree-compiler with --mlir-print-ir-after=iree-mhlo-to-linalg-on-tensors
37 | # The IR is collected from stderr; a failure of the script itself is ignored
38 | echo "Running [$SCRIPT $*], output on $TEMP_DIR"
39 | python "$SCRIPT" "$@" > "$TEMP_DIR"/out 2> "$TEMP_DIR"/err || true
40 | 
41 | # Clean-up output to grab the IR (drop every line matching any pattern)
42 | MLIR_FILE="$TEMP_DIR/$(basename "$SCRIPT").mlir"
43 | echo "Cleaning up output, creating MLIR file $MLIR_FILE"
44 | grep -v \
45 |   -e ": \w tensorflow" \
46 |   -e "Invoked with" \
47 |   -e "^//" \
48 |   -e "Traceback" \
49 |   -e "File \"" \
50 |   -e VmModule \
51 |   -e TypeError \
52 |   -e "GPU devices" \
53 |   -e Downloading \
54 |   -e TensorFlow \
55 |   "$TEMP_DIR"/err > "$MLIR_FILE"
56 | 
57 | # Clean-up debug symbols from MLIR file
58 | # A line can carry several locations (one per block argument), hence the `g`
59 | sed -i 's/ loc(unknown)//g' "$MLIR_FILE" # unknown
60 | # Basic blocks: strip the location of each argument separately first. The
61 | # greedy pattern further down removes everything up to the last `)` of the
62 | # line, which drops every argument after the first one.
63 | sed -i '/\^bb/ s/ loc(fused\[[^]]*\])//g' "$MLIR_FILE" # basic block, per argument
64 | sed -i '/=/ s/ loc(fused.*)//' "$MLIR_FILE" # assignments
65 | sed -i '/yield/ s/ loc(fused.*)//' "$MLIR_FILE" # yield
66 | sed -i '/\^bb/ s/ loc(fused.*)/)/' "$MLIR_FILE" # basic block, anything left over
67 | sed -i '/} -> tensor/ s/ loc(fused.*)//' "$MLIR_FILE" # block end
68 | 
69 | # Move the MLIR file into the repository (we know the repo exists)
70 | MLIR_DIR="$ROOT/iree/mlir"
71 | echo "Moving MLIR file to $MLIR_DIR"
72 | mv "$MLIR_FILE" "$MLIR_DIR"
73 | 
74 | rm -r "$TEMP_DIR"
75 | deactivate
76 | 


--------------------------------------------------------------------------------
/pytorch/generate.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Generate MLIR files from Torch-MLIR
 4 | # Run from inside the repo already, no need to check
 5 | 
 6 | ROOT=$(git rev-parse --show-toplevel)
 7 | if [ ! -d "$ROOT" ]; then
 8 |     echo "Cannot find repository root"
 9 |     exit 1
10 | fi
11 | 
12 | # generate_mlir <front-end>
13 | # For each Python script under pytorch/<front-end>/models, generate an MLIR
14 | # file with the same name and different extension in the sibling mlir/
15 | # directory. Returns non-zero if the models directory cannot be entered or
16 | # if any script fails.
17 | generate_mlir() {
18 |     local status=0 model out
19 | 
20 |     pushd "$ROOT/pytorch/$1/models" || return 1
21 |     # NUL-delimited names keep paths with spaces intact; they are read from
22 |     # fd 3 so the Python scripts still inherit the caller's stdin
23 |     while IFS= read -r -d '' -u 3 model; do
24 |         out="../mlir/${model%.py}.mlir"
25 |         # Write to a temporary file first so that a failing script cannot
26 |         # truncate a previously generated MLIR file
27 |         if python "$model" > "$out.tmp"; then
28 |             mv "$out.tmp" "$out" || status=1
29 |         else
30 |             echo "Failed to generate MLIR from $model" >&2
31 |             rm -f "$out.tmp"
32 |             status=1
33 |         fi
34 |     done 3< <(find . -type f -name '*.py' -print0)
35 |     popd || return 1
36 |     return "$status"
37 | }
38 | 
39 | # Process both front-ends even if one fails; report failure in the exit code
40 | STATUS=0
41 | for FRONT_END in torch-script torch-dynamo; do
42 |     generate_mlir "$FRONT_END" || STATUS=1
43 | done
44 | exit "$STATUS"


--------------------------------------------------------------------------------
/pytorch/lib/torch_mlir_compile.py:
--------------------------------------------------------------------------------
 1 | # Implements the torch_mlir compiler steps into multiple MLIR outputs
 2 | 
 3 | from typing import List
 4 | 
 5 | import torch
 6 | import torch_mlir
 7 | from torch_mlir.dynamo import make_simple_dynamo_backend
 8 | from torch_mlir_e2e_test.linalg_on_tensors_backends import refbackend
 9 | 
10 | def torch_mlir_compile(model, input, output_formats=['torch', 'linalg-on-tensors', 'tosa', 'stablehlo']):
11 |     pass
12 |     module = None
13 |     for format in output_formats:
14 |         module = torch_mlir.compile(model, input, output_type=format)
15 |         print(format, "\n", module.operation.get_asm(large_elements_limit=10))
16 |     # Returning one of MLIR modules so that TorchDynamo can call the object.
17 |     # Otherwise, TorchDynamo does not get invoked.
18 |     # TorchScript models do not care for return from torch_mlir_compile.
19 |     return module
20 | 
21 | @make_simple_dynamo_backend
22 | def refbackend_torchdynamo_backend(fx_graph: torch.fx.GraphModule,
23 |                                    example_inputs: List[torch.Tensor]):
24 |     mlir_module = torch_mlir_compile(
25 |         fx_graph, example_inputs, output_formats=['linalg-on-tensors'])
26 |     backend = refbackend.RefBackendLinalgOnTensorsBackend()
27 |     compiled = backend.compile(mlir_module)
28 |     loaded = backend.load(compiled)
29 | 
30 |     def compiled_callable(*inputs):
31 |         inputs = [x.numpy() for x in inputs]
32 |         result = loaded.forward(*inputs)
33 |         if not isinstance(result, tuple):
34 |             result = torch.from_numpy(result)
35 |         else:
36 |             result = tuple(torch.from_numpy(x) for x in result)
37 |         return result
38 |     return compiled_callable
39 | 


--------------------------------------------------------------------------------
/pytorch/prepare.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Installs torch-mlir following the upstream documentation:
 4 | # https://github.com/llvm/torch-mlir/tree/main
 5 | 
 6 | # Top of current root
 7 | ROOT="$(git rev-parse --show-toplevel)"
 8 | if [ ! -d "$ROOT" ]; then
 9 |     echo "Cannot find repository root"
10 |     exit 1
11 | fi
12 | cd "$ROOT" || exit 1
13 | 
14 | # Always grab a fresh conda environment
15 | echo " + Creating a fresh conda env "
16 | ENV_PATH="${ROOT}/env"
17 | CONDA_DIR="torch-mlir-conda"
18 | CONDA_DIR_PATH="${ENV_PATH}/${CONDA_DIR}/miniconda3/"
19 | ARCH_NAME=$(uname -m)
20 | INSTALLER="Miniconda3-latest-Linux-${ARCH_NAME}.sh"
21 | 
22 | # Paths are quoted throughout: an unquoted "rm -rf" on a root containing
23 | # spaces would delete unrelated directories
24 | mkdir -p "${ENV_PATH}" || exit 1
25 | rm -rf "${CONDA_DIR_PATH}"
26 | 
27 | # -O overwrites a stale installer instead of saving a "<name>.1" copy.
28 | # Stop on failure so the pip steps below never run outside the new env.
29 | if ! wget -O "${INSTALLER}" "https://repo.anaconda.com/miniconda/${INSTALLER}"; then
30 |     echo "Failed to download ${INSTALLER}"
31 |     rm -f "${INSTALLER}"
32 |     exit 1
33 | fi
34 | if ! bash "${INSTALLER}" -b -p "${CONDA_DIR_PATH}"; then
35 |     echo "Failed to install Miniconda into ${CONDA_DIR_PATH}"
36 |     rm -f "${INSTALLER}"
37 |     exit 1
38 | fi
39 | eval "$("${CONDA_DIR_PATH}/bin/conda" shell.bash hook)"
40 | rm "${INSTALLER}"
41 | 
42 | conda activate "${CONDA_DIR_PATH}" || exit 1
43 | conda install -y python=3.11.3 || exit 1
44 | 
45 | # Install Python dependencies
46 | echo " + Install Python dependencies"
47 | python -m pip install --upgrade pip
48 | pip install --pre torch-mlir torchvision \
49 |   -f https://llvm.github.io/torch-mlir/package-index/ \
50 |   --extra-index-url https://download.pytorch.org/whl/nightly/cpu || exit 1
51 | 
52 | # Install any additional dependencies
53 | pip install transformers || exit 1
54 | 
55 | # Done
56 | echo " + Done."
57 | conda deactivate
58 | echo " + Run conda activate ${CONDA_DIR_PATH} before using torch-mlir"
59 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/mlir/conv.mlir:
--------------------------------------------------------------------------------
 1 | linalg-on-tensors 
 2 |  #map = affine_map<(d0, d1, d2, d3) -> (d1)>
 3 | #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 4 | #map2 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
 5 | #map3 = affine_map<(d0, d1, d2, d3) -> (d0, 0, d2, d3)>
 6 | #map4 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, d3)>
 7 | module attributes {torch.debug_module_name = "_lambda"} {
 8 |   ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64> // declared but never loaded or stored by @forward in this module
 9 |   func.func @forward(%arg0: tensor<32x3x3x3xf32>, %arg1: tensor<32xf32>, %arg2: tensor<2x32x3x3xf32>, %arg3: tensor<2xf32>, %arg4: tensor<1x3x28x28xf32>) -> tensor<1x2x3x3xf32> {
10 |     %cst = arith.constant 0.000000e+00 : f32
11 |     %cst_0 = arith.constant 0xFF800000 : f32 // bit pattern of -infinity; seeds the max-reduction below
12 |     %c0_i64 = arith.constant 0 : i64
13 |     %0 = tensor.empty() : tensor<1x32x9x9xf32>
14 |     %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1 : tensor<32xf32>) outs(%0 : tensor<1x32x9x9xf32>) { // broadcast %arg1 along d1; the result initialises the first convolution's output
15 |     ^bb0(%in: f32, %out: f32):
16 |       linalg.yield %in : f32
17 |     } -> tensor<1x32x9x9xf32>
18 |     %2 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<3> : vector<2xi64>} ins(%arg4, %arg0 : tensor<1x3x28x28xf32>, tensor<32x3x3x3xf32>) outs(%1 : tensor<1x32x9x9xf32>) -> tensor<1x32x9x9xf32> // first convolution: 3x3 filters, stride 3, 28x28 -> 9x9
19 |     %3 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1x32x9x9xf32>) outs(%0 : tensor<1x32x9x9xf32>) { // element-wise: keep %in when it compares greater than 0, else 0
20 |     ^bb0(%in: f32, %out: f32):
21 |       %19 = arith.cmpf ugt, %in, %cst : f32
22 |       %20 = arith.select %19, %in, %cst : f32
23 |       linalg.yield %20 : f32
24 |     } -> tensor<1x32x9x9xf32>
25 |     %4 = tensor.empty() : tensor<1x2x3x3xf32>
26 |     %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg3 : tensor<2xf32>) outs(%4 : tensor<1x2x3x3xf32>) { // broadcast %arg3 along d1; the result initialises the second convolution's output
27 |     ^bb0(%in: f32, %out: f32):
28 |       linalg.yield %in : f32
29 |     } -> tensor<1x2x3x3xf32>
30 |     %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<3> : vector<2xi64>} ins(%3, %arg2 : tensor<1x32x9x9xf32>, tensor<2x32x3x3xf32>) outs(%5 : tensor<1x2x3x3xf32>) -> tensor<1x2x3x3xf32> // second convolution: 3x3 filters, stride 3, 9x9 -> 3x3
31 |     %7 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x2x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) { // element-wise: keep %in when it compares greater than 0, else 0
32 |     ^bb0(%in: f32, %out: f32):
33 |       %19 = arith.cmpf ugt, %in, %cst : f32
34 |       %20 = arith.select %19, %in, %cst : f32
35 |       linalg.yield %20 : f32
36 |     } -> tensor<1x2x3x3xf32>
37 |     %8 = tensor.empty() : tensor<1x1x3x3xi64>
38 |     %9 = linalg.fill ins(%c0_i64 : i64) outs(%8 : tensor<1x1x3x3xi64>) -> tensor<1x1x3x3xi64>
39 |     %10 = tensor.empty() : tensor<1x1x3x3xf32>
40 |     %11 = linalg.fill ins(%cst_0 : f32) outs(%10 : tensor<1x1x3x3xf32>) -> tensor<1x1x3x3xf32>
41 |     %12:2 = linalg.generic {indexing_maps = [#map1, #map3, #map3], iterator_types = ["parallel", "reduction", "parallel", "parallel"]} ins(%7 : tensor<1x2x3x3xf32>) outs(%11, %9 : tensor<1x1x3x3xf32>, tensor<1x1x3x3xi64>) { // reduction over d1: running maximum (#0) and the d1 index where it was found (#1)
42 |     ^bb0(%in: f32, %out: f32, %out_1: i64):
43 |       %19 = linalg.index 1 : index
44 |       %20 = arith.index_cast %19 : index to i64
45 |       %21 = arith.maximumf %in, %out : f32
46 |       %22 = arith.cmpf ogt, %in, %out : f32
47 |       %23 = arith.select %22, %20, %out_1 : i64
48 |       linalg.yield %21, %23 : f32, i64
49 |     } -> (tensor<1x1x3x3xf32>, tensor<1x1x3x3xi64>)
50 |     %13 = linalg.generic {indexing_maps = [#map2, #map4, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %12#0 : tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) { // subtract the per-position maximum over d1 from every element
51 |     ^bb0(%in: f32, %in_1: f32, %out: f32):
52 |       %19 = arith.subf %in, %in_1 : f32
53 |       linalg.yield %19 : f32
54 |     } -> tensor<1x2x3x3xf32>
55 |     %14 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<1x2x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) { // element-wise exp of the shifted values
56 |     ^bb0(%in: f32, %out: f32):
57 |       %19 = math.exp %in : f32
58 |       linalg.yield %19 : f32
59 |     } -> tensor<1x2x3x3xf32>
60 |     %15 = linalg.fill ins(%cst : f32) outs(%10 : tensor<1x1x3x3xf32>) -> tensor<1x1x3x3xf32>
61 |     %16 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "reduction", "parallel", "parallel"]} ins(%14 : tensor<1x2x3x3xf32>) outs(%15 : tensor<1x1x3x3xf32>) { // sum of the exponentials over d1
62 |     ^bb0(%in: f32, %out: f32):
63 |       %19 = arith.addf %in, %out : f32
64 |       linalg.yield %19 : f32
65 |     } -> tensor<1x1x3x3xf32>
66 |     %17 = linalg.generic {indexing_maps = [#map4, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x3x3xf32>) outs(%10 : tensor<1x1x3x3xf32>) { // log of that sum
67 |     ^bb0(%in: f32, %out: f32):
68 |       %19 = math.log %in : f32
69 |       linalg.yield %19 : f32
70 |     } -> tensor<1x1x3x3xf32>
71 |     %18 = linalg.generic {indexing_maps = [#map2, #map4, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13, %17 : tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) { // shifted value minus log-sum-exp, i.e. a log-softmax over d1
72 |     ^bb0(%in: f32, %in_1: f32, %out: f32):
73 |       %19 = arith.subf %in, %in_1 : f32
74 |       linalg.yield %19 : f32
75 |     } -> tensor<1x2x3x3xf32>
76 |     return %18 : tensor<1x2x3x3xf32>
77 |   }
78 | }
79 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/mlir/linear.mlir:
--------------------------------------------------------------------------------
 1 | linalg-on-tensors 
 2 |  #map = affine_map<(d0, d1) -> (d0, d1)>
 3 | #map1 = affine_map<(d0, d1) -> (d1, d0)>
 4 | #map2 = affine_map<(d0, d1) -> (d1)>
 5 | #map3 = affine_map<(d0, d1) -> (d0, 0)>
 6 | module attributes {torch.debug_module_name = "_lambda"} {
 7 |   ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64> // declared but never loaded or stored by @forward in this module
 8 |   func.func @forward(%arg0: tensor<256x128xf32>, %arg1: tensor<256xf32>, %arg2: tensor<10x256xf32>, %arg3: tensor<10xf32>, %arg4: tensor<2x128xf32>) -> tensor<2x10xf32> {
 9 |     %c0_i64 = arith.constant 0 : i64
10 |     %cst = arith.constant 0.000000e+00 : f32
11 |     %cst_0 = arith.constant 0xFF800000 : f32 // bit pattern of -infinity; seeds the max-reduction below
12 |     %0 = tensor.empty() : tensor<128x256xf32>
13 |     %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg0 : tensor<256x128xf32>) outs(%0 : tensor<128x256xf32>) { // transpose %arg0: 256x128 -> 128x256
14 |     ^bb0(%in: f32, %out: f32):
15 |       linalg.yield %in : f32
16 |     } -> tensor<128x256xf32>
17 |     %2 = tensor.empty() : tensor<2x256xf32>
18 |     %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2x256xf32>) -> tensor<2x256xf32>
19 |     %4 = linalg.matmul ins(%arg4, %1 : tensor<2x128xf32>, tensor<128x256xf32>) outs(%3 : tensor<2x256xf32>) -> tensor<2x256xf32> // %arg4 x transposed %arg0, accumulated into the zero-filled tensor
20 |     %5 = linalg.generic {indexing_maps = [#map2, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg1, %4 : tensor<256xf32>, tensor<2x256xf32>) outs(%2 : tensor<2x256xf32>) { // add %arg1, broadcast over d0, to the matmul result
21 |     ^bb0(%in: f32, %in_1: f32, %out: f32):
22 |       %25 = arith.addf %in, %in_1 : f32
23 |       linalg.yield %25 : f32
24 |     } -> tensor<2x256xf32>
25 |     %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<2x256xf32>) outs(%2 : tensor<2x256xf32>) { // element-wise: keep %in when it compares greater than 0, else 0
26 |     ^bb0(%in: f32, %out: f32):
27 |       %25 = arith.cmpf ugt, %in, %cst : f32
28 |       %26 = arith.select %25, %in, %cst : f32
29 |       linalg.yield %26 : f32
30 |     } -> tensor<2x256xf32>
31 |     %7 = tensor.empty() : tensor<256x10xf32>
32 |     %8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg2 : tensor<10x256xf32>) outs(%7 : tensor<256x10xf32>) { // transpose %arg2: 10x256 -> 256x10
33 |     ^bb0(%in: f32, %out: f32):
34 |       linalg.yield %in : f32
35 |     } -> tensor<256x10xf32>
36 |     %9 = tensor.empty() : tensor<2x10xf32>
37 |     %10 = linalg.fill ins(%cst : f32) outs(%9 : tensor<2x10xf32>) -> tensor<2x10xf32>
38 |     %11 = linalg.matmul ins(%6, %8 : tensor<2x256xf32>, tensor<256x10xf32>) outs(%10 : tensor<2x10xf32>) -> tensor<2x10xf32> // previous activation x transposed %arg2, accumulated into the zero-filled tensor
39 |     %12 = linalg.generic {indexing_maps = [#map2, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%arg3, %11 : tensor<10xf32>, tensor<2x10xf32>) outs(%9 : tensor<2x10xf32>) { // add %arg3, broadcast over d0, to the matmul result
40 |     ^bb0(%in: f32, %in_1: f32, %out: f32):
41 |       %25 = arith.addf %in, %in_1 : f32
42 |       linalg.yield %25 : f32
43 |     } -> tensor<2x10xf32>
44 |     %13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x10xf32>) outs(%9 : tensor<2x10xf32>) { // element-wise: keep %in when it compares greater than 0, else 0
45 |     ^bb0(%in: f32, %out: f32):
46 |       %25 = arith.cmpf ugt, %in, %cst : f32
47 |       %26 = arith.select %25, %in, %cst : f32
48 |       linalg.yield %26 : f32
49 |     } -> tensor<2x10xf32>
50 |     %14 = tensor.empty() : tensor<2x1xi64>
51 |     %15 = linalg.fill ins(%c0_i64 : i64) outs(%14 : tensor<2x1xi64>) -> tensor<2x1xi64>
52 |     %16 = tensor.empty() : tensor<2x1xf32>
53 |     %17 = linalg.fill ins(%cst_0 : f32) outs(%16 : tensor<2x1xf32>) -> tensor<2x1xf32>
54 |     %18:2 = linalg.generic {indexing_maps = [#map, #map3, #map3], iterator_types = ["parallel", "reduction"]} ins(%13 : tensor<2x10xf32>) outs(%17, %15 : tensor<2x1xf32>, tensor<2x1xi64>) { // reduction over d1: running maximum (#0) and the d1 index where it was found (#1)
55 |     ^bb0(%in: f32, %out: f32, %out_1: i64):
56 |       %25 = linalg.index 1 : index
57 |       %26 = arith.index_cast %25 : index to i64
58 |       %27 = arith.maximumf %in, %out : f32
59 |       %28 = arith.cmpf ogt, %in, %out : f32
60 |       %29 = arith.select %28, %26, %out_1 : i64
61 |       linalg.yield %27, %29 : f32, i64
62 |     } -> (tensor<2x1xf32>, tensor<2x1xi64>)
63 |     %19 = linalg.generic {indexing_maps = [#map, #map3, #map], iterator_types = ["parallel", "parallel"]} ins(%13, %18#0 : tensor<2x10xf32>, tensor<2x1xf32>) outs(%9 : tensor<2x10xf32>) { // subtract the per-row maximum from every element
64 |     ^bb0(%in: f32, %in_1: f32, %out: f32):
65 |       %25 = arith.subf %in, %in_1 : f32
66 |       linalg.yield %25 : f32
67 |     } -> tensor<2x10xf32>
68 |     %20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<2x10xf32>) outs(%9 : tensor<2x10xf32>) { // element-wise exp of the shifted values
69 |     ^bb0(%in: f32, %out: f32):
70 |       %25 = math.exp %in : f32
71 |       linalg.yield %25 : f32
72 |     } -> tensor<2x10xf32>
73 |     %21 = linalg.fill ins(%cst : f32) outs(%16 : tensor<2x1xf32>) -> tensor<2x1xf32>
74 |     %22 = linalg.generic {indexing_maps = [#map, #map3], iterator_types = ["parallel", "reduction"]} ins(%20 : tensor<2x10xf32>) outs(%21 : tensor<2x1xf32>) { // sum of the exponentials over d1
75 |     ^bb0(%in: f32, %out: f32):
76 |       %25 = arith.addf %in, %out : f32
77 |       linalg.yield %25 : f32
78 |     } -> tensor<2x1xf32>
79 |     %23 = linalg.generic {indexing_maps = [#map3, #map], iterator_types = ["parallel", "parallel"]} ins(%22 : tensor<2x1xf32>) outs(%16 : tensor<2x1xf32>) { // log of that sum
80 |     ^bb0(%in: f32, %out: f32):
81 |       %25 = math.log %in : f32
82 |       linalg.yield %25 : f32
83 |     } -> tensor<2x1xf32>
84 |     %24 = linalg.generic {indexing_maps = [#map, #map3, #map], iterator_types = ["parallel", "parallel"]} ins(%19, %23 : tensor<2x10xf32>, tensor<2x1xf32>) outs(%9 : tensor<2x10xf32>) { // shifted value minus log-sum-exp, i.e. a log-softmax over d1
85 |     ^bb0(%in: f32, %in_1: f32, %out: f32):
86 |       %25 = arith.subf %in, %in_1 : f32
87 |       linalg.yield %25 : f32
88 |     } -> tensor<2x10xf32>
89 |     return %24 : tensor<2x10xf32>
90 |   }
91 | }
92 | 
93 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/mlir/mnist.mlir:
--------------------------------------------------------------------------------
  1 | linalg-on-tensors 
  2 |  #map = affine_map<(d0, d1, d2, d3) -> (d1)>
  3 | #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
  4 | #map2 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
  5 | #map3 = affine_map<(d0, d1, d2, d3) -> ()>
  6 | #map4 = affine_map<(d0, d1) -> (d0, d1)>
  7 | #map5 = affine_map<(d0, d1) -> (d1, d0)>
  8 | #map6 = affine_map<(d0, d1) -> (d1)>
  9 | #map7 = affine_map<(d0, d1) -> (0, d1)>
 10 | #map8 = affine_map<(d0, d1) -> ()>
 11 | #map9 = affine_map<(d0, d1) -> (d0, 0)>
 12 | #map10 = affine_map<(d0, d1) -> (0, 0)>
 13 | module attributes {torch.debug_module_name = "_lambda"} {
 14 |   ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
 15 |   func.func @forward(%arg0: tensor<32x1x3x3xf32>, %arg1: tensor<32xf32>, %arg2: tensor<64x32x3x3xf32>, %arg3: tensor<64xf32>, %arg4: tensor<128x9216xf32>, %arg5: tensor<128xf32>, %arg6: tensor<10x128xf32>, %arg7: tensor<10xf32>, %arg8: tensor<1x1x28x28xf32>) -> tensor<1x10xf32> {
 16 |     %cst = arith.constant 0.000000e+00 : f32
 17 |     %cst_0 = arith.constant 0xFF800000 : f32
 18 |     %c6364136223846793005_i64 = arith.constant 6364136223846793005 : i64
 19 |     %c1442695040888963407_i64 = arith.constant 1442695040888963407 : i64
 20 |     %c32_i64 = arith.constant 32 : i64
 21 |     %cst_1 = arith.constant 5.4210107999999998E-20 : f64
 22 |     %cst_2 = arith.constant 5.000000e-01 : f64
 23 |     %cst_3 = arith.constant 0.000000e+00 : f64
 24 |     %cst_4 = arith.constant 7.500000e-01 : f64
 25 |     %c0_i64 = arith.constant 0 : i64
 26 |     %c64_i64 = arith.constant 64 : i64
 27 |     %c12_i64 = arith.constant 12 : i64
 28 |     %cst_5 = arith.constant 7.500000e-01 : f32
 29 |     %c128_i64 = arith.constant 128 : i64
 30 |     %cst_6 = arith.constant 5.000000e-01 : f32
 31 |     %0 = tensor.empty() : tensor<1x32x26x26xf32>
 32 |     %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg1 : tensor<32xf32>) outs(%0 : tensor<1x32x26x26xf32>) {
 33 |     ^bb0(%in: f32, %out: f32):
 34 |       linalg.yield %in : f32
 35 |     } -> tensor<1x32x26x26xf32>
 36 |     %2 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%arg8, %arg0 : tensor<1x1x28x28xf32>, tensor<32x1x3x3xf32>) outs(%1 : tensor<1x32x26x26xf32>) -> tensor<1x32x26x26xf32>
 37 |     %3 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1x32x26x26xf32>) outs(%0 : tensor<1x32x26x26xf32>) {
 38 |     ^bb0(%in: f32, %out: f32):
 39 |       %59 = arith.cmpf ugt, %in, %cst : f32
 40 |       %60 = arith.select %59, %in, %cst : f32
 41 |       linalg.yield %60 : f32
 42 |     } -> tensor<1x32x26x26xf32>
 43 |     %4 = tensor.empty() : tensor<1x64x24x24xf32>
 44 |     %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg3 : tensor<64xf32>) outs(%4 : tensor<1x64x24x24xf32>) {
 45 |     ^bb0(%in: f32, %out: f32):
 46 |       linalg.yield %in : f32
 47 |     } -> tensor<1x64x24x24xf32>
 48 |     %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%3, %arg2 : tensor<1x32x26x26xf32>, tensor<64x32x3x3xf32>) outs(%5 : tensor<1x64x24x24xf32>) -> tensor<1x64x24x24xf32>
 49 |     %7 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x64x24x24xf32>) outs(%4 : tensor<1x64x24x24xf32>) {
 50 |     ^bb0(%in: f32, %out: f32):
 51 |       %59 = arith.cmpf ugt, %in, %cst : f32
 52 |       %60 = arith.select %59, %in, %cst : f32
 53 |       linalg.yield %60 : f32
 54 |     } -> tensor<1x64x24x24xf32>
 55 |     %8 = tensor.empty() : tensor<1x64x12x12xf32>
 56 |     %9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<1x64x12x12xf32>) -> tensor<1x64x12x12xf32>
 57 |     %10 = tensor.empty() : tensor<2x2xf32>
 58 |     %11 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%7, %10 : tensor<1x64x24x24xf32>, tensor<2x2xf32>) outs(%9 : tensor<1x64x12x12xf32>) -> tensor<1x64x12x12xf32>
 59 |     %12 = tensor.empty() : tensor<f64>
 60 |     %13 = linalg.fill ins(%cst_4 : f64) outs(%12 : tensor<f64>) -> tensor<f64>
 61 |     %14 = ml_program.global_load @global_seed : tensor<i64>
 62 |     %extracted = tensor.extract %14[] : tensor<i64>
 63 |     %15 = arith.muli %extracted, %c6364136223846793005_i64 : i64
 64 |     %16 = arith.addi %15, %c1442695040888963407_i64 : i64
 65 |     %inserted = tensor.insert %16 into %14[] : tensor<i64>
 66 |     ml_program.global_store @global_seed = %inserted : tensor<i64>
 67 |     %17 = tensor.empty() : tensor<1x64x12x12xf64>
 68 |     %18 = linalg.generic {indexing_maps = [#map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} outs(%17 : tensor<1x64x12x12xf64>) {
 69 |     ^bb0(%out: f64):
 70 |       %59 = linalg.index 0 : index
 71 |       %60 = arith.index_cast %59 : index to i64
 72 |       %61 = linalg.index 1 : index
 73 |       %62 = arith.index_cast %61 : index to i64
 74 |       %63 = linalg.index 2 : index
 75 |       %64 = arith.index_cast %63 : index to i64
 76 |       %65 = linalg.index 3 : index
 77 |       %66 = arith.index_cast %65 : index to i64
 78 |       %67 = arith.muli %60, %c64_i64 : i64
 79 |       %68 = arith.addi %67, %62 : i64
 80 |       %69 = arith.muli %68, %c12_i64 : i64
 81 |       %70 = arith.addi %69, %64 : i64
 82 |       %71 = arith.muli %70, %c12_i64 : i64
 83 |       %72 = arith.addi %71, %66 : i64
 84 |       %73 = arith.muli %72, %16 : i64
 85 |       %74 = arith.addi %73, %16 : i64
 86 |       %75 = arith.muli %73, %73 : i64
 87 |       %76 = arith.addi %75, %73 : i64
 88 |       %77 = arith.shli %76, %c32_i64 : i64
 89 |       %78 = arith.shrui %76, %c32_i64 : i64
 90 |       %79 = arith.ori %77, %78 : i64
 91 |       %80 = arith.muli %79, %79 : i64
 92 |       %81 = arith.addi %80, %74 : i64
 93 |       %82 = arith.shli %81, %c32_i64 : i64
 94 |       %83 = arith.shrui %81, %c32_i64 : i64
 95 |       %84 = arith.ori %82, %83 : i64
 96 |       %85 = arith.muli %84, %84 : i64
 97 |       %86 = arith.addi %85, %73 : i64
 98 |       %87 = arith.shli %86, %c32_i64 : i64
 99 |       %88 = arith.shrui %86, %c32_i64 : i64
100 |       %89 = arith.ori %87, %88 : i64
101 |       %90 = arith.muli %89, %89 : i64
102 |       %91 = arith.addi %90, %74 : i64
103 |       %92 = arith.shli %91, %c32_i64 : i64
104 |       %93 = arith.shrui %91, %c32_i64 : i64
105 |       %94 = arith.ori %92, %93 : i64
106 |       %95 = arith.muli %94, %94 : i64
107 |       %96 = arith.addi %95, %73 : i64
108 |       %97 = arith.shrui %96, %c32_i64 : i64
109 |       %98 = arith.xori %91, %97 : i64
110 |       %99 = arith.uitofp %98 : i64 to f64
111 |       %100 = arith.mulf %99, %cst_1 : f64
112 |       %101 = arith.addf %100, %cst_3 : f64
113 |       linalg.yield %101 : f64
114 |     } -> tensor<1x64x12x12xf64>
115 |     %19 = tensor.empty() : tensor<1x64x12x12xi1>
116 |     %20 = linalg.generic {indexing_maps = [#map2, #map3, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%18, %13 : tensor<1x64x12x12xf64>, tensor<f64>) outs(%19 : tensor<1x64x12x12xi1>) {
117 |     ^bb0(%in: f64, %in_9: f64, %out: i1):
118 |       %59 = arith.cmpf ult, %in, %in_9 : f64
119 |       linalg.yield %59 : i1
120 |     } -> tensor<1x64x12x12xi1>
121 |     %21 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%20 : tensor<1x64x12x12xi1>) outs(%8 : tensor<1x64x12x12xf32>) {
122 |     ^bb0(%in: i1, %out: f32):
123 |       %59 = arith.uitofp %in : i1 to f32
124 |       linalg.yield %59 : f32
125 |     } -> tensor<1x64x12x12xf32>
126 |     %22 = linalg.generic {indexing_maps = [#map2, #map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %11 : tensor<1x64x12x12xf32>, tensor<1x64x12x12xf32>) outs(%8 : tensor<1x64x12x12xf32>) {
127 |     ^bb0(%in: f32, %in_9: f32, %out: f32):
128 |       %59 = arith.mulf %in, %in_9 : f32
129 |       linalg.yield %59 : f32
130 |     } -> tensor<1x64x12x12xf32>
131 |     %23 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22 : tensor<1x64x12x12xf32>) outs(%8 : tensor<1x64x12x12xf32>) {
132 |     ^bb0(%in: f32, %out: f32):
133 |       %59 = arith.divf %in, %cst_5 : f32
134 |       linalg.yield %59 : f32
135 |     } -> tensor<1x64x12x12xf32>
136 |     %collapsed = tensor.collapse_shape %23 [[0], [1, 2, 3]] : tensor<1x64x12x12xf32> into tensor<1x9216xf32>
137 |     %24 = tensor.empty() : tensor<9216x128xf32>
138 |     %25 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%arg4 : tensor<128x9216xf32>) outs(%24 : tensor<9216x128xf32>) {
139 |     ^bb0(%in: f32, %out: f32):
140 |       linalg.yield %in : f32
141 |     } -> tensor<9216x128xf32>
142 |     %26 = tensor.empty() : tensor<1x128xf32>
143 |     %27 = linalg.fill ins(%cst : f32) outs(%26 : tensor<1x128xf32>) -> tensor<1x128xf32>
144 |     %28 = linalg.matmul ins(%collapsed, %25 : tensor<1x9216xf32>, tensor<9216x128xf32>) outs(%27 : tensor<1x128xf32>) -> tensor<1x128xf32>
145 |     %29 = linalg.generic {indexing_maps = [#map6, #map7, #map4], iterator_types = ["parallel", "parallel"]} ins(%arg5, %28 : tensor<128xf32>, tensor<1x128xf32>) outs(%26 : tensor<1x128xf32>) {
146 |     ^bb0(%in: f32, %in_9: f32, %out: f32):
147 |       %59 = arith.addf %in, %in_9 : f32
148 |       linalg.yield %59 : f32
149 |     } -> tensor<1x128xf32>
150 |     %30 = linalg.generic {indexing_maps = [#map7, #map4], iterator_types = ["parallel", "parallel"]} ins(%29 : tensor<1x128xf32>) outs(%26 : tensor<1x128xf32>) {
151 |     ^bb0(%in: f32, %out: f32):
152 |       %59 = arith.cmpf ugt, %in, %cst : f32
153 |       %60 = arith.select %59, %in, %cst : f32
154 |       linalg.yield %60 : f32
155 |     } -> tensor<1x128xf32>
156 |     %31 = linalg.fill ins(%cst_2 : f64) outs(%12 : tensor<f64>) -> tensor<f64>
157 |     %32 = ml_program.global_load @global_seed : tensor<i64>
158 |     %extracted_7 = tensor.extract %32[] : tensor<i64>
159 |     %33 = arith.muli %extracted_7, %c6364136223846793005_i64 : i64
160 |     %34 = arith.addi %33, %c1442695040888963407_i64 : i64
161 |     %inserted_8 = tensor.insert %34 into %32[] : tensor<i64>
162 |     ml_program.global_store @global_seed = %inserted_8 : tensor<i64>
163 |     %35 = tensor.empty() : tensor<1x128xf64>
164 |     %36 = linalg.generic {indexing_maps = [#map4], iterator_types = ["parallel", "parallel"]} outs(%35 : tensor<1x128xf64>) {
165 |     ^bb0(%out: f64):
166 |       %59 = linalg.index 0 : index
167 |       %60 = arith.index_cast %59 : index to i64
168 |       %61 = linalg.index 1 : index
169 |       %62 = arith.index_cast %61 : index to i64
170 |       %63 = arith.muli %60, %c128_i64 : i64
171 |       %64 = arith.addi %63, %62 : i64
172 |       %65 = arith.muli %64, %34 : i64
173 |       %66 = arith.addi %65, %34 : i64
174 |       %67 = arith.muli %65, %65 : i64
175 |       %68 = arith.addi %67, %65 : i64
176 |       %69 = arith.shli %68, %c32_i64 : i64
177 |       %70 = arith.shrui %68, %c32_i64 : i64
178 |       %71 = arith.ori %69, %70 : i64
179 |       %72 = arith.muli %71, %71 : i64
180 |       %73 = arith.addi %72, %66 : i64
181 |       %74 = arith.shli %73, %c32_i64 : i64
182 |       %75 = arith.shrui %73, %c32_i64 : i64
183 |       %76 = arith.ori %74, %75 : i64
184 |       %77 = arith.muli %76, %76 : i64
185 |       %78 = arith.addi %77, %65 : i64
186 |       %79 = arith.shli %78, %c32_i64 : i64
187 |       %80 = arith.shrui %78, %c32_i64 : i64
188 |       %81 = arith.ori %79, %80 : i64
189 |       %82 = arith.muli %81, %81 : i64
190 |       %83 = arith.addi %82, %66 : i64
191 |       %84 = arith.shli %83, %c32_i64 : i64
192 |       %85 = arith.shrui %83, %c32_i64 : i64
193 |       %86 = arith.ori %84, %85 : i64
194 |       %87 = arith.muli %86, %86 : i64
195 |       %88 = arith.addi %87, %65 : i64
196 |       %89 = arith.shrui %88, %c32_i64 : i64
197 |       %90 = arith.xori %83, %89 : i64
198 |       %91 = arith.uitofp %90 : i64 to f64
199 |       %92 = arith.mulf %91, %cst_1 : f64
200 |       %93 = arith.addf %92, %cst_3 : f64
201 |       linalg.yield %93 : f64
202 |     } -> tensor<1x128xf64>
203 |     %37 = tensor.empty() : tensor<1x128xi1>
204 |     %38 = linalg.generic {indexing_maps = [#map7, #map8, #map4], iterator_types = ["parallel", "parallel"]} ins(%36, %31 : tensor<1x128xf64>, tensor<f64>) outs(%37 : tensor<1x128xi1>) {
205 |     ^bb0(%in: f64, %in_9: f64, %out: i1):
206 |       %59 = arith.cmpf ult, %in, %in_9 : f64
207 |       linalg.yield %59 : i1
208 |     } -> tensor<1x128xi1>
209 |     %39 = linalg.generic {indexing_maps = [#map7, #map4], iterator_types = ["parallel", "parallel"]} ins(%38 : tensor<1x128xi1>) outs(%26 : tensor<1x128xf32>) {
210 |     ^bb0(%in: i1, %out: f32):
211 |       %59 = arith.uitofp %in : i1 to f32
212 |       linalg.yield %59 : f32
213 |     } -> tensor<1x128xf32>
214 |     %40 = linalg.generic {indexing_maps = [#map7, #map7, #map4], iterator_types = ["parallel", "parallel"]} ins(%39, %30 : tensor<1x128xf32>, tensor<1x128xf32>) outs(%26 : tensor<1x128xf32>) {
215 |     ^bb0(%in: f32, %in_9: f32, %out: f32):
216 |       %59 = arith.mulf %in, %in_9 : f32
217 |       linalg.yield %59 : f32
218 |     } -> tensor<1x128xf32>
219 |     %41 = linalg.generic {indexing_maps = [#map7, #map4], iterator_types = ["parallel", "parallel"]} ins(%40 : tensor<1x128xf32>) outs(%26 : tensor<1x128xf32>) {
220 |     ^bb0(%in: f32, %out: f32):
221 |       %59 = arith.divf %in, %cst_6 : f32
222 |       linalg.yield %59 : f32
223 |     } -> tensor<1x128xf32>
224 |     %42 = tensor.empty() : tensor<128x10xf32>
225 |     %43 = linalg.generic {indexing_maps = [#map4, #map5], iterator_types = ["parallel", "parallel"]} ins(%arg6 : tensor<10x128xf32>) outs(%42 : tensor<128x10xf32>) {
226 |     ^bb0(%in: f32, %out: f32):
227 |       linalg.yield %in : f32
228 |     } -> tensor<128x10xf32>
229 |     %44 = tensor.empty() : tensor<1x10xf32>
230 |     %45 = linalg.fill ins(%cst : f32) outs(%44 : tensor<1x10xf32>) -> tensor<1x10xf32>
231 |     %46 = linalg.matmul ins(%41, %43 : tensor<1x128xf32>, tensor<128x10xf32>) outs(%45 : tensor<1x10xf32>) -> tensor<1x10xf32>
232 |     %47 = linalg.generic {indexing_maps = [#map6, #map7, #map4], iterator_types = ["parallel", "parallel"]} ins(%arg7, %46 : tensor<10xf32>, tensor<1x10xf32>) outs(%44 : tensor<1x10xf32>) {
233 |     ^bb0(%in: f32, %in_9: f32, %out: f32):
234 |       %59 = arith.addf %in, %in_9 : f32
235 |       linalg.yield %59 : f32
236 |     } -> tensor<1x10xf32>
237 |     %48 = tensor.empty() : tensor<1x1xi64>
238 |     %49 = linalg.fill ins(%c0_i64 : i64) outs(%48 : tensor<1x1xi64>) -> tensor<1x1xi64>
239 |     %50 = tensor.empty() : tensor<1x1xf32>
240 |     %51 = linalg.fill ins(%cst_0 : f32) outs(%50 : tensor<1x1xf32>) -> tensor<1x1xf32>
241 |     %52:2 = linalg.generic {indexing_maps = [#map4, #map9, #map9], iterator_types = ["parallel", "reduction"]} ins(%47 : tensor<1x10xf32>) outs(%51, %49 : tensor<1x1xf32>, tensor<1x1xi64>) {
242 |     ^bb0(%in: f32, %out: f32, %out_9: i64):
243 |       %59 = linalg.index 1 : index
244 |       %60 = arith.index_cast %59 : index to i64
245 |       %61 = arith.maximumf %in, %out : f32
246 |       %62 = arith.cmpf ogt, %in, %out : f32
247 |       %63 = arith.select %62, %60, %out_9 : i64
248 |       linalg.yield %61, %63 : f32, i64
249 |     } -> (tensor<1x1xf32>, tensor<1x1xi64>)
250 |     %53 = linalg.generic {indexing_maps = [#map7, #map10, #map4], iterator_types = ["parallel", "parallel"]} ins(%47, %52#0 : tensor<1x10xf32>, tensor<1x1xf32>) outs(%44 : tensor<1x10xf32>) {
251 |     ^bb0(%in: f32, %in_9: f32, %out: f32):
252 |       %59 = arith.subf %in, %in_9 : f32
253 |       linalg.yield %59 : f32
254 |     } -> tensor<1x10xf32>
255 |     %54 = linalg.generic {indexing_maps = [#map7, #map4], iterator_types = ["parallel", "parallel"]} ins(%53 : tensor<1x10xf32>) outs(%44 : tensor<1x10xf32>) {
256 |     ^bb0(%in: f32, %out: f32):
257 |       %59 = math.exp %in : f32
258 |       linalg.yield %59 : f32
259 |     } -> tensor<1x10xf32>
260 |     %55 = linalg.fill ins(%cst : f32) outs(%50 : tensor<1x1xf32>) -> tensor<1x1xf32>
261 |     %56 = linalg.generic {indexing_maps = [#map4, #map9], iterator_types = ["parallel", "reduction"]} ins(%54 : tensor<1x10xf32>) outs(%55 : tensor<1x1xf32>) {
262 |     ^bb0(%in: f32, %out: f32):
263 |       %59 = arith.addf %in, %out : f32
264 |       linalg.yield %59 : f32
265 |     } -> tensor<1x1xf32>
266 |     %57 = linalg.generic {indexing_maps = [#map10, #map4], iterator_types = ["parallel", "parallel"]} ins(%56 : tensor<1x1xf32>) outs(%50 : tensor<1x1xf32>) {
267 |     ^bb0(%in: f32, %out: f32):
268 |       %59 = math.log %in : f32
269 |       linalg.yield %59 : f32
270 |     } -> tensor<1x1xf32>
271 |     %58 = linalg.generic {indexing_maps = [#map7, #map10, #map4], iterator_types = ["parallel", "parallel"]} ins(%53, %57 : tensor<1x10xf32>, tensor<1x1xf32>) outs(%44 : tensor<1x10xf32>) {
272 |     ^bb0(%in: f32, %in_9: f32, %out: f32):
273 |       %59 = arith.subf %in, %in_9 : f32
274 |       linalg.yield %59 : f32
275 |     } -> tensor<1x10xf32>
276 |     return %58 : tensor<1x10xf32>
277 |   }
278 | }
279 | 
280 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/mlir/resnet18.mlir:
--------------------------------------------------------------------------------
   1 | linalg-on-tensors 
   2 |  #map = affine_map<(d0) -> (d0)>  // 1-D identity: element i of the operand is read/written at iteration i
   3 | #map1 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>  // 4-D iteration space; leading index pinned to constant 0 (applied below to inputs whose first extent is 1)
   4 | #map2 = affine_map<(d0, d1, d2, d3) -> (d1, 0, 0)>  // indexes a rank-3 operand by d1 only (applied below to the 64x1x1 expanded per-channel tensors)
   5 | #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>  // 4-D identity (applied below to the outs operand of the 4-D elementwise generics)
   6 | #map4 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>  // 6-D iteration space projected onto its first four dims; d4/d5 are dropped
   7 | #map5 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d4, d5)>  // 6-D iteration space projected onto its last two dims (applied below to the 3x3 window operand)
   8 | #map6 = affine_map<(d0, d1, d2, d3) -> (d0, d1, 0, 0)>  // 4-D iteration space; last two indices pinned to constant 0
   9 | #map7 = affine_map<(d0, d1, d2, d3) -> (0, d1, 0, 0)>  // 4-D iteration space; only d1 varies, every other index pinned to constant 0
  10 | #map8 = affine_map<(d0, d1) -> (d0, d1)>  // 2-D identity
  11 | #map9 = affine_map<(d0, d1) -> (d1, d0)>  // 2-D with swapped indices (transposed access)
  12 | #map10 = affine_map<(d0, d1) -> (d1)>  // indexes a rank-1 operand by d1, ignoring d0
  13 | #map11 = affine_map<(d0, d1) -> (0, d1)>  // 2-D iteration space; first index pinned to constant 0
  14 | module attributes {torch.debug_module_name = "_lambda"} {
  15 |   ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  16 |   func.func @forward(%arg0: tensor<64x3x7x7xf32>, %arg1: tensor<64xf32>, %arg2: tensor<64xf32>, %arg3: tensor<64x64x3x3xf32>, %arg4: tensor<64xf32>, %arg5: tensor<64xf32>, %arg6: tensor<64x64x3x3xf32>, %arg7: tensor<64xf32>, %arg8: tensor<64xf32>, %arg9: tensor<64x64x3x3xf32>, %arg10: tensor<64xf32>, %arg11: tensor<64xf32>, %arg12: tensor<64x64x3x3xf32>, %arg13: tensor<64xf32>, %arg14: tensor<64xf32>, %arg15: tensor<128x64x3x3xf32>, %arg16: tensor<128xf32>, %arg17: tensor<128xf32>, %arg18: tensor<128x128x3x3xf32>, %arg19: tensor<128xf32>, %arg20: tensor<128xf32>, %arg21: tensor<128x64x1x1xf32>, %arg22: tensor<128xf32>, %arg23: tensor<128xf32>, %arg24: tensor<128x128x3x3xf32>, %arg25: tensor<128xf32>, %arg26: tensor<128xf32>, %arg27: tensor<128x128x3x3xf32>, %arg28: tensor<128xf32>, %arg29: tensor<128xf32>, %arg30: tensor<256x128x3x3xf32>, %arg31: tensor<256xf32>, %arg32: tensor<256xf32>, %arg33: tensor<256x256x3x3xf32>, %arg34: tensor<256xf32>, %arg35: tensor<256xf32>, %arg36: tensor<256x128x1x1xf32>, %arg37: tensor<256xf32>, %arg38: tensor<256xf32>, %arg39: tensor<256x256x3x3xf32>, %arg40: tensor<256xf32>, %arg41: tensor<256xf32>, %arg42: tensor<256x256x3x3xf32>, %arg43: tensor<256xf32>, %arg44: tensor<256xf32>, %arg45: tensor<512x256x3x3xf32>, %arg46: tensor<512xf32>, %arg47: tensor<512xf32>, %arg48: tensor<512x512x3x3xf32>, %arg49: tensor<512xf32>, %arg50: tensor<512xf32>, %arg51: tensor<512x256x1x1xf32>, %arg52: tensor<512xf32>, %arg53: tensor<512xf32>, %arg54: tensor<512x512x3x3xf32>, %arg55: tensor<512xf32>, %arg56: tensor<512xf32>, %arg57: tensor<512x512x3x3xf32>, %arg58: tensor<512xf32>, %arg59: tensor<512xf32>, %arg60: tensor<1000x512xf32>, %arg61: tensor<1000xf32>, %arg62: tensor<64xf32>, %arg63: tensor<64xf32>, %arg64: tensor<i64>, %arg65: tensor<64xf32>, %arg66: tensor<64xf32>, %arg67: tensor<i64>, %arg68: tensor<64xf32>, %arg69: tensor<64xf32>, %arg70: tensor<i64>, %arg71: tensor<64xf32>, %arg72: tensor<64xf32>, %arg73: tensor<i64>, %arg74: 
tensor<64xf32>, %arg75: tensor<64xf32>, %arg76: tensor<i64>, %arg77: tensor<128xf32>, %arg78: tensor<128xf32>, %arg79: tensor<i64>, %arg80: tensor<128xf32>, %arg81: tensor<128xf32>, %arg82: tensor<i64>, %arg83: tensor<128xf32>, %arg84: tensor<128xf32>, %arg85: tensor<i64>, %arg86: tensor<128xf32>, %arg87: tensor<128xf32>, %arg88: tensor<i64>, %arg89: tensor<128xf32>, %arg90: tensor<128xf32>, %arg91: tensor<i64>, %arg92: tensor<256xf32>, %arg93: tensor<256xf32>, %arg94: tensor<i64>, %arg95: tensor<256xf32>, %arg96: tensor<256xf32>, %arg97: tensor<i64>, %arg98: tensor<256xf32>, %arg99: tensor<256xf32>, %arg100: tensor<i64>, %arg101: tensor<256xf32>, %arg102: tensor<256xf32>, %arg103: tensor<i64>, %arg104: tensor<256xf32>, %arg105: tensor<256xf32>, %arg106: tensor<i64>, %arg107: tensor<512xf32>, %arg108: tensor<512xf32>, %arg109: tensor<i64>, %arg110: tensor<512xf32>, %arg111: tensor<512xf32>, %arg112: tensor<i64>, %arg113: tensor<512xf32>, %arg114: tensor<512xf32>, %arg115: tensor<i64>, %arg116: tensor<512xf32>, %arg117: tensor<512xf32>, %arg118: tensor<i64>, %arg119: tensor<512xf32>, %arg120: tensor<512xf32>, %arg121: tensor<i64>, %arg122: tensor<1x3x224x224xf32>) -> (tensor<1x1000xf32>, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, 
tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x3x224x224xf32>, tensor<1x64x112x112xf32>, tensor<1x64x112x112xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512xf32>, tensor<512x1000xf32>) {
  17 |     %c-1_i64 = arith.constant -1 : i64
  18 |     %c2 = arith.constant 2 : index
  19 |     %cst = arith.constant 0.000000e+00 : f32
  20 |     %cst_0 = arith.constant 1.000000e+00 : f32
  21 |     %cst_1 = arith.constant 0xFF800000 : f32
  22 |     %cst_2 = arith.constant 1.000000e-05 : f64
  23 |     %c112 = arith.constant 112 : index
  24 |     %c3 = arith.constant 3 : index
  25 |     %c0 = arith.constant 0 : index
  26 |     %c1 = arith.constant 1 : index
  27 |     %cst_3 = arith.constant 4.900000e+01 : f32
  28 |     %padded = tensor.pad %arg122 low[0, 0, 3, 3] high[0, 0, 3, 3] {
  29 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
  30 |       tensor.yield %cst : f32
  31 |     } : tensor<1x3x224x224xf32> to tensor<1x3x230x230xf32>
  32 |     %0 = tensor.empty() : tensor<1x64x112x112xf32>
  33 |     %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
  34 |     %2 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded, %arg0 : tensor<1x3x230x230xf32>, tensor<64x3x7x7xf32>) outs(%1 : tensor<1x64x112x112xf32>) -> tensor<1x64x112x112xf32>
  35 |     %3 = tensor.empty() : tensor<64xf32>
  36 |     %4 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg63 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
  37 |     ^bb0(%in: f32, %out: f32):
  38 |       %216 = arith.truncf %cst_2 : f64 to f32
  39 |       %217 = arith.addf %in, %216 : f32
  40 |       linalg.yield %217 : f32
  41 |     } -> tensor<64xf32>
  42 |     %5 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%4 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
  43 |     ^bb0(%in: f32, %out: f32):
  44 |       %216 = math.sqrt %in : f32
  45 |       linalg.yield %216 : f32
  46 |     } -> tensor<64xf32>
  47 |     %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%5 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
  48 |     ^bb0(%in: f32, %out: f32):
  49 |       %216 = arith.cmpf one, %in, %cst : f32
  50 |       cf.assert %216, "unimplemented: tensor with zero element"
  51 |       %217 = arith.divf %cst_0, %in : f32
  52 |       linalg.yield %217 : f32
  53 |     } -> tensor<64xf32>
  54 |     %expanded = tensor.expand_shape %arg62 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
  55 |     %expanded_4 = tensor.expand_shape %6 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
  56 |     %7 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2, %expanded : tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) outs(%0 : tensor<1x64x112x112xf32>) {
  57 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
  58 |       %216 = arith.subf %in, %in_100 : f32
  59 |       linalg.yield %216 : f32
  60 |     } -> tensor<1x64x112x112xf32>
  61 |     %8 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %expanded_4 : tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) outs(%0 : tensor<1x64x112x112xf32>) {
  62 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
  63 |       %216 = arith.mulf %in, %in_100 : f32
  64 |       linalg.yield %216 : f32
  65 |     } -> tensor<1x64x112x112xf32>
  66 |     %expanded_5 = tensor.expand_shape %arg1 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
  67 |     %9 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%8, %expanded_5 : tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) outs(%0 : tensor<1x64x112x112xf32>) {
  68 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
  69 |       %216 = arith.mulf %in, %in_100 : f32
  70 |       linalg.yield %216 : f32
  71 |     } -> tensor<1x64x112x112xf32>
  72 |     %expanded_6 = tensor.expand_shape %arg2 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
  73 |     %10 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%9, %expanded_6 : tensor<1x64x112x112xf32>, tensor<64x1x1xf32>) outs(%0 : tensor<1x64x112x112xf32>) {
  74 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
  75 |       %216 = arith.addf %in, %in_100 : f32
  76 |       linalg.yield %216 : f32
  77 |     } -> tensor<1x64x112x112xf32>
  78 |     %11 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%10 : tensor<1x64x112x112xf32>) outs(%0 : tensor<1x64x112x112xf32>) {
  79 |     ^bb0(%in: f32, %out: f32):
  80 |       %216 = arith.cmpf ugt, %in, %cst : f32
  81 |       %217 = arith.select %216, %in, %cst : f32
  82 |       linalg.yield %217 : f32
  83 |     } -> tensor<1x64x112x112xf32>
  84 |     %padded_7 = tensor.pad %11 low[0, 0, 1, 1] high[0, 0, 1, 1] {
  85 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
  86 |       tensor.yield %cst_1 : f32
  87 |     } : tensor<1x64x112x112xf32> to tensor<1x64x114x114xf32>
  88 |     %12 = tensor.empty() : tensor<1x64x56x56xf32>
  89 |     %13 = linalg.fill ins(%cst_1 : f32) outs(%12 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
  90 |     %14 = tensor.empty() : tensor<3x3xf32>
  91 |     %15 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded_7, %14 : tensor<1x64x114x114xf32>, tensor<3x3xf32>) outs(%13 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
  92 |     %16 = tensor.empty() : tensor<1x64x56x56xi64>
  93 |     %17 = linalg.fill ins(%c-1_i64 : i64) outs(%16 : tensor<1x64x56x56xi64>) -> tensor<1x64x56x56xi64>
  94 |     %18 = tensor.empty() : tensor<3x3xi64>
  95 |     %19 = linalg.generic {indexing_maps = [#map4, #map5, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel", "reduction", "reduction"]} ins(%15, %18 : tensor<1x64x56x56xf32>, tensor<3x3xi64>) outs(%17 : tensor<1x64x56x56xi64>) {
  96 |     ^bb0(%in: f32, %in_100: i64, %out: i64):
  97 |       %216 = linalg.index 0 : index
  98 |       %217 = linalg.index 1 : index
  99 |       %218 = linalg.index 2 : index
 100 |       %219 = linalg.index 3 : index
 101 |       %220 = linalg.index 4 : index
 102 |       %221 = linalg.index 5 : index
 103 |       %222 = arith.muli %218, %c2 : index
 104 |       %223 = arith.addi %222, %220 : index
 105 |       %224 = arith.muli %219, %c2 : index
 106 |       %225 = arith.addi %224, %221 : index
 107 |       %extracted = tensor.extract %padded_7[%216, %217, %223, %225] : tensor<1x64x114x114xf32>
 108 |       %226 = arith.cmpf oeq, %extracted, %in : f32
 109 |       %227 = arith.subi %223, %c1 : index
 110 |       %228 = arith.subi %225, %c1 : index
 111 |       %229 = arith.muli %227, %c112 : index
 112 |       %230 = arith.addi %229, %228 : index
 113 |       %231 = arith.index_cast %230 : index to i64
 114 |       %232 = arith.select %226, %231, %out : i64
 115 |       %233 = arith.cmpi eq, %out, %c-1_i64 : i64
 116 |       %234 = arith.select %233, %232, %out : i64
 117 |       linalg.yield %234 : i64
 118 |     } -> tensor<1x64x56x56xi64>
 119 |     %padded_8 = tensor.pad %15 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 120 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 121 |       tensor.yield %cst : f32
 122 |     } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32>
 123 |     %20 = linalg.fill ins(%cst : f32) outs(%12 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
 124 |     %21 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_8, %arg3 : tensor<1x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%20 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
 125 |     %22 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg66 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 126 |     ^bb0(%in: f32, %out: f32):
 127 |       %216 = arith.truncf %cst_2 : f64 to f32
 128 |       %217 = arith.addf %in, %216 : f32
 129 |       linalg.yield %217 : f32
 130 |     } -> tensor<64xf32>
 131 |     %23 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%22 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 132 |     ^bb0(%in: f32, %out: f32):
 133 |       %216 = math.sqrt %in : f32
 134 |       linalg.yield %216 : f32
 135 |     } -> tensor<64xf32>
 136 |     %24 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%23 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 137 |     ^bb0(%in: f32, %out: f32):
 138 |       %216 = arith.cmpf one, %in, %cst : f32
 139 |       cf.assert %216, "unimplemented: tensor with zero element"
 140 |       %217 = arith.divf %cst_0, %in : f32
 141 |       linalg.yield %217 : f32
 142 |     } -> tensor<64xf32>
 143 |     %expanded_9 = tensor.expand_shape %arg65 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 144 |     %expanded_10 = tensor.expand_shape %24 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 145 |     %25 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%21, %expanded_9 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 146 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 147 |       %216 = arith.subf %in, %in_100 : f32
 148 |       linalg.yield %216 : f32
 149 |     } -> tensor<1x64x56x56xf32>
 150 |     %26 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%25, %expanded_10 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 151 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 152 |       %216 = arith.mulf %in, %in_100 : f32
 153 |       linalg.yield %216 : f32
 154 |     } -> tensor<1x64x56x56xf32>
 155 |     %expanded_11 = tensor.expand_shape %arg4 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 156 |     %27 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%26, %expanded_11 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 157 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 158 |       %216 = arith.mulf %in, %in_100 : f32
 159 |       linalg.yield %216 : f32
 160 |     } -> tensor<1x64x56x56xf32>
 161 |     %expanded_12 = tensor.expand_shape %arg5 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 162 |     %28 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%27, %expanded_12 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 163 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 164 |       %216 = arith.addf %in, %in_100 : f32
 165 |       linalg.yield %216 : f32
 166 |     } -> tensor<1x64x56x56xf32>
 167 |     %29 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%28 : tensor<1x64x56x56xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 168 |     ^bb0(%in: f32, %out: f32):
 169 |       %216 = arith.cmpf ugt, %in, %cst : f32
 170 |       %217 = arith.select %216, %in, %cst : f32
 171 |       linalg.yield %217 : f32
 172 |     } -> tensor<1x64x56x56xf32>
 173 |     %padded_13 = tensor.pad %29 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 174 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 175 |       tensor.yield %cst : f32
 176 |     } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32>
 177 |     %30 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_13, %arg6 : tensor<1x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%20 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
 178 |     %31 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg69 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 179 |     ^bb0(%in: f32, %out: f32):
 180 |       %216 = arith.truncf %cst_2 : f64 to f32
 181 |       %217 = arith.addf %in, %216 : f32
 182 |       linalg.yield %217 : f32
 183 |     } -> tensor<64xf32>
 184 |     %32 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%31 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 185 |     ^bb0(%in: f32, %out: f32):
 186 |       %216 = math.sqrt %in : f32
 187 |       linalg.yield %216 : f32
 188 |     } -> tensor<64xf32>
 189 |     %33 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%32 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 190 |     ^bb0(%in: f32, %out: f32):
 191 |       %216 = arith.cmpf one, %in, %cst : f32
 192 |       cf.assert %216, "unimplemented: tensor with zero element"
 193 |       %217 = arith.divf %cst_0, %in : f32
 194 |       linalg.yield %217 : f32
 195 |     } -> tensor<64xf32>
 196 |     %expanded_14 = tensor.expand_shape %arg68 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 197 |     %expanded_15 = tensor.expand_shape %33 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 198 |     %34 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%30, %expanded_14 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 199 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 200 |       %216 = arith.subf %in, %in_100 : f32
 201 |       linalg.yield %216 : f32
 202 |     } -> tensor<1x64x56x56xf32>
 203 |     %35 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%34, %expanded_15 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 204 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 205 |       %216 = arith.mulf %in, %in_100 : f32
 206 |       linalg.yield %216 : f32
 207 |     } -> tensor<1x64x56x56xf32>
 208 |     %expanded_16 = tensor.expand_shape %arg7 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 209 |     %36 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%35, %expanded_16 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 210 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 211 |       %216 = arith.mulf %in, %in_100 : f32
 212 |       linalg.yield %216 : f32
 213 |     } -> tensor<1x64x56x56xf32>
 214 |     %expanded_17 = tensor.expand_shape %arg8 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 215 |     %37 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%36, %expanded_17 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 216 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 217 |       %216 = arith.addf %in, %in_100 : f32
 218 |       linalg.yield %216 : f32
 219 |     } -> tensor<1x64x56x56xf32>
 220 |     %38 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%37, %15 : tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 221 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 222 |       %216 = arith.addf %in, %in_100 : f32
 223 |       linalg.yield %216 : f32
 224 |     } -> tensor<1x64x56x56xf32>
 225 |     %39 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%38 : tensor<1x64x56x56xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 226 |     ^bb0(%in: f32, %out: f32):
 227 |       %216 = arith.cmpf ugt, %in, %cst : f32
 228 |       %217 = arith.select %216, %in, %cst : f32
 229 |       linalg.yield %217 : f32
 230 |     } -> tensor<1x64x56x56xf32>
 231 |     %padded_18 = tensor.pad %39 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 232 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 233 |       tensor.yield %cst : f32
 234 |     } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32>
 235 |     %40 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_18, %arg9 : tensor<1x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%20 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
 236 |     %41 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg72 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 237 |     ^bb0(%in: f32, %out: f32):
 238 |       %216 = arith.truncf %cst_2 : f64 to f32
 239 |       %217 = arith.addf %in, %216 : f32
 240 |       linalg.yield %217 : f32
 241 |     } -> tensor<64xf32>
 242 |     %42 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%41 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 243 |     ^bb0(%in: f32, %out: f32):
 244 |       %216 = math.sqrt %in : f32
 245 |       linalg.yield %216 : f32
 246 |     } -> tensor<64xf32>
 247 |     %43 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%42 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 248 |     ^bb0(%in: f32, %out: f32):
 249 |       %216 = arith.cmpf one, %in, %cst : f32
 250 |       cf.assert %216, "unimplemented: tensor with zero element"
 251 |       %217 = arith.divf %cst_0, %in : f32
 252 |       linalg.yield %217 : f32
 253 |     } -> tensor<64xf32>
 254 |     %expanded_19 = tensor.expand_shape %arg71 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 255 |     %expanded_20 = tensor.expand_shape %43 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 256 |     %44 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%40, %expanded_19 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 257 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 258 |       %216 = arith.subf %in, %in_100 : f32
 259 |       linalg.yield %216 : f32
 260 |     } -> tensor<1x64x56x56xf32>
 261 |     %45 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44, %expanded_20 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 262 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 263 |       %216 = arith.mulf %in, %in_100 : f32
 264 |       linalg.yield %216 : f32
 265 |     } -> tensor<1x64x56x56xf32>
 266 |     %expanded_21 = tensor.expand_shape %arg10 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 267 |     %46 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%45, %expanded_21 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 268 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 269 |       %216 = arith.mulf %in, %in_100 : f32
 270 |       linalg.yield %216 : f32
 271 |     } -> tensor<1x64x56x56xf32>
 272 |     %expanded_22 = tensor.expand_shape %arg11 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 273 |     %47 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%46, %expanded_22 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 274 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 275 |       %216 = arith.addf %in, %in_100 : f32
 276 |       linalg.yield %216 : f32
 277 |     } -> tensor<1x64x56x56xf32>
 278 |     %48 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%47 : tensor<1x64x56x56xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 279 |     ^bb0(%in: f32, %out: f32):
 280 |       %216 = arith.cmpf ugt, %in, %cst : f32
 281 |       %217 = arith.select %216, %in, %cst : f32
 282 |       linalg.yield %217 : f32
 283 |     } -> tensor<1x64x56x56xf32>
 284 |     %padded_23 = tensor.pad %48 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 285 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 286 |       tensor.yield %cst : f32
 287 |     } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32>
 288 |     %49 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_23, %arg12 : tensor<1x64x58x58xf32>, tensor<64x64x3x3xf32>) outs(%20 : tensor<1x64x56x56xf32>) -> tensor<1x64x56x56xf32>
 289 |     %50 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg75 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 290 |     ^bb0(%in: f32, %out: f32):
 291 |       %216 = arith.truncf %cst_2 : f64 to f32
 292 |       %217 = arith.addf %in, %216 : f32
 293 |       linalg.yield %217 : f32
 294 |     } -> tensor<64xf32>
 295 |     %51 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%50 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 296 |     ^bb0(%in: f32, %out: f32):
 297 |       %216 = math.sqrt %in : f32
 298 |       linalg.yield %216 : f32
 299 |     } -> tensor<64xf32>
 300 |     %52 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%51 : tensor<64xf32>) outs(%3 : tensor<64xf32>) {
 301 |     ^bb0(%in: f32, %out: f32):
 302 |       %216 = arith.cmpf one, %in, %cst : f32
 303 |       cf.assert %216, "unimplemented: tensor with zero element"
 304 |       %217 = arith.divf %cst_0, %in : f32
 305 |       linalg.yield %217 : f32
 306 |     } -> tensor<64xf32>
 307 |     %expanded_24 = tensor.expand_shape %arg74 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 308 |     %expanded_25 = tensor.expand_shape %52 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 309 |     %53 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%49, %expanded_24 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 310 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 311 |       %216 = arith.subf %in, %in_100 : f32
 312 |       linalg.yield %216 : f32
 313 |     } -> tensor<1x64x56x56xf32>
 314 |     %54 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %expanded_25 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 315 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 316 |       %216 = arith.mulf %in, %in_100 : f32
 317 |       linalg.yield %216 : f32
 318 |     } -> tensor<1x64x56x56xf32>
 319 |     %expanded_26 = tensor.expand_shape %arg13 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 320 |     %55 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%54, %expanded_26 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 321 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 322 |       %216 = arith.mulf %in, %in_100 : f32
 323 |       linalg.yield %216 : f32
 324 |     } -> tensor<1x64x56x56xf32>
 325 |     %expanded_27 = tensor.expand_shape %arg14 [[0, 1, 2]] : tensor<64xf32> into tensor<64x1x1xf32>
 326 |     %56 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%55, %expanded_27 : tensor<1x64x56x56xf32>, tensor<64x1x1xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 327 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 328 |       %216 = arith.addf %in, %in_100 : f32
 329 |       linalg.yield %216 : f32
 330 |     } -> tensor<1x64x56x56xf32>
 331 |     %57 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%56, %39 : tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 332 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 333 |       %216 = arith.addf %in, %in_100 : f32
 334 |       linalg.yield %216 : f32
 335 |     } -> tensor<1x64x56x56xf32>
 336 |     %58 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%57 : tensor<1x64x56x56xf32>) outs(%12 : tensor<1x64x56x56xf32>) {
 337 |     ^bb0(%in: f32, %out: f32):
 338 |       %216 = arith.cmpf ugt, %in, %cst : f32
 339 |       %217 = arith.select %216, %in, %cst : f32
 340 |       linalg.yield %217 : f32
 341 |     } -> tensor<1x64x56x56xf32>
 342 |     %padded_28 = tensor.pad %58 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 343 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 344 |       tensor.yield %cst : f32
 345 |     } : tensor<1x64x56x56xf32> to tensor<1x64x58x58xf32>
 346 |     %59 = tensor.empty() : tensor<1x128x28x28xf32>
 347 |     %60 = linalg.fill ins(%cst : f32) outs(%59 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
 348 |     %61 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded_28, %arg15 : tensor<1x64x58x58xf32>, tensor<128x64x3x3xf32>) outs(%60 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
 349 |     %62 = tensor.empty() : tensor<128xf32>
 350 |     %63 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg78 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 351 |     ^bb0(%in: f32, %out: f32):
 352 |       %216 = arith.truncf %cst_2 : f64 to f32
 353 |       %217 = arith.addf %in, %216 : f32
 354 |       linalg.yield %217 : f32
 355 |     } -> tensor<128xf32>
 356 |     %64 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%63 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 357 |     ^bb0(%in: f32, %out: f32):
 358 |       %216 = math.sqrt %in : f32
 359 |       linalg.yield %216 : f32
 360 |     } -> tensor<128xf32>
 361 |     %65 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%64 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 362 |     ^bb0(%in: f32, %out: f32):
 363 |       %216 = arith.cmpf one, %in, %cst : f32
 364 |       cf.assert %216, "unimplemented: tensor with zero element"
 365 |       %217 = arith.divf %cst_0, %in : f32
 366 |       linalg.yield %217 : f32
 367 |     } -> tensor<128xf32>
 368 |     %expanded_29 = tensor.expand_shape %arg77 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 369 |     %expanded_30 = tensor.expand_shape %65 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 370 |     %66 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%61, %expanded_29 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 371 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 372 |       %216 = arith.subf %in, %in_100 : f32
 373 |       linalg.yield %216 : f32
 374 |     } -> tensor<1x128x28x28xf32>
 375 |     %67 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%66, %expanded_30 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 376 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 377 |       %216 = arith.mulf %in, %in_100 : f32
 378 |       linalg.yield %216 : f32
 379 |     } -> tensor<1x128x28x28xf32>
 380 |     %expanded_31 = tensor.expand_shape %arg16 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 381 |     %68 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%67, %expanded_31 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 382 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 383 |       %216 = arith.mulf %in, %in_100 : f32
 384 |       linalg.yield %216 : f32
 385 |     } -> tensor<1x128x28x28xf32>
 386 |     %expanded_32 = tensor.expand_shape %arg17 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 387 |     %69 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%68, %expanded_32 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 388 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 389 |       %216 = arith.addf %in, %in_100 : f32
 390 |       linalg.yield %216 : f32
 391 |     } -> tensor<1x128x28x28xf32>
 392 |     %70 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%69 : tensor<1x128x28x28xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 393 |     ^bb0(%in: f32, %out: f32):
 394 |       %216 = arith.cmpf ugt, %in, %cst : f32
 395 |       %217 = arith.select %216, %in, %cst : f32
 396 |       linalg.yield %217 : f32
 397 |     } -> tensor<1x128x28x28xf32>
 398 |     %padded_33 = tensor.pad %70 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 399 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 400 |       tensor.yield %cst : f32
 401 |     } : tensor<1x128x28x28xf32> to tensor<1x128x30x30xf32>
 402 |     %71 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_33, %arg18 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%60 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
 403 |     %72 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg81 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 404 |     ^bb0(%in: f32, %out: f32):
 405 |       %216 = arith.truncf %cst_2 : f64 to f32
 406 |       %217 = arith.addf %in, %216 : f32
 407 |       linalg.yield %217 : f32
 408 |     } -> tensor<128xf32>
 409 |     %73 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%72 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 410 |     ^bb0(%in: f32, %out: f32):
 411 |       %216 = math.sqrt %in : f32
 412 |       linalg.yield %216 : f32
 413 |     } -> tensor<128xf32>
 414 |     %74 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%73 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 415 |     ^bb0(%in: f32, %out: f32):
 416 |       %216 = arith.cmpf one, %in, %cst : f32
 417 |       cf.assert %216, "unimplemented: tensor with zero element"
 418 |       %217 = arith.divf %cst_0, %in : f32
 419 |       linalg.yield %217 : f32
 420 |     } -> tensor<128xf32>
 421 |     %expanded_34 = tensor.expand_shape %arg80 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 422 |     %expanded_35 = tensor.expand_shape %74 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 423 |     %75 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%71, %expanded_34 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 424 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 425 |       %216 = arith.subf %in, %in_100 : f32
 426 |       linalg.yield %216 : f32
 427 |     } -> tensor<1x128x28x28xf32>
 428 |     %76 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%75, %expanded_35 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 429 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 430 |       %216 = arith.mulf %in, %in_100 : f32
 431 |       linalg.yield %216 : f32
 432 |     } -> tensor<1x128x28x28xf32>
 433 |     %expanded_36 = tensor.expand_shape %arg19 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 434 |     %77 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%76, %expanded_36 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 435 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 436 |       %216 = arith.mulf %in, %in_100 : f32
 437 |       linalg.yield %216 : f32
 438 |     } -> tensor<1x128x28x28xf32>
 439 |     %expanded_37 = tensor.expand_shape %arg20 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 440 |     %78 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%77, %expanded_37 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 441 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 442 |       %216 = arith.addf %in, %in_100 : f32
 443 |       linalg.yield %216 : f32
 444 |     } -> tensor<1x128x28x28xf32>
 445 |     %79 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%58, %arg21 : tensor<1x64x56x56xf32>, tensor<128x64x1x1xf32>) outs(%60 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
 446 |     %80 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg84 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 447 |     ^bb0(%in: f32, %out: f32):
 448 |       %216 = arith.truncf %cst_2 : f64 to f32
 449 |       %217 = arith.addf %in, %216 : f32
 450 |       linalg.yield %217 : f32
 451 |     } -> tensor<128xf32>
 452 |     %81 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%80 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 453 |     ^bb0(%in: f32, %out: f32):
 454 |       %216 = math.sqrt %in : f32
 455 |       linalg.yield %216 : f32
 456 |     } -> tensor<128xf32>
 457 |     %82 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%81 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 458 |     ^bb0(%in: f32, %out: f32):
 459 |       %216 = arith.cmpf one, %in, %cst : f32
 460 |       cf.assert %216, "unimplemented: tensor with zero element"
 461 |       %217 = arith.divf %cst_0, %in : f32
 462 |       linalg.yield %217 : f32
 463 |     } -> tensor<128xf32>
 464 |     %expanded_38 = tensor.expand_shape %arg83 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 465 |     %expanded_39 = tensor.expand_shape %82 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 466 |     %83 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%79, %expanded_38 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 467 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 468 |       %216 = arith.subf %in, %in_100 : f32
 469 |       linalg.yield %216 : f32
 470 |     } -> tensor<1x128x28x28xf32>
 471 |     %84 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%83, %expanded_39 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 472 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 473 |       %216 = arith.mulf %in, %in_100 : f32
 474 |       linalg.yield %216 : f32
 475 |     } -> tensor<1x128x28x28xf32>
 476 |     %expanded_40 = tensor.expand_shape %arg22 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 477 |     %85 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%84, %expanded_40 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 478 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 479 |       %216 = arith.mulf %in, %in_100 : f32
 480 |       linalg.yield %216 : f32
 481 |     } -> tensor<1x128x28x28xf32>
 482 |     %expanded_41 = tensor.expand_shape %arg23 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 483 |     %86 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%85, %expanded_41 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 484 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 485 |       %216 = arith.addf %in, %in_100 : f32
 486 |       linalg.yield %216 : f32
 487 |     } -> tensor<1x128x28x28xf32>
 488 |     %87 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%78, %86 : tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 489 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 490 |       %216 = arith.addf %in, %in_100 : f32
 491 |       linalg.yield %216 : f32
 492 |     } -> tensor<1x128x28x28xf32>
 493 |     %88 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%87 : tensor<1x128x28x28xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 494 |     ^bb0(%in: f32, %out: f32):
 495 |       %216 = arith.cmpf ugt, %in, %cst : f32
 496 |       %217 = arith.select %216, %in, %cst : f32
 497 |       linalg.yield %217 : f32
 498 |     } -> tensor<1x128x28x28xf32>
 499 |     %padded_42 = tensor.pad %88 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 500 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 501 |       tensor.yield %cst : f32
 502 |     } : tensor<1x128x28x28xf32> to tensor<1x128x30x30xf32>
 503 |     %89 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_42, %arg24 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%60 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
 504 |     %90 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg87 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 505 |     ^bb0(%in: f32, %out: f32):
 506 |       %216 = arith.truncf %cst_2 : f64 to f32
 507 |       %217 = arith.addf %in, %216 : f32
 508 |       linalg.yield %217 : f32
 509 |     } -> tensor<128xf32>
 510 |     %91 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%90 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 511 |     ^bb0(%in: f32, %out: f32):
 512 |       %216 = math.sqrt %in : f32
 513 |       linalg.yield %216 : f32
 514 |     } -> tensor<128xf32>
 515 |     %92 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%91 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 516 |     ^bb0(%in: f32, %out: f32):
 517 |       %216 = arith.cmpf one, %in, %cst : f32
 518 |       cf.assert %216, "unimplemented: tensor with zero element"
 519 |       %217 = arith.divf %cst_0, %in : f32
 520 |       linalg.yield %217 : f32
 521 |     } -> tensor<128xf32>
 522 |     %expanded_43 = tensor.expand_shape %arg86 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 523 |     %expanded_44 = tensor.expand_shape %92 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 524 |     %93 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%89, %expanded_43 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 525 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 526 |       %216 = arith.subf %in, %in_100 : f32
 527 |       linalg.yield %216 : f32
 528 |     } -> tensor<1x128x28x28xf32>
 529 |     %94 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%93, %expanded_44 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 530 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 531 |       %216 = arith.mulf %in, %in_100 : f32
 532 |       linalg.yield %216 : f32
 533 |     } -> tensor<1x128x28x28xf32>
 534 |     %expanded_45 = tensor.expand_shape %arg25 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 535 |     %95 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%94, %expanded_45 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 536 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 537 |       %216 = arith.mulf %in, %in_100 : f32
 538 |       linalg.yield %216 : f32
 539 |     } -> tensor<1x128x28x28xf32>
 540 |     %expanded_46 = tensor.expand_shape %arg26 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 541 |     %96 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%95, %expanded_46 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 542 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 543 |       %216 = arith.addf %in, %in_100 : f32
 544 |       linalg.yield %216 : f32
 545 |     } -> tensor<1x128x28x28xf32>
 546 |     %97 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%96 : tensor<1x128x28x28xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 547 |     ^bb0(%in: f32, %out: f32):
 548 |       %216 = arith.cmpf ugt, %in, %cst : f32
 549 |       %217 = arith.select %216, %in, %cst : f32
 550 |       linalg.yield %217 : f32
 551 |     } -> tensor<1x128x28x28xf32>
 552 |     %padded_47 = tensor.pad %97 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 553 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 554 |       tensor.yield %cst : f32
 555 |     } : tensor<1x128x28x28xf32> to tensor<1x128x30x30xf32>
 556 |     %98 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_47, %arg27 : tensor<1x128x30x30xf32>, tensor<128x128x3x3xf32>) outs(%60 : tensor<1x128x28x28xf32>) -> tensor<1x128x28x28xf32>
 557 |     %99 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg90 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 558 |     ^bb0(%in: f32, %out: f32):
 559 |       %216 = arith.truncf %cst_2 : f64 to f32
 560 |       %217 = arith.addf %in, %216 : f32
 561 |       linalg.yield %217 : f32
 562 |     } -> tensor<128xf32>
 563 |     %100 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%99 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 564 |     ^bb0(%in: f32, %out: f32):
 565 |       %216 = math.sqrt %in : f32
 566 |       linalg.yield %216 : f32
 567 |     } -> tensor<128xf32>
 568 |     %101 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%100 : tensor<128xf32>) outs(%62 : tensor<128xf32>) {
 569 |     ^bb0(%in: f32, %out: f32):
 570 |       %216 = arith.cmpf one, %in, %cst : f32
 571 |       cf.assert %216, "unimplemented: tensor with zero element"
 572 |       %217 = arith.divf %cst_0, %in : f32
 573 |       linalg.yield %217 : f32
 574 |     } -> tensor<128xf32>
 575 |     %expanded_48 = tensor.expand_shape %arg89 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 576 |     %expanded_49 = tensor.expand_shape %101 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 577 |     %102 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%98, %expanded_48 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 578 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 579 |       %216 = arith.subf %in, %in_100 : f32
 580 |       linalg.yield %216 : f32
 581 |     } -> tensor<1x128x28x28xf32>
 582 |     %103 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%102, %expanded_49 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 583 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 584 |       %216 = arith.mulf %in, %in_100 : f32
 585 |       linalg.yield %216 : f32
 586 |     } -> tensor<1x128x28x28xf32>
 587 |     %expanded_50 = tensor.expand_shape %arg28 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 588 |     %104 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%103, %expanded_50 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 589 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 590 |       %216 = arith.mulf %in, %in_100 : f32
 591 |       linalg.yield %216 : f32
 592 |     } -> tensor<1x128x28x28xf32>
 593 |     %expanded_51 = tensor.expand_shape %arg29 [[0, 1, 2]] : tensor<128xf32> into tensor<128x1x1xf32>
 594 |     %105 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%104, %expanded_51 : tensor<1x128x28x28xf32>, tensor<128x1x1xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 595 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 596 |       %216 = arith.addf %in, %in_100 : f32
 597 |       linalg.yield %216 : f32
 598 |     } -> tensor<1x128x28x28xf32>
 599 |     %106 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%105, %88 : tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 600 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 601 |       %216 = arith.addf %in, %in_100 : f32
 602 |       linalg.yield %216 : f32
 603 |     } -> tensor<1x128x28x28xf32>
 604 |     %107 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%106 : tensor<1x128x28x28xf32>) outs(%59 : tensor<1x128x28x28xf32>) {
 605 |     ^bb0(%in: f32, %out: f32):
 606 |       %216 = arith.cmpf ugt, %in, %cst : f32
 607 |       %217 = arith.select %216, %in, %cst : f32
 608 |       linalg.yield %217 : f32
 609 |     } -> tensor<1x128x28x28xf32>
 610 |     %padded_52 = tensor.pad %107 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 611 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 612 |       tensor.yield %cst : f32
 613 |     } : tensor<1x128x28x28xf32> to tensor<1x128x30x30xf32>
 614 |     %108 = tensor.empty() : tensor<1x256x14x14xf32>
 615 |     %109 = linalg.fill ins(%cst : f32) outs(%108 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
 616 |     %110 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded_52, %arg30 : tensor<1x128x30x30xf32>, tensor<256x128x3x3xf32>) outs(%109 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
 617 |     %111 = tensor.empty() : tensor<256xf32>
 618 |     %112 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg93 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 619 |     ^bb0(%in: f32, %out: f32):
 620 |       %216 = arith.truncf %cst_2 : f64 to f32
 621 |       %217 = arith.addf %in, %216 : f32
 622 |       linalg.yield %217 : f32
 623 |     } -> tensor<256xf32>
 624 |     %113 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%112 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 625 |     ^bb0(%in: f32, %out: f32):
 626 |       %216 = math.sqrt %in : f32
 627 |       linalg.yield %216 : f32
 628 |     } -> tensor<256xf32>
 629 |     %114 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%113 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 630 |     ^bb0(%in: f32, %out: f32):
 631 |       %216 = arith.cmpf one, %in, %cst : f32
 632 |       cf.assert %216, "unimplemented: tensor with zero element"
 633 |       %217 = arith.divf %cst_0, %in : f32
 634 |       linalg.yield %217 : f32
 635 |     } -> tensor<256xf32>
 636 |     %expanded_53 = tensor.expand_shape %arg92 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 637 |     %expanded_54 = tensor.expand_shape %114 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 638 |     %115 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%110, %expanded_53 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 639 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 640 |       %216 = arith.subf %in, %in_100 : f32
 641 |       linalg.yield %216 : f32
 642 |     } -> tensor<1x256x14x14xf32>
 643 |     %116 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%115, %expanded_54 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 644 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 645 |       %216 = arith.mulf %in, %in_100 : f32
 646 |       linalg.yield %216 : f32
 647 |     } -> tensor<1x256x14x14xf32>
 648 |     %expanded_55 = tensor.expand_shape %arg31 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 649 |     %117 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%116, %expanded_55 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 650 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 651 |       %216 = arith.mulf %in, %in_100 : f32
 652 |       linalg.yield %216 : f32
 653 |     } -> tensor<1x256x14x14xf32>
 654 |     %expanded_56 = tensor.expand_shape %arg32 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 655 |     %118 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%117, %expanded_56 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 656 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 657 |       %216 = arith.addf %in, %in_100 : f32
 658 |       linalg.yield %216 : f32
 659 |     } -> tensor<1x256x14x14xf32>
 660 |     %119 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%118 : tensor<1x256x14x14xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 661 |     ^bb0(%in: f32, %out: f32):
 662 |       %216 = arith.cmpf ugt, %in, %cst : f32
 663 |       %217 = arith.select %216, %in, %cst : f32
 664 |       linalg.yield %217 : f32
 665 |     } -> tensor<1x256x14x14xf32>
 666 |     %padded_57 = tensor.pad %119 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 667 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 668 |       tensor.yield %cst : f32
 669 |     } : tensor<1x256x14x14xf32> to tensor<1x256x16x16xf32>
 670 |     %120 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_57, %arg33 : tensor<1x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%109 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
 671 |     %121 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg96 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 672 |     ^bb0(%in: f32, %out: f32):
 673 |       %216 = arith.truncf %cst_2 : f64 to f32
 674 |       %217 = arith.addf %in, %216 : f32
 675 |       linalg.yield %217 : f32
 676 |     } -> tensor<256xf32>
 677 |     %122 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%121 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 678 |     ^bb0(%in: f32, %out: f32):
 679 |       %216 = math.sqrt %in : f32
 680 |       linalg.yield %216 : f32
 681 |     } -> tensor<256xf32>
 682 |     %123 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%122 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 683 |     ^bb0(%in: f32, %out: f32):
 684 |       %216 = arith.cmpf one, %in, %cst : f32
 685 |       cf.assert %216, "unimplemented: tensor with zero element"
 686 |       %217 = arith.divf %cst_0, %in : f32
 687 |       linalg.yield %217 : f32
 688 |     } -> tensor<256xf32>
 689 |     %expanded_58 = tensor.expand_shape %arg95 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 690 |     %expanded_59 = tensor.expand_shape %123 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 691 |     %124 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%120, %expanded_58 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 692 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 693 |       %216 = arith.subf %in, %in_100 : f32
 694 |       linalg.yield %216 : f32
 695 |     } -> tensor<1x256x14x14xf32>
 696 |     %125 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%124, %expanded_59 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 697 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 698 |       %216 = arith.mulf %in, %in_100 : f32
 699 |       linalg.yield %216 : f32
 700 |     } -> tensor<1x256x14x14xf32>
 701 |     %expanded_60 = tensor.expand_shape %arg34 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 702 |     %126 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%125, %expanded_60 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 703 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 704 |       %216 = arith.mulf %in, %in_100 : f32
 705 |       linalg.yield %216 : f32
 706 |     } -> tensor<1x256x14x14xf32>
 707 |     %expanded_61 = tensor.expand_shape %arg35 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 708 |     %127 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%126, %expanded_61 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 709 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 710 |       %216 = arith.addf %in, %in_100 : f32
 711 |       linalg.yield %216 : f32
 712 |     } -> tensor<1x256x14x14xf32>
 713 |     %128 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%107, %arg36 : tensor<1x128x28x28xf32>, tensor<256x128x1x1xf32>) outs(%109 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
 714 |     %129 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg99 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 715 |     ^bb0(%in: f32, %out: f32):
 716 |       %216 = arith.truncf %cst_2 : f64 to f32
 717 |       %217 = arith.addf %in, %216 : f32
 718 |       linalg.yield %217 : f32
 719 |     } -> tensor<256xf32>
 720 |     %130 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%129 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 721 |     ^bb0(%in: f32, %out: f32):
 722 |       %216 = math.sqrt %in : f32
 723 |       linalg.yield %216 : f32
 724 |     } -> tensor<256xf32>
 725 |     %131 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%130 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 726 |     ^bb0(%in: f32, %out: f32):
 727 |       %216 = arith.cmpf one, %in, %cst : f32
 728 |       cf.assert %216, "unimplemented: tensor with zero element"
 729 |       %217 = arith.divf %cst_0, %in : f32
 730 |       linalg.yield %217 : f32
 731 |     } -> tensor<256xf32>
 732 |     %expanded_62 = tensor.expand_shape %arg98 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 733 |     %expanded_63 = tensor.expand_shape %131 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 734 |     %132 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%128, %expanded_62 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 735 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 736 |       %216 = arith.subf %in, %in_100 : f32
 737 |       linalg.yield %216 : f32
 738 |     } -> tensor<1x256x14x14xf32>
 739 |     %133 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%132, %expanded_63 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 740 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 741 |       %216 = arith.mulf %in, %in_100 : f32
 742 |       linalg.yield %216 : f32
 743 |     } -> tensor<1x256x14x14xf32>
 744 |     %expanded_64 = tensor.expand_shape %arg37 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 745 |     %134 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%133, %expanded_64 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 746 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 747 |       %216 = arith.mulf %in, %in_100 : f32
 748 |       linalg.yield %216 : f32
 749 |     } -> tensor<1x256x14x14xf32>
 750 |     %expanded_65 = tensor.expand_shape %arg38 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 751 |     %135 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%134, %expanded_65 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 752 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 753 |       %216 = arith.addf %in, %in_100 : f32
 754 |       linalg.yield %216 : f32
 755 |     } -> tensor<1x256x14x14xf32>
 756 |     %136 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%127, %135 : tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 757 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 758 |       %216 = arith.addf %in, %in_100 : f32
 759 |       linalg.yield %216 : f32
 760 |     } -> tensor<1x256x14x14xf32>
 761 |     %137 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%136 : tensor<1x256x14x14xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 762 |     ^bb0(%in: f32, %out: f32):
 763 |       %216 = arith.cmpf ugt, %in, %cst : f32
 764 |       %217 = arith.select %216, %in, %cst : f32
 765 |       linalg.yield %217 : f32
 766 |     } -> tensor<1x256x14x14xf32>
 767 |     %padded_66 = tensor.pad %137 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 768 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 769 |       tensor.yield %cst : f32
 770 |     } : tensor<1x256x14x14xf32> to tensor<1x256x16x16xf32>
 771 |     %138 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_66, %arg39 : tensor<1x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%109 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
 772 |     %139 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg102 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 773 |     ^bb0(%in: f32, %out: f32):
 774 |       %216 = arith.truncf %cst_2 : f64 to f32
 775 |       %217 = arith.addf %in, %216 : f32
 776 |       linalg.yield %217 : f32
 777 |     } -> tensor<256xf32>
 778 |     %140 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%139 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 779 |     ^bb0(%in: f32, %out: f32):
 780 |       %216 = math.sqrt %in : f32
 781 |       linalg.yield %216 : f32
 782 |     } -> tensor<256xf32>
 783 |     %141 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%140 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 784 |     ^bb0(%in: f32, %out: f32):
 785 |       %216 = arith.cmpf one, %in, %cst : f32
 786 |       cf.assert %216, "unimplemented: tensor with zero element"
 787 |       %217 = arith.divf %cst_0, %in : f32
 788 |       linalg.yield %217 : f32
 789 |     } -> tensor<256xf32>
 790 |     %expanded_67 = tensor.expand_shape %arg101 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 791 |     %expanded_68 = tensor.expand_shape %141 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 792 |     %142 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%138, %expanded_67 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 793 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 794 |       %216 = arith.subf %in, %in_100 : f32
 795 |       linalg.yield %216 : f32
 796 |     } -> tensor<1x256x14x14xf32>
 797 |     %143 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%142, %expanded_68 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 798 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 799 |       %216 = arith.mulf %in, %in_100 : f32
 800 |       linalg.yield %216 : f32
 801 |     } -> tensor<1x256x14x14xf32>
 802 |     %expanded_69 = tensor.expand_shape %arg40 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 803 |     %144 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%143, %expanded_69 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 804 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 805 |       %216 = arith.mulf %in, %in_100 : f32
 806 |       linalg.yield %216 : f32
 807 |     } -> tensor<1x256x14x14xf32>
 808 |     %expanded_70 = tensor.expand_shape %arg41 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 809 |     %145 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%144, %expanded_70 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 810 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 811 |       %216 = arith.addf %in, %in_100 : f32
 812 |       linalg.yield %216 : f32
 813 |     } -> tensor<1x256x14x14xf32>
 814 |     %146 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%145 : tensor<1x256x14x14xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 815 |     ^bb0(%in: f32, %out: f32):
 816 |       %216 = arith.cmpf ugt, %in, %cst : f32
 817 |       %217 = arith.select %216, %in, %cst : f32
 818 |       linalg.yield %217 : f32
 819 |     } -> tensor<1x256x14x14xf32>
 820 |     %padded_71 = tensor.pad %146 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 821 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 822 |       tensor.yield %cst : f32
 823 |     } : tensor<1x256x14x14xf32> to tensor<1x256x16x16xf32>
 824 |     %147 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_71, %arg42 : tensor<1x256x16x16xf32>, tensor<256x256x3x3xf32>) outs(%109 : tensor<1x256x14x14xf32>) -> tensor<1x256x14x14xf32>
 825 |     %148 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg105 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 826 |     ^bb0(%in: f32, %out: f32):
 827 |       %216 = arith.truncf %cst_2 : f64 to f32
 828 |       %217 = arith.addf %in, %216 : f32
 829 |       linalg.yield %217 : f32
 830 |     } -> tensor<256xf32>
 831 |     %149 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%148 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 832 |     ^bb0(%in: f32, %out: f32):
 833 |       %216 = math.sqrt %in : f32
 834 |       linalg.yield %216 : f32
 835 |     } -> tensor<256xf32>
 836 |     %150 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%149 : tensor<256xf32>) outs(%111 : tensor<256xf32>) {
 837 |     ^bb0(%in: f32, %out: f32):
 838 |       %216 = arith.cmpf one, %in, %cst : f32
 839 |       cf.assert %216, "unimplemented: tensor with zero element"
 840 |       %217 = arith.divf %cst_0, %in : f32
 841 |       linalg.yield %217 : f32
 842 |     } -> tensor<256xf32>
 843 |     %expanded_72 = tensor.expand_shape %arg104 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 844 |     %expanded_73 = tensor.expand_shape %150 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 845 |     %151 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%147, %expanded_72 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 846 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 847 |       %216 = arith.subf %in, %in_100 : f32
 848 |       linalg.yield %216 : f32
 849 |     } -> tensor<1x256x14x14xf32>
 850 |     %152 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%151, %expanded_73 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 851 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 852 |       %216 = arith.mulf %in, %in_100 : f32
 853 |       linalg.yield %216 : f32
 854 |     } -> tensor<1x256x14x14xf32>
 855 |     %expanded_74 = tensor.expand_shape %arg43 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 856 |     %153 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%152, %expanded_74 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 857 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 858 |       %216 = arith.mulf %in, %in_100 : f32
 859 |       linalg.yield %216 : f32
 860 |     } -> tensor<1x256x14x14xf32>
 861 |     %expanded_75 = tensor.expand_shape %arg44 [[0, 1, 2]] : tensor<256xf32> into tensor<256x1x1xf32>
 862 |     %154 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%153, %expanded_75 : tensor<1x256x14x14xf32>, tensor<256x1x1xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 863 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 864 |       %216 = arith.addf %in, %in_100 : f32
 865 |       linalg.yield %216 : f32
 866 |     } -> tensor<1x256x14x14xf32>
 867 |     %155 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%154, %137 : tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 868 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 869 |       %216 = arith.addf %in, %in_100 : f32
 870 |       linalg.yield %216 : f32
 871 |     } -> tensor<1x256x14x14xf32>
 872 |     %156 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%155 : tensor<1x256x14x14xf32>) outs(%108 : tensor<1x256x14x14xf32>) {
 873 |     ^bb0(%in: f32, %out: f32):
 874 |       %216 = arith.cmpf ugt, %in, %cst : f32
 875 |       %217 = arith.select %216, %in, %cst : f32
 876 |       linalg.yield %217 : f32
 877 |     } -> tensor<1x256x14x14xf32>
 878 |     %padded_76 = tensor.pad %156 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 879 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 880 |       tensor.yield %cst : f32
 881 |     } : tensor<1x256x14x14xf32> to tensor<1x256x16x16xf32>
 882 |     %157 = tensor.empty() : tensor<1x512x7x7xf32>
 883 |     %158 = linalg.fill ins(%cst : f32) outs(%157 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
 884 |     %159 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%padded_76, %arg45 : tensor<1x256x16x16xf32>, tensor<512x256x3x3xf32>) outs(%158 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
 885 |     %160 = tensor.empty() : tensor<512xf32>
 886 |     %161 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg108 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 887 |     ^bb0(%in: f32, %out: f32):
 888 |       %216 = arith.truncf %cst_2 : f64 to f32
 889 |       %217 = arith.addf %in, %216 : f32
 890 |       linalg.yield %217 : f32
 891 |     } -> tensor<512xf32>
 892 |     %162 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%161 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 893 |     ^bb0(%in: f32, %out: f32):
 894 |       %216 = math.sqrt %in : f32
 895 |       linalg.yield %216 : f32
 896 |     } -> tensor<512xf32>
 897 |     %163 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%162 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 898 |     ^bb0(%in: f32, %out: f32):
 899 |       %216 = arith.cmpf one, %in, %cst : f32
 900 |       cf.assert %216, "unimplemented: tensor with zero element"
 901 |       %217 = arith.divf %cst_0, %in : f32
 902 |       linalg.yield %217 : f32
 903 |     } -> tensor<512xf32>
 904 |     %expanded_77 = tensor.expand_shape %arg107 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 905 |     %expanded_78 = tensor.expand_shape %163 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 906 |     %164 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%159, %expanded_77 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 907 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 908 |       %216 = arith.subf %in, %in_100 : f32
 909 |       linalg.yield %216 : f32
 910 |     } -> tensor<1x512x7x7xf32>
 911 |     %165 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%164, %expanded_78 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 912 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 913 |       %216 = arith.mulf %in, %in_100 : f32
 914 |       linalg.yield %216 : f32
 915 |     } -> tensor<1x512x7x7xf32>
 916 |     %expanded_79 = tensor.expand_shape %arg46 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 917 |     %166 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%165, %expanded_79 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 918 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 919 |       %216 = arith.mulf %in, %in_100 : f32
 920 |       linalg.yield %216 : f32
 921 |     } -> tensor<1x512x7x7xf32>
 922 |     %expanded_80 = tensor.expand_shape %arg47 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 923 |     %167 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%166, %expanded_80 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 924 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 925 |       %216 = arith.addf %in, %in_100 : f32
 926 |       linalg.yield %216 : f32
 927 |     } -> tensor<1x512x7x7xf32>
 928 |     %168 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%167 : tensor<1x512x7x7xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 929 |     ^bb0(%in: f32, %out: f32):
 930 |       %216 = arith.cmpf ugt, %in, %cst : f32
 931 |       %217 = arith.select %216, %in, %cst : f32
 932 |       linalg.yield %217 : f32
 933 |     } -> tensor<1x512x7x7xf32>
 934 |     %padded_81 = tensor.pad %168 low[0, 0, 1, 1] high[0, 0, 1, 1] {
 935 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
 936 |       tensor.yield %cst : f32
 937 |     } : tensor<1x512x7x7xf32> to tensor<1x512x9x9xf32>
 938 |     %169 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_81, %arg48 : tensor<1x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%158 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
 939 |     %170 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg111 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 940 |     ^bb0(%in: f32, %out: f32):
 941 |       %216 = arith.truncf %cst_2 : f64 to f32
 942 |       %217 = arith.addf %in, %216 : f32
 943 |       linalg.yield %217 : f32
 944 |     } -> tensor<512xf32>
 945 |     %171 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%170 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 946 |     ^bb0(%in: f32, %out: f32):
 947 |       %216 = math.sqrt %in : f32
 948 |       linalg.yield %216 : f32
 949 |     } -> tensor<512xf32>
 950 |     %172 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%171 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 951 |     ^bb0(%in: f32, %out: f32):
 952 |       %216 = arith.cmpf one, %in, %cst : f32
 953 |       cf.assert %216, "unimplemented: tensor with zero element"
 954 |       %217 = arith.divf %cst_0, %in : f32
 955 |       linalg.yield %217 : f32
 956 |     } -> tensor<512xf32>
 957 |     %expanded_82 = tensor.expand_shape %arg110 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 958 |     %expanded_83 = tensor.expand_shape %172 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 959 |     %173 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%169, %expanded_82 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 960 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 961 |       %216 = arith.subf %in, %in_100 : f32
 962 |       linalg.yield %216 : f32
 963 |     } -> tensor<1x512x7x7xf32>
 964 |     %174 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%173, %expanded_83 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 965 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 966 |       %216 = arith.mulf %in, %in_100 : f32
 967 |       linalg.yield %216 : f32
 968 |     } -> tensor<1x512x7x7xf32>
 969 |     %expanded_84 = tensor.expand_shape %arg49 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 970 |     %175 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%174, %expanded_84 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 971 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 972 |       %216 = arith.mulf %in, %in_100 : f32
 973 |       linalg.yield %216 : f32
 974 |     } -> tensor<1x512x7x7xf32>
 975 |     %expanded_85 = tensor.expand_shape %arg50 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
 976 |     %176 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%175, %expanded_85 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
 977 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
 978 |       %216 = arith.addf %in, %in_100 : f32
 979 |       linalg.yield %216 : f32
 980 |     } -> tensor<1x512x7x7xf32>
 981 |     %177 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} ins(%156, %arg51 : tensor<1x256x14x14xf32>, tensor<512x256x1x1xf32>) outs(%158 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
 982 |     %178 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg114 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 983 |     ^bb0(%in: f32, %out: f32):
 984 |       %216 = arith.truncf %cst_2 : f64 to f32
 985 |       %217 = arith.addf %in, %216 : f32
 986 |       linalg.yield %217 : f32
 987 |     } -> tensor<512xf32>
 988 |     %179 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%178 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 989 |     ^bb0(%in: f32, %out: f32):
 990 |       %216 = math.sqrt %in : f32
 991 |       linalg.yield %216 : f32
 992 |     } -> tensor<512xf32>
 993 |     %180 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%179 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
 994 |     ^bb0(%in: f32, %out: f32):
 995 |       %216 = arith.cmpf one, %in, %cst : f32
 996 |       cf.assert %216, "unimplemented: tensor with zero element"
 997 |       %217 = arith.divf %cst_0, %in : f32
 998 |       linalg.yield %217 : f32
 999 |     } -> tensor<512xf32>
1000 |     %expanded_86 = tensor.expand_shape %arg113 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1001 |     %expanded_87 = tensor.expand_shape %180 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1002 |     %181 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%177, %expanded_86 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1003 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1004 |       %216 = arith.subf %in, %in_100 : f32
1005 |       linalg.yield %216 : f32
1006 |     } -> tensor<1x512x7x7xf32>
1007 |     %182 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%181, %expanded_87 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1008 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1009 |       %216 = arith.mulf %in, %in_100 : f32
1010 |       linalg.yield %216 : f32
1011 |     } -> tensor<1x512x7x7xf32>
1012 |     %expanded_88 = tensor.expand_shape %arg52 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1013 |     %183 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%182, %expanded_88 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1014 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1015 |       %216 = arith.mulf %in, %in_100 : f32
1016 |       linalg.yield %216 : f32
1017 |     } -> tensor<1x512x7x7xf32>
1018 |     %expanded_89 = tensor.expand_shape %arg53 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1019 |     %184 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%183, %expanded_89 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1020 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1021 |       %216 = arith.addf %in, %in_100 : f32
1022 |       linalg.yield %216 : f32
1023 |     } -> tensor<1x512x7x7xf32>
1024 |     %185 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%176, %184 : tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1025 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1026 |       %216 = arith.addf %in, %in_100 : f32
1027 |       linalg.yield %216 : f32
1028 |     } -> tensor<1x512x7x7xf32>
1029 |     %186 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%185 : tensor<1x512x7x7xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1030 |     ^bb0(%in: f32, %out: f32):
1031 |       %216 = arith.cmpf ugt, %in, %cst : f32
1032 |       %217 = arith.select %216, %in, %cst : f32
1033 |       linalg.yield %217 : f32
1034 |     } -> tensor<1x512x7x7xf32>
1035 |     %padded_90 = tensor.pad %186 low[0, 0, 1, 1] high[0, 0, 1, 1] {
1036 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
1037 |       tensor.yield %cst : f32
1038 |     } : tensor<1x512x7x7xf32> to tensor<1x512x9x9xf32>
1039 |     %187 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_90, %arg54 : tensor<1x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%158 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
1040 |     %188 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg117 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
1041 |     ^bb0(%in: f32, %out: f32):
1042 |       %216 = arith.truncf %cst_2 : f64 to f32
1043 |       %217 = arith.addf %in, %216 : f32
1044 |       linalg.yield %217 : f32
1045 |     } -> tensor<512xf32>
1046 |     %189 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%188 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
1047 |     ^bb0(%in: f32, %out: f32):
1048 |       %216 = math.sqrt %in : f32
1049 |       linalg.yield %216 : f32
1050 |     } -> tensor<512xf32>
1051 |     %190 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%189 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
1052 |     ^bb0(%in: f32, %out: f32):
1053 |       %216 = arith.cmpf one, %in, %cst : f32
1054 |       cf.assert %216, "unimplemented: tensor with zero element"
1055 |       %217 = arith.divf %cst_0, %in : f32
1056 |       linalg.yield %217 : f32
1057 |     } -> tensor<512xf32>
1058 |     %expanded_91 = tensor.expand_shape %arg116 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1059 |     %expanded_92 = tensor.expand_shape %190 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1060 |     %191 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%187, %expanded_91 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1061 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1062 |       %216 = arith.subf %in, %in_100 : f32
1063 |       linalg.yield %216 : f32
1064 |     } -> tensor<1x512x7x7xf32>
1065 |     %192 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%191, %expanded_92 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1066 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1067 |       %216 = arith.mulf %in, %in_100 : f32
1068 |       linalg.yield %216 : f32
1069 |     } -> tensor<1x512x7x7xf32>
1070 |     %expanded_93 = tensor.expand_shape %arg55 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1071 |     %193 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%192, %expanded_93 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1072 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1073 |       %216 = arith.mulf %in, %in_100 : f32
1074 |       linalg.yield %216 : f32
1075 |     } -> tensor<1x512x7x7xf32>
1076 |     %expanded_94 = tensor.expand_shape %arg56 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1077 |     %194 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%193, %expanded_94 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1078 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1079 |       %216 = arith.addf %in, %in_100 : f32
1080 |       linalg.yield %216 : f32
1081 |     } -> tensor<1x512x7x7xf32>
1082 |     %195 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%194 : tensor<1x512x7x7xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1083 |     ^bb0(%in: f32, %out: f32):
1084 |       %216 = arith.cmpf ugt, %in, %cst : f32
1085 |       %217 = arith.select %216, %in, %cst : f32
1086 |       linalg.yield %217 : f32
1087 |     } -> tensor<1x512x7x7xf32>
1088 |     %padded_95 = tensor.pad %195 low[0, 0, 1, 1] high[0, 0, 1, 1] {
1089 |     ^bb0(%arg123: index, %arg124: index, %arg125: index, %arg126: index):
1090 |       tensor.yield %cst : f32
1091 |     } : tensor<1x512x7x7xf32> to tensor<1x512x9x9xf32>
1092 |     %196 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<1> : vector<2xi64>} ins(%padded_95, %arg57 : tensor<1x512x9x9xf32>, tensor<512x512x3x3xf32>) outs(%158 : tensor<1x512x7x7xf32>) -> tensor<1x512x7x7xf32>
1093 |     %197 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%arg120 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
1094 |     ^bb0(%in: f32, %out: f32):
1095 |       %216 = arith.truncf %cst_2 : f64 to f32
1096 |       %217 = arith.addf %in, %216 : f32
1097 |       linalg.yield %217 : f32
1098 |     } -> tensor<512xf32>
1099 |     %198 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%197 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
1100 |     ^bb0(%in: f32, %out: f32):
1101 |       %216 = math.sqrt %in : f32
1102 |       linalg.yield %216 : f32
1103 |     } -> tensor<512xf32>
1104 |     %199 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins(%198 : tensor<512xf32>) outs(%160 : tensor<512xf32>) {
1105 |     ^bb0(%in: f32, %out: f32):
1106 |       %216 = arith.cmpf one, %in, %cst : f32
1107 |       cf.assert %216, "unimplemented: tensor with zero element"
1108 |       %217 = arith.divf %cst_0, %in : f32
1109 |       linalg.yield %217 : f32
1110 |     } -> tensor<512xf32>
1111 |     %expanded_96 = tensor.expand_shape %arg119 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1112 |     %expanded_97 = tensor.expand_shape %199 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1113 |     %200 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%196, %expanded_96 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1114 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1115 |       %216 = arith.subf %in, %in_100 : f32
1116 |       linalg.yield %216 : f32
1117 |     } -> tensor<1x512x7x7xf32>
1118 |     %201 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%200, %expanded_97 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1119 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1120 |       %216 = arith.mulf %in, %in_100 : f32
1121 |       linalg.yield %216 : f32
1122 |     } -> tensor<1x512x7x7xf32>
1123 |     %expanded_98 = tensor.expand_shape %arg58 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1124 |     %202 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%201, %expanded_98 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1125 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1126 |       %216 = arith.mulf %in, %in_100 : f32
1127 |       linalg.yield %216 : f32
1128 |     } -> tensor<1x512x7x7xf32>
1129 |     %expanded_99 = tensor.expand_shape %arg59 [[0, 1, 2]] : tensor<512xf32> into tensor<512x1x1xf32>
1130 |     %203 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%202, %expanded_99 : tensor<1x512x7x7xf32>, tensor<512x1x1xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1131 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1132 |       %216 = arith.addf %in, %in_100 : f32
1133 |       linalg.yield %216 : f32
1134 |     } -> tensor<1x512x7x7xf32>
1135 |     %204 = linalg.generic {indexing_maps = [#map1, #map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%203, %186 : tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1136 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1137 |       %216 = arith.addf %in, %in_100 : f32
1138 |       linalg.yield %216 : f32
1139 |     } -> tensor<1x512x7x7xf32>
1140 |     %205 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%204 : tensor<1x512x7x7xf32>) outs(%157 : tensor<1x512x7x7xf32>) {
1141 |     ^bb0(%in: f32, %out: f32):
1142 |       %216 = arith.cmpf ugt, %in, %cst : f32
1143 |       %217 = arith.select %216, %in, %cst : f32
1144 |       linalg.yield %217 : f32
1145 |     } -> tensor<1x512x7x7xf32>
1146 |     %206 = tensor.empty() : tensor<1x512x1x1xf32>
1147 |     %207 = linalg.fill ins(%cst : f32) outs(%206 : tensor<1x512x1x1xf32>) -> tensor<1x512x1x1xf32>
1148 |     %208 = linalg.generic {indexing_maps = [#map3, #map6], iterator_types = ["parallel", "parallel", "reduction", "reduction"]} ins(%205 : tensor<1x512x7x7xf32>) outs(%207 : tensor<1x512x1x1xf32>) {
1149 |     ^bb0(%in: f32, %out: f32):
1150 |       %216 = arith.addf %in, %out : f32
1151 |       linalg.yield %216 : f32
1152 |     } -> tensor<1x512x1x1xf32>
1153 |     %209 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%208 : tensor<1x512x1x1xf32>) outs(%206 : tensor<1x512x1x1xf32>) {
1154 |     ^bb0(%in: f32, %out: f32):
1155 |       %216 = arith.divf %in, %cst_3 : f32
1156 |       linalg.yield %216 : f32
1157 |     } -> tensor<1x512x1x1xf32>
1158 |     %collapsed = tensor.collapse_shape %209 [[0], [1, 2, 3]] : tensor<1x512x1x1xf32> into tensor<1x512xf32>
1159 |     %210 = tensor.empty() : tensor<512x1000xf32>
1160 |     %211 = linalg.generic {indexing_maps = [#map8, #map9], iterator_types = ["parallel", "parallel"]} ins(%arg60 : tensor<1000x512xf32>) outs(%210 : tensor<512x1000xf32>) {
1161 |     ^bb0(%in: f32, %out: f32):
1162 |       linalg.yield %in : f32
1163 |     } -> tensor<512x1000xf32>
1164 |     %212 = tensor.empty() : tensor<1x1000xf32>
1165 |     %213 = linalg.fill ins(%cst : f32) outs(%212 : tensor<1x1000xf32>) -> tensor<1x1000xf32>
1166 |     %214 = linalg.matmul ins(%collapsed, %211 : tensor<1x512xf32>, tensor<512x1000xf32>) outs(%213 : tensor<1x1000xf32>) -> tensor<1x1000xf32>
1167 |     %215 = linalg.generic {indexing_maps = [#map10, #map11, #map8], iterator_types = ["parallel", "parallel"]} ins(%arg61, %214 : tensor<1000xf32>, tensor<1x1000xf32>) outs(%212 : tensor<1x1000xf32>) {
1168 |     ^bb0(%in: f32, %in_100: f32, %out: f32):
1169 |       %216 = arith.addf %in, %in_100 : f32
1170 |       linalg.yield %216 : f32
1171 |     } -> tensor<1x1000xf32>
1172 |     return %215, %arg0, %arg1, %arg3, %arg4, %arg6, %arg7, %arg9, %arg10, %arg12, %arg13, %arg15, %arg16, %arg18, %arg19, %arg21, %arg22, %arg24, %arg25, %arg27, %arg28, %arg30, %arg31, %arg33, %arg34, %arg36, %arg37, %arg39, %arg40, %arg42, %arg43, %arg45, %arg46, %arg48, %arg49, %arg51, %arg52, %arg54, %arg55, %arg57, %arg58, %arg62, %arg63, %arg65, %arg66, %arg68, %arg69, %arg71, %arg72, %arg74, %arg75, %arg77, %arg78, %arg80, %arg81, %arg83, %arg84, %arg86, %arg87, %arg89, %arg90, %arg92, %arg93, %arg95, %arg96, %arg98, %arg99, %arg101, %arg102, %arg104, %arg105, %arg107, %arg108, %arg110, %arg111, %arg113, %arg114, %arg116, %arg117, %arg119, %arg120, %arg122, %2, %11, %15, %19, %21, %29, %30, %39, %40, %48, %49, %58, %61, %70, %71, %79, %88, %89, %97, %98, %107, %110, %119, %120, %128, %137, %138, %146, %147, %156, %159, %168, %169, %177, %186, %187, %195, %196, %205, %collapsed, %211 : tensor<1x1000xf32>, tensor<64x3x7x7xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<64x64x3x3xf32>, tensor<64xf32>, tensor<128x64x3x3xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128x64x1x1xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<128x128x3x3xf32>, tensor<128xf32>, tensor<256x128x3x3xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256x128x1x1xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<256x256x3x3xf32>, tensor<256xf32>, tensor<512x256x3x3xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512x256x1x1xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<512x512x3x3xf32>, tensor<512xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<64xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, 
tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<128xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<256xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<512xf32>, tensor<1x3x224x224xf32>, tensor<1x64x112x112xf32>, tensor<1x64x112x112xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xi64>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x64x56x56xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x128x28x28xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x256x14x14xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512x7x7xf32>, tensor<1x512xf32>, tensor<512x1000xf32>
1173 |   }
1174 | }
1175 | 
1176 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/models/bert.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | import torch._dynamo as dynamo
 7 | 
 8 | from transformers import AutoTokenizer, BertModel
 9 | 
10 | import sys
11 | sys.path.append('../../lib')
12 | from torch_mlir_compile import refbackend_torchdynamo_backend
13 | 
14 | 
15 | def main():
16 |     device = torch.device("cpu")
17 |     # The bare Bert Model transformer outputting raw hidden-states
18 |     # without any specific head on top.
19 |     # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
20 |     bert_model_name = "bert-base-uncased"
21 |     model = BertModel.from_pretrained(bert_model_name).to(device)
22 |     dynamo_callable = dynamo.optimize(refbackend_torchdynamo_backend)(model)
23 | 
24 |     tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
25 |     inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
26 |     dynamo_callable(**inputs)
27 | 
28 | if __name__ == '__main__':
29 |     with torch.no_grad():
30 |         main()
31 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/models/conv.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | from typing import List
 4 | 
 5 | import torch
 6 | import torch.nn as nn
 7 | import torch.nn.functional as F
 8 | import torch._dynamo as dynamo
 9 | 
10 | import sys
11 | sys.path.append('../../lib')
12 | from torch_mlir_compile import refbackend_torchdynamo_backend
13 | 
14 | class Net(nn.Module):
15 |     def __init__(self):
16 |         super(Net, self).__init__()
17 |         self.conv1 = nn.Conv2d(3, 32, 3, 3)
18 |         self.conv2 = nn.Conv2d(32, 2, 3, 3)
19 | 
20 |     def forward(self, x):
21 |             x = self.conv1(x)
22 |             x = F.relu(x)
23 |             x = self.conv2(x)
24 |             x = F.relu(x)
25 |             output = F.log_softmax(x, dim=1)
26 |             return output
27 | 
28 | 
29 | def main():
30 |     device = torch.device("cpu")
31 |     simple = Net().to(device)
32 |     dynamo_callable = dynamo.optimize(refbackend_torchdynamo_backend)(simple)
33 |     dynamo_callable(torch.ones(1, 3, 28, 28))
34 | 
35 | if __name__ == '__main__':
36 |     with torch.no_grad():
37 |         main()
38 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/models/linear.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | import torch._dynamo as dynamo
 7 | 
 8 | import sys
 9 | sys.path.append('../../lib')
10 | from torch_mlir_compile import refbackend_torchdynamo_backend
11 | 
12 | 
13 | class Net(nn.Module):
14 |     def __init__(self):
15 |         super(Net, self).__init__()
16 |         self.fc1 = nn.Linear(128, 256)
17 |         self.fc2 = nn.Linear(256, 10)
18 | 
19 |     def forward(self, x):
20 |             x = self.fc1(x)
21 |             x = F.relu(x)
22 |             x = self.fc2(x)
23 |             x = F.relu(x)
24 |             output = F.log_softmax(x, dim=1)
25 |             return output
26 | 
27 | 
28 | def main():
29 |     device = torch.device("cpu")
30 |     simple = Net().to(device)
31 |     dynamo_callable = dynamo.optimize(refbackend_torchdynamo_backend)(simple)
32 |     dynamo_callable(torch.ones(2, 128))
33 | 
34 | if __name__ == '__main__':
35 |     with torch.no_grad():
36 |         main()
37 | 


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/models/mnist.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | import torch._dynamo as dynamo
 7 | 
 8 | import sys
 9 | sys.path.append('../../lib')
10 | from torch_mlir_compile import refbackend_torchdynamo_backend
11 | 
12 | 
13 | # Model taken from PyTorch examples - https://github.com/pytorch/examples/blob/main/mnist/main.py
14 | class MNIST(nn.Module):
15 |     def __init__(self):
16 |         super(MNIST, self).__init__()
17 |         self.conv1 = nn.Conv2d(1, 32, 3, 1)
18 |         self.conv2 = nn.Conv2d(32, 64, 3, 1)
19 |         self.dropout1 = nn.Dropout(0.25)
20 |         self.dropout2 = nn.Dropout(0.5)
21 |         self.fc1 = nn.Linear(9216, 128)
22 |         self.fc2 = nn.Linear(128, 10)
23 | 
24 |     def forward(self, x):
25 |         x = self.conv1(x)
26 |         x = F.relu(x)
27 |         x = self.conv2(x)
28 |         x = F.relu(x)
29 |         x = F.max_pool2d(x, 2)
30 |         x = self.dropout1(x)
31 |         x = torch.flatten(x, 1)
32 |         x = self.fc1(x)
33 |         x = F.relu(x)
34 |         x = self.dropout2(x)
35 |         x = self.fc2(x)
36 |         output = F.log_softmax(x, dim=1)
37 |         return output
38 | 
39 | def main():
40 |     device = torch.device("cpu")
41 |     mnist = MNIST().to(device)
42 |     dynamo_callable = dynamo.optimize(refbackend_torchdynamo_backend)(mnist)
43 |     dynamo_callable(torch.ones(1, 1, 28, 28))
44 | 
45 | if __name__ == '__main__':
46 |     with torch.no_grad():
47 |         main()


--------------------------------------------------------------------------------
/pytorch/torch-dynamo/models/resnet18.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torchvision
 3 | import torch._dynamo as dynamo
 4 | 
 5 | import sys
 6 | sys.path.append('../../lib')
 7 | from torch_mlir_compile import refbackend_torchdynamo_backend
 8 | 
 9 | resnet18 = torchvision.models.resnet18(weights=torchvision.models.resnet.ResNet18_Weights.IMAGENET1K_V1)
10 | resnet18.eval()
11 | 
12 | dynamo_callable = dynamo.optimize(refbackend_torchdynamo_backend)(resnet18)
13 | dynamo_callable(torch.ones(1, 3, 224, 224))
14 | 


--------------------------------------------------------------------------------
/pytorch/torch-script/mlir/conv.mlir:
--------------------------------------------------------------------------------
  1 | torch 
  2 |  module attributes {torch.debug_module_name = "Net"} {
  3 |   func.func @forward(%arg0: !torch.vtensor<[1,3,28,28],f32>) -> !torch.vtensor<[1,2,3,3],f32> {  // torch-dialect forward of module "Net": [1,3,28,28] f32 in, [1,2,3,3] f32 out
  4 |     %true = torch.constant.bool true
  5 |     %float1.000000e00 = torch.constant.float 1.000000e+00
  6 |     %none = torch.constant.none
  7 |     %false = torch.constant.bool false
  8 |     %0 = torch.vtensor.literal(dense<[-0.0228016917, 0.0496884249]> : tensor<2xf32>) : !torch.vtensor<[2],f32>
  9 |     %1 = torch.vtensor.literal(dense_resource<__elided__> : tensor<2x32x3x3xf32>) : !torch.vtensor<[2,32,3,3],f32>
 10 |     %2 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32xf32>) : !torch.vtensor<[32],f32>
 11 |     %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<32x3x3x3xf32>) : !torch.vtensor<[32,3,3,3],f32>
 12 |     %int0 = torch.constant.int 0
 13 |     %int1 = torch.constant.int 1
 14 |     %int3 = torch.constant.int 3
 15 |     %4 = torch.prim.ListConstruct %int3, %int3 : (!torch.int, !torch.int) -> !torch.list<int>  // [3, 3], shared by both convolutions
 16 |     %5 = torch.prim.ListConstruct %int0, %int0 : (!torch.int, !torch.int) -> !torch.list<int>  // [0, 0], shared by both convolutions
 17 |     %6 = torch.prim.ListConstruct %int1, %int1 : (!torch.int, !torch.int) -> !torch.list<int>  // [1, 1], shared by both convolutions
 18 |     %7 = torch.prim.ListConstruct  : () -> !torch.list<int>
 19 |     %8 = torch.aten.convolution %arg0, %3, %2, %4, %5, %6, %false, %7, %int1 : !torch.vtensor<[1,3,28,28],f32>, !torch.vtensor<[32,3,3,3],f32>, !torch.vtensor<[32],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,32,9,9],f32>  // first convolution: %arg0 with the [32,3,3,3] and [32] literals
 20 |     %9 = torch.aten.relu %8 : !torch.vtensor<[1,32,9,9],f32> -> !torch.vtensor<[1,32,9,9],f32>
 21 |     %10 = torch.prim.ListConstruct  : () -> !torch.list<int>
 22 |     %11 = torch.aten.convolution %9, %1, %0, %4, %5, %6, %false, %10, %int1 : !torch.vtensor<[1,32,9,9],f32>, !torch.vtensor<[2,32,3,3],f32>, !torch.vtensor<[2],f32>, !torch.list<int>, !torch.list<int>, !torch.list<int>, !torch.bool, !torch.list<int>, !torch.int -> !torch.vtensor<[1,2,3,3],f32>  // second convolution: %9 with the [2,32,3,3] and [2] literals
 23 |     %12 = torch.aten.relu %11 : !torch.vtensor<[1,2,3,3],f32> -> !torch.vtensor<[1,2,3,3],f32>
 24 |     %values, %indices = torch.aten.max.dim %12, %int1, %true : !torch.vtensor<[1,2,3,3],f32>, !torch.int, !torch.bool -> !torch.vtensor<[1,1,3,3],f32>, !torch.vtensor<[1,1,3,3],si64>  // max along dim 1; only %values is used below
 25 |     %13 = torch.aten.sub.Tensor %12, %values, %float1.000000e00 : !torch.vtensor<[1,2,3,3],f32>, !torch.vtensor<[1,1,3,3],f32>, !torch.float -> !torch.vtensor<[1,2,3,3],f32>  // shift by the per-position max
 26 |     %14 = torch.aten.exp %13 : !torch.vtensor<[1,2,3,3],f32> -> !torch.vtensor<[1,2,3,3],f32>
 27 |     %15 = torch.prim.ListConstruct %int1 : (!torch.int) -> !torch.list<int>
 28 |     %16 = torch.aten.sum.dim_IntList %14, %15, %true, %none : !torch.vtensor<[1,2,3,3],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[1,1,3,3],f32>  // sum of the exponentials along dim 1
 29 |     %17 = torch.aten.log %16 : !torch.vtensor<[1,1,3,3],f32> -> !torch.vtensor<[1,1,3,3],f32>
 30 |     %18 = torch.aten.sub.Tensor %13, %17, %float1.000000e00 : !torch.vtensor<[1,2,3,3],f32>, !torch.vtensor<[1,1,3,3],f32>, !torch.float -> !torch.vtensor<[1,2,3,3],f32>  // %13 - log(sum(exp(%13))): log-softmax along dim 1
 31 |     return %18 : !torch.vtensor<[1,2,3,3],f32>
 32 |   }
 33 | }
 34 | 
 35 | linalg-on-tensors 
 36 |  #map = affine_map<(d0, d1, d2, d3) -> (d1)>
 37 | #map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
 38 | #map2 = affine_map<(d0, d1, d2, d3) -> (0, d1, d2, d3)>
 39 | #map3 = affine_map<(d0, d1, d2, d3) -> (d0, 0, d2, d3)>
 40 | #map4 = affine_map<(d0, d1, d2, d3) -> (0, 0, d2, d3)>
 41 | module attributes {torch.debug_module_name = "Net"} {
 42 |   ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
 43 |   func.func @forward(%arg0: tensor<1x3x28x28xf32>) -> tensor<1x2x3x3xf32> {  // linalg-on-tensors forward of module "Net": 1x3x28x28 f32 in, 1x2x3x3 f32 out
 44 |     %cst = arith.constant dense<[-0.0228016917, 0.0496884249]> : tensor<2xf32>
 45 |     %cst_0 = arith.constant dense_resource<__elided__> : tensor<2x32x3x3xf32>
 46 |     %cst_1 = arith.constant dense_resource<__elided__> : tensor<32xf32>
 47 |     %cst_2 = arith.constant dense_resource<__elided__> : tensor<32x3x3x3xf32>
 48 |     %cst_3 = arith.constant 0.000000e+00 : f32
 49 |     %cst_4 = arith.constant 0xFF800000 : f32  // f32 bit pattern of -inf; seeds the max reduction below
 50 |     %c0_i64 = arith.constant 0 : i64
 51 |     %0 = tensor.empty() : tensor<1x32x9x9xf32>
 52 |     %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst_1 : tensor<32xf32>) outs(%0 : tensor<1x32x9x9xf32>) {  // broadcast the 32-element %cst_1 along d1; the result is the conv's init value
 53 |     ^bb0(%in: f32, %out: f32):
 54 |       linalg.yield %in : f32
 55 |     } -> tensor<1x32x9x9xf32>
 56 |     %2 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<3> : vector<2xi64>} ins(%arg0, %cst_2 : tensor<1x3x28x28xf32>, tensor<32x3x3x3xf32>) outs(%1 : tensor<1x32x9x9xf32>) -> tensor<1x32x9x9xf32>
 57 |     %3 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%2 : tensor<1x32x9x9xf32>) outs(%0 : tensor<1x32x9x9xf32>) {
 58 |     ^bb0(%in: f32, %out: f32):
 59 |       %19 = arith.cmpf ugt, %in, %cst_3 : f32  // element-wise select against 0.0 (ugt is the unordered-or-greater predicate)
 60 |       %20 = arith.select %19, %in, %cst_3 : f32
 61 |       linalg.yield %20 : f32
 62 |     } -> tensor<1x32x9x9xf32>
 63 |     %4 = tensor.empty() : tensor<1x2x3x3xf32>
 64 |     %5 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%cst : tensor<2xf32>) outs(%4 : tensor<1x2x3x3xf32>) {  // broadcast the 2-element %cst along d1; the result is the second conv's init value
 65 |     ^bb0(%in: f32, %out: f32):
 66 |       linalg.yield %in : f32
 67 |     } -> tensor<1x2x3x3xf32>
 68 |     %6 = linalg.conv_2d_nchw_fchw {dilations = dense<1> : vector<2xi64>, strides = dense<3> : vector<2xi64>} ins(%3, %cst_0 : tensor<1x32x9x9xf32>, tensor<2x32x3x3xf32>) outs(%5 : tensor<1x2x3x3xf32>) -> tensor<1x2x3x3xf32>
 69 |     %7 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%6 : tensor<1x2x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) {
 70 |     ^bb0(%in: f32, %out: f32):
 71 |       %19 = arith.cmpf ugt, %in, %cst_3 : f32
 72 |       %20 = arith.select %19, %in, %cst_3 : f32
 73 |       linalg.yield %20 : f32
 74 |     } -> tensor<1x2x3x3xf32>
 75 |     %8 = tensor.empty() : tensor<1x1x3x3xi64>
 76 |     %9 = linalg.fill ins(%c0_i64 : i64) outs(%8 : tensor<1x1x3x3xi64>) -> tensor<1x1x3x3xi64>
 77 |     %10 = tensor.empty() : tensor<1x1x3x3xf32>
 78 |     %11 = linalg.fill ins(%cst_4 : f32) outs(%10 : tensor<1x1x3x3xf32>) -> tensor<1x1x3x3xf32>
 79 |     %12:2 = linalg.generic {indexing_maps = [#map1, #map3, #map3], iterator_types = ["parallel", "reduction", "parallel", "parallel"]} ins(%7 : tensor<1x2x3x3xf32>) outs(%11, %9 : tensor<1x1x3x3xf32>, tensor<1x1x3x3xi64>) {  // reduction over d1 yielding the running max (#0) and the d1 index where it was found (#1)
 80 |     ^bb0(%in: f32, %out: f32, %out_5: i64):
 81 |       %19 = linalg.index 1 : index
 82 |       %20 = arith.index_cast %19 : index to i64
 83 |       %21 = arith.maximumf %in, %out : f32
 84 |       %22 = arith.cmpf ogt, %in, %out : f32
 85 |       %23 = arith.select %22, %20, %out_5 : i64
 86 |       linalg.yield %21, %23 : f32, i64
 87 |     } -> (tensor<1x1x3x3xf32>, tensor<1x1x3x3xi64>)
 88 |     %13 = linalg.generic {indexing_maps = [#map2, #map4, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%7, %12#0 : tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) {  // subtract the per-position max (%12#0) from %7
 89 |     ^bb0(%in: f32, %in_5: f32, %out: f32):
 90 |       %19 = arith.subf %in, %in_5 : f32
 91 |       linalg.yield %19 : f32
 92 |     } -> tensor<1x2x3x3xf32>
 93 |     %14 = linalg.generic {indexing_maps = [#map2, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13 : tensor<1x2x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) {
 94 |     ^bb0(%in: f32, %out: f32):
 95 |       %19 = math.exp %in : f32
 96 |       linalg.yield %19 : f32
 97 |     } -> tensor<1x2x3x3xf32>
 98 |     %15 = linalg.fill ins(%cst_3 : f32) outs(%10 : tensor<1x1x3x3xf32>) -> tensor<1x1x3x3xf32>
 99 |     %16 = linalg.generic {indexing_maps = [#map1, #map3], iterator_types = ["parallel", "reduction", "parallel", "parallel"]} ins(%14 : tensor<1x2x3x3xf32>) outs(%15 : tensor<1x1x3x3xf32>) {  // sum of the exponentials over d1, starting from the 0.0 fill
100 |     ^bb0(%in: f32, %out: f32):
101 |       %19 = arith.addf %in, %out : f32
102 |       linalg.yield %19 : f32
103 |     } -> tensor<1x1x3x3xf32>
104 |     %17 = linalg.generic {indexing_maps = [#map4, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%16 : tensor<1x1x3x3xf32>) outs(%10 : tensor<1x1x3x3xf32>) {
105 |     ^bb0(%in: f32, %out: f32):
106 |       %19 = math.log %in : f32
107 |       linalg.yield %19 : f32
108 |     } -> tensor<1x1x3x3xf32>
109 |     %18 = linalg.generic {indexing_maps = [#map2, #map4, #map1], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%13, %17 : tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) outs(%4 : tensor<1x2x3x3xf32>) {  // %13 - log(sum(exp(%13))): the log-softmax result
110 |     ^bb0(%in: f32, %in_5: f32, %out: f32):
111 |       %19 = arith.subf %in, %in_5 : f32
112 |       linalg.yield %19 : f32
113 |     } -> tensor<1x2x3x3xf32>
114 |     return %18 : tensor<1x2x3x3xf32>
115 |   }
116 | }
117 | 
118 | tosa 
119 |  module attributes {torch.debug_module_name = "Net"} {
120 |   func.func @forward(%arg0: tensor<1x3x28x28xf32>) -> tensor<1x2x3x3xf32> {  // TOSA forward of module "Net": 1x3x28x28 f32 in, 1x2x3x3 f32 out
121 |     %0 = "tosa.const"() <{value = dense<[-0.0228016917, 0.0496884249]> : tensor<2xf32>}> : () -> tensor<2xf32>
122 |     %1 = "tosa.const"() <{value = dense_resource<__elided__> : tensor<2x32x3x3xf32>}> : () -> tensor<2x32x3x3xf32>
123 |     %2 = "tosa.const"() <{value = dense_resource<__elided__> : tensor<32xf32>}> : () -> tensor<32xf32>
124 |     %3 = "tosa.const"() <{value = dense_resource<__elided__> : tensor<32x3x3x3xf32>}> : () -> tensor<32x3x3x3xf32>
125 |     %4 = "tosa.const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi32>}> : () -> tensor<4xi32>  // permutation that moves dim 1 to the last position (e.g. 1x3x28x28 -> 1x28x28x3 below)
126 |     %5 = "tosa.const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>  // inverse permutation: moves the last dim back to position 1
127 |     %6 = tosa.transpose %arg0, %4 : (tensor<1x3x28x28xf32>, tensor<4xi32>) -> tensor<1x28x28x3xf32>
128 |     %7 = tosa.transpose %3, %4 : (tensor<32x3x3x3xf32>, tensor<4xi32>) -> tensor<32x3x3x3xf32>
129 |     %8 = tosa.conv2d %6, %7, %2 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 3>} : (tensor<1x28x28x3xf32>, tensor<32x3x3x3xf32>, tensor<32xf32>) -> tensor<1x9x9x32xf32>
130 |     %9 = tosa.transpose %8, %5 : (tensor<1x9x9x32xf32>, tensor<4xi32>) -> tensor<1x32x9x9xf32>
131 |     %10 = tosa.clamp %9 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x32x9x9xf32>) -> tensor<1x32x9x9xf32>  // lower bound 0.0 on f32 values
132 |     %11 = tosa.transpose %10, %4 : (tensor<1x32x9x9xf32>, tensor<4xi32>) -> tensor<1x9x9x32xf32>
133 |     %12 = tosa.transpose %1, %4 : (tensor<2x32x3x3xf32>, tensor<4xi32>) -> tensor<2x3x3x32xf32>
134 |     %13 = tosa.conv2d %11, %12, %0 {dilation = array<i64: 1, 1>, pad = array<i64: 0, 0, 0, 0>, stride = array<i64: 3, 3>} : (tensor<1x9x9x32xf32>, tensor<2x3x3x32xf32>, tensor<2xf32>) -> tensor<1x3x3x2xf32>
135 |     %14 = tosa.transpose %13, %5 : (tensor<1x3x3x2xf32>, tensor<4xi32>) -> tensor<1x2x3x3xf32>
136 |     %15 = tosa.clamp %14 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<1x2x3x3xf32>) -> tensor<1x2x3x3xf32>
137 |     %16 = tosa.reduce_max %15 {axis = 1 : i32} : (tensor<1x2x3x3xf32>) -> tensor<1x1x3x3xf32>
138 |     %17 = tosa.sub %15, %16 : (tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) -> tensor<1x2x3x3xf32>  // shift by the axis-1 max
139 |     %18 = tosa.exp %17 : (tensor<1x2x3x3xf32>) -> tensor<1x2x3x3xf32>
140 |     %19 = tosa.reduce_sum %18 {axis = 1 : i32} : (tensor<1x2x3x3xf32>) -> tensor<1x1x3x3xf32>
141 |     %20 = tosa.log %19 : (tensor<1x1x3x3xf32>) -> tensor<1x1x3x3xf32>
142 |     %21 = tosa.sub %17, %20 : (tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) -> tensor<1x2x3x3xf32>  // %17 - log(sum(exp(%17))): log-softmax along axis 1
143 |     return %21 : tensor<1x2x3x3xf32>
144 |   }
145 | }
146 | 
147 | stablehlo 
148 |  module attributes {torch.debug_module_name = "Net"} {
149 |   func.func @forward(%arg0: tensor<1x3x28x28xf32>) -> tensor<1x2x3x3xf32> {  // StableHLO forward of module "Net": 1x3x28x28 f32 in, 1x2x3x3 f32 out
150 |     %0 = stablehlo.constant dense<[-0.0228016917, 0.0496884249]> : tensor<2xf32>
151 |     %1 = stablehlo.constant dense_resource<__elided__> : tensor<2x32x3x3xf32>
152 |     %2 = stablehlo.constant dense_resource<__elided__> : tensor<32xf32>
153 |     %3 = stablehlo.constant dense_resource<__elided__> : tensor<32x3x3x3xf32>
154 |     %4 = stablehlo.constant dense<0xFF800000> : tensor<f32>  // f32 bit pattern of -inf; init value of the max reduction
155 |     %5 = stablehlo.constant dense<0> : tensor<i64>
156 |     %6 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
157 |     %cst = arith.constant dense<[32, 1, 1]> : tensor<3xi64>
158 |     %7 = chlo.constant dense<0.000000e+00> : tensor<1x32x9x9xf32>
159 |     %cst_0 = arith.constant dense<[2, 1, 1]> : tensor<3xi64>
160 |     %8 = chlo.constant dense<0.000000e+00> : tensor<1x2x3x3xf32>
161 |     %cst_1 = arith.constant dense<[1, 2, 3, 3]> : tensor<4xi64>
162 |     %cst_2 = arith.constant dense<[1, 1, 3, 3]> : tensor<4xi64>
163 |     %9 = stablehlo.convolution(%arg0, %3) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [3, 3], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x28x28xf32>, tensor<32x3x3x3xf32>) -> tensor<1x32x9x9xf32>
164 |     %10 = stablehlo.dynamic_reshape %2, %cst : (tensor<32xf32>, tensor<3xi64>) -> tensor<32x1x1xf32>  // reshape the 32-element constant to 32x1x1 so it can be broadcast-added to the conv result
165 |     %11 = chlo.broadcast_add %9, %10 : (tensor<1x32x9x9xf32>, tensor<32x1x1xf32>) -> tensor<1x32x9x9xf32>
166 |     %12 = stablehlo.maximum %11, %7 : tensor<1x32x9x9xf32>  // element-wise max against the all-zero constant %7
167 |     %13 = stablehlo.convolution(%12, %1) dim_numbers = [b, f, 0, 1]x[o, i, 0, 1]->[b, f, 0, 1], window = {stride = [3, 3], pad = [[0, 0], [0, 0]], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x9x9xf32>, tensor<2x32x3x3xf32>) -> tensor<1x2x3x3xf32>
168 |     %14 = stablehlo.dynamic_reshape %0, %cst_0 : (tensor<2xf32>, tensor<3xi64>) -> tensor<2x1x1xf32>
169 |     %15 = chlo.broadcast_add %13, %14 : (tensor<1x2x3x3xf32>, tensor<2x1x1xf32>) -> tensor<1x2x3x3xf32>
170 |     %16 = stablehlo.maximum %15, %8 : tensor<1x2x3x3xf32>
171 |     %17 = stablehlo.dynamic_iota %cst_1, dim = 1 : (tensor<4xi64>) -> tensor<1x2x3x3xi64>  // index values along dim 1, reduced together with %16 below
172 |     %18:2 = stablehlo.reduce(%16 init: %4), (%17 init: %5) across dimensions = [1] : (tensor<1x2x3x3xf32>, tensor<1x2x3x3xi64>, tensor<f32>, tensor<i64>) -> (tensor<1x3x3xf32>, tensor<1x3x3xi64>)  // joint reduction over dim 1: values (#0) and their indices (#1)
173 |      reducer(%arg1: tensor<f32>, %arg3: tensor<f32>) (%arg2: tensor<i64>, %arg4: tensor<i64>)  {
174 |       %26 = stablehlo.compare  GE, %arg1, %arg3,  FLOAT : (tensor<f32>, tensor<f32>) -> tensor<i1>
175 |       %27 = stablehlo.select %26, %arg1, %arg3 : tensor<i1>, tensor<f32>
176 |       %28 = stablehlo.compare  EQ, %arg1, %arg3,  FLOAT : (tensor<f32>, tensor<f32>) -> tensor<i1>
177 |       %29 = stablehlo.minimum %arg2, %arg4 : tensor<i64>
178 |       %30 = stablehlo.select %26, %arg2, %arg4 : tensor<i1>, tensor<i64>
179 |       %31 = stablehlo.select %28, %29, %30 : tensor<i1>, tensor<i64>  // on equal values keep the smaller index, otherwise the index of the selected value
180 |       stablehlo.return %27, %31 : tensor<f32>, tensor<i64>
181 |     }
182 |     %19 = stablehlo.dynamic_reshape %18#0, %cst_2 : (tensor<1x3x3xf32>, tensor<4xi64>) -> tensor<1x1x3x3xf32>
183 |     %20 = chlo.broadcast_subtract %16, %19 : (tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) -> tensor<1x2x3x3xf32>  // shift by the dim-1 max
184 |     %21 = stablehlo.exponential %20 : tensor<1x2x3x3xf32>
185 |     %22 = stablehlo.reduce(%21 init: %6) applies stablehlo.add across dimensions = [1] : (tensor<1x2x3x3xf32>, tensor<f32>) -> tensor<1x3x3xf32>
186 |     %23 = stablehlo.dynamic_reshape %22, %cst_2 : (tensor<1x3x3xf32>, tensor<4xi64>) -> tensor<1x1x3x3xf32>
187 |     %24 = stablehlo.log %23 : tensor<1x1x3x3xf32>
188 |     %25 = chlo.broadcast_subtract %20, %24 : (tensor<1x2x3x3xf32>, tensor<1x1x3x3xf32>) -> tensor<1x2x3x3xf32>  // %20 - log(sum(exp(%20))): log-softmax along dim 1
189 |     return %25 : tensor<1x2x3x3xf32>
190 |   }
191 | }
192 | 
193 | 


--------------------------------------------------------------------------------
/pytorch/torch-script/mlir/linear.mlir:
--------------------------------------------------------------------------------
  1 | torch 
  2 |  module attributes {torch.debug_module_name = "Net"} {
  3 |   func.func @forward(%arg0: !torch.vtensor<[2,128],f32>) -> !torch.vtensor<[2,10],f32> {  // torch-dialect forward of module "Net": [2,128] f32 in, [2,10] f32 out
  4 |     %true = torch.constant.bool true
  5 |     %float1.000000e00 = torch.constant.float 1.000000e+00
  6 |     %none = torch.constant.none
  7 |     %int0 = torch.constant.int 0
  8 |     %int1 = torch.constant.int 1
  9 |     %0 = torch.vtensor.literal(dense<[-0.00630987436, -0.0443928167, 0.0618280694, -0.0368138924, -0.0515485033, 0.00771782547, -0.0303224251, -0.0296016484, 0.0289968103, 0.0607223138]> : tensor<10xf32>) : !torch.vtensor<[10],f32>
 10 |     %1 = torch.vtensor.literal(dense_resource<__elided__> : tensor<10x256xf32>) : !torch.vtensor<[10,256],f32>
 11 |     %2 = torch.vtensor.literal(dense_resource<__elided__> : tensor<256xf32>) : !torch.vtensor<[256],f32>
 12 |     %3 = torch.vtensor.literal(dense_resource<__elided__> : tensor<256x128xf32>) : !torch.vtensor<[256,128],f32>
 13 |     %4 = torch.aten.transpose.int %3, %int0, %int1 : !torch.vtensor<[256,128],f32>, !torch.int, !torch.int -> !torch.vtensor<[128,256],f32>  // swap dims 0 and 1 so the literal can be the right-hand matmul operand
 14 |     %5 = torch.aten.mm %arg0, %4 : !torch.vtensor<[2,128],f32>, !torch.vtensor<[128,256],f32> -> !torch.vtensor<[2,256],f32>
 15 |     %6 = torch.aten.add.Tensor %5, %2, %float1.000000e00 : !torch.vtensor<[2,256],f32>, !torch.vtensor<[256],f32>, !torch.float -> !torch.vtensor<[2,256],f32>  // add the [256] literal to every row
 16 |     %7 = torch.aten.relu %6 : !torch.vtensor<[2,256],f32> -> !torch.vtensor<[2,256],f32>
 17 |     %8 = torch.aten.transpose.int %1, %int0, %int1 : !torch.vtensor<[10,256],f32>, !torch.int, !torch.int -> !torch.vtensor<[256,10],f32>
 18 |     %9 = torch.aten.mm %7, %8 : !torch.vtensor<[2,256],f32>, !torch.vtensor<[256,10],f32> -> !torch.vtensor<[2,10],f32>
 19 |     %10 = torch.aten.add.Tensor %9, %0, %float1.000000e00 : !torch.vtensor<[2,10],f32>, !torch.vtensor<[10],f32>, !torch.float -> !torch.vtensor<[2,10],f32>
 20 |     %11 = torch.aten.relu %10 : !torch.vtensor<[2,10],f32> -> !torch.vtensor<[2,10],f32>
 21 |     %values, %indices = torch.aten.max.dim %11, %int1, %true : !torch.vtensor<[2,10],f32>, !torch.int, !torch.bool -> !torch.vtensor<[2,1],f32>, !torch.vtensor<[2,1],si64>  // max along dim 1; only %values is used below
 22 |     %12 = torch.aten.sub.Tensor %11, %values, %float1.000000e00 : !torch.vtensor<[2,10],f32>, !torch.vtensor<[2,1],f32>, !torch.float -> !torch.vtensor<[2,10],f32>  // shift by the per-row max
 23 |     %13 = torch.aten.exp %12 : !torch.vtensor<[2,10],f32> -> !torch.vtensor<[2,10],f32>
 24 |     %14 = torch.prim.ListConstruct %int1 : (!torch.int) -> !torch.list<int>
 25 |     %15 = torch.aten.sum.dim_IntList %13, %14, %true, %none : !torch.vtensor<[2,10],f32>, !torch.list<int>, !torch.bool, !torch.none -> !torch.vtensor<[2,1],f32>  // sum of the exponentials along dim 1
 26 |     %16 = torch.aten.log %15 : !torch.vtensor<[2,1],f32> -> !torch.vtensor<[2,1],f32>
 27 |     %17 = torch.aten.sub.Tensor %12, %16, %float1.000000e00 : !torch.vtensor<[2,10],f32>, !torch.vtensor<[2,1],f32>, !torch.float -> !torch.vtensor<[2,10],f32>  // %12 - log(sum(exp(%12))): log-softmax along dim 1
 28 |     return %17 : !torch.vtensor<[2,10],f32>
 29 |   }
 30 | }
 31 | 
 32 | linalg-on-tensors 
 #map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d1, d0)>
#map2 = affine_map<(d0, d1) -> (d1)>
#map3 = affine_map<(d0, d1) -> (d0, 0)>
// Indexing maps shared by the linalg.generic ops below:
//   #map  - identity access (d0, d1)
//   #map1 - transposed access (d1, d0)
//   #map2 - 1-D operand indexed by d1 only, i.e. the same vector is reused for every d0
//   #map3 - second index pinned to 0; used for the 2x1 operands (row-wise broadcast / reduction target)
//
// linalg-on-tensors form of @forward: two (matmul + bias + ReLU) layers followed by a
// log-softmax along dimension 1, mapping tensor<2x128xf32> to tensor<2x10xf32>.
module attributes {torch.debug_module_name = "Net"} {
  // Not referenced by @forward in this module.
  ml_program.global private mutable @global_seed(dense<0> : tensor<i64>) : tensor<i64>
  func.func @forward(%arg0: tensor<2x128xf32>) -> tensor<2x10xf32> {
    %c0_i64 = arith.constant 0 : i64
    // %cst / %cst_0: bias (10) and weight (10x256) of the second layer;
    // %cst_1 / %cst_2: bias (256) and weight (256x128) of the first layer.
    // The large constants are elided in this dump (dense_resource<__elided__>).
    %cst = arith.constant dense<[-0.00630987436, -0.0443928167, 0.0618280694, -0.0368138924, -0.0515485033, 0.00771782547, -0.0303224251, -0.0296016484, 0.0289968103, 0.0607223138]> : tensor<10xf32>
    %cst_0 = arith.constant dense_resource<__elided__> : tensor<10x256xf32>
    %cst_1 = arith.constant dense_resource<__elided__> : tensor<256xf32>
    %cst_2 = arith.constant dense_resource<__elided__> : tensor<256x128xf32>
    %cst_3 = arith.constant 0.000000e+00 : f32
    // 0xFF800000 is the f32 bit pattern of -infinity; it seeds the max reduction.
    %cst_4 = arith.constant 0xFF800000 : f32
    // --- Layer 1: transpose the 256x128 weight to 128x256, then matmul into a zero-filled accumulator.
    %0 = tensor.empty() : tensor<128x256xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%cst_2 : tensor<256x128xf32>) outs(%0 : tensor<128x256xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<128x256xf32>
    %2 = tensor.empty() : tensor<2x256xf32>
    %3 = linalg.fill ins(%cst_3 : f32) outs(%2 : tensor<2x256xf32>) -> tensor<2x256xf32>
    %4 = linalg.matmul ins(%arg0, %1 : tensor<2x128xf32>, tensor<128x256xf32>) outs(%3 : tensor<2x256xf32>) -> tensor<2x256xf32>
    // Bias add: the 256-element vector is applied to every row via #map2.
    %5 = linalg.generic {indexing_maps = [#map, #map2, #map], iterator_types = ["parallel", "parallel"]} ins(%4, %cst_1 : tensor<2x256xf32>, tensor<256xf32>) outs(%2 : tensor<2x256xf32>) {
    ^bb0(%in: f32, %in_5: f32, %out: f32):
      %25 = arith.addf %in, %in_5 : f32
      linalg.yield %25 : f32
    } -> tensor<2x256xf32>
    // ReLU: keep %in when it compares greater than 0 (ugt is the unordered predicate,
    // so a NaN input also selects %in), otherwise yield 0.
    %6 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%5 : tensor<2x256xf32>) outs(%2 : tensor<2x256xf32>) {
    ^bb0(%in: f32, %out: f32):
      %25 = arith.cmpf ugt, %in, %cst_3 : f32
      %26 = arith.select %25, %in, %cst_3 : f32
      linalg.yield %26 : f32
    } -> tensor<2x256xf32>
    // --- Layer 2: same pattern with the 10x256 weight (transposed to 256x10) and the 10-element bias.
    %7 = tensor.empty() : tensor<256x10xf32>
    %8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%cst_0 : tensor<10x256xf32>) outs(%7 : tensor<256x10xf32>) {
    ^bb0(%in: f32, %out: f32):
      linalg.yield %in : f32
    } -> tensor<256x10xf32>
    %9 = tensor.empty() : tensor<2x10xf32>
    %10 = linalg.fill ins(%cst_3 : f32) outs(%9 : tensor<2x10xf32>) -> tensor<2x10xf32>
    %11 = linalg.matmul ins(%6, %8 : tensor<2x256xf32>, tensor<256x10xf32>) outs(%10 : tensor<2x10xf32>) -> tensor<2x10xf32>
    %12 = linalg.generic {indexing_maps = [#map, #map2, #map], iterator_types = ["parallel", "parallel"]} ins(%11, %cst : tensor<2x10xf32>, tensor<10xf32>) outs(%9 : tensor<2x10xf32>) {
    ^bb0(%in: f32, %in_5: f32, %out: f32):
      %25 = arith.addf %in, %in_5 : f32
      linalg.yield %25 : f32
    } -> tensor<2x10xf32>
    %13 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<2x10xf32>) outs(%9 : tensor<2x10xf32>) {
    ^bb0(%in: f32, %out: f32):
      %25 = arith.cmpf ugt, %in, %cst_3 : f32
      %26 = arith.select %25, %in, %cst_3 : f32
      linalg.yield %26 : f32
    } -> tensor<2x10xf32>
    // --- Log-softmax along d1.
    // Row-wise max (seeded with -inf) together with its index (seeded with 0).
    // Only the max (%18#0) is used afterwards; the index result %18#1 has no users here.
    %14 = tensor.empty() : tensor<2x1xi64>
    %15 = linalg.fill ins(%c0_i64 : i64) outs(%14 : tensor<2x1xi64>) -> tensor<2x1xi64>
    %16 = tensor.empty() : tensor<2x1xf32>
    %17 = linalg.fill ins(%cst_4 : f32) outs(%16 : tensor<2x1xf32>) -> tensor<2x1xf32>
    %18:2 = linalg.generic {indexing_maps = [#map, #map3, #map3], iterator_types = ["parallel", "reduction"]} ins(%13 : tensor<2x10xf32>) outs(%17, %15 : tensor<2x1xf32>, tensor<2x1xi64>) {
    ^bb0(%in: f32, %out: f32, %out_5: i64):
      %25 = linalg.index 1 : index
      %26 = arith.index_cast %25 : index to i64
      %27 = arith.maximumf %in, %out : f32
      // The index is replaced only on a strictly greater value (ogt).
      %28 = arith.cmpf ogt, %in, %out : f32
      %29 = arith.select %28, %26, %out_5 : i64
      linalg.yield %27, %29 : f32, i64
    } -> (tensor<2x1xf32>, tensor<2x1xi64>)
    // x - max(x), with the 2x1 max broadcast across each row.
    %19 = linalg.generic {indexing_maps = [#map, #map3, #map], iterator_types = ["parallel", "parallel"]} ins(%13, %18#0 : tensor<2x10xf32>, tensor<2x1xf32>) outs(%9 : tensor<2x10xf32>) {
    ^bb0(%in: f32, %in_5: f32, %out: f32):
      %25 = arith.subf %in, %in_5 : f32
      linalg.yield %25 : f32
    } -> tensor<2x10xf32>
    // exp of the shifted values.
    %20 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%19 : tensor<2x10xf32>) outs(%9 : tensor<2x10xf32>) {
    ^bb0(%in: f32, %out: f32):
      %25 = math.exp %in : f32
      linalg.yield %25 : f32
    } -> tensor<2x10xf32>
    // Row-wise sum of the exponentials into a zero-filled 2x1 accumulator.
    %21 = linalg.fill ins(%cst_3 : f32) outs(%16 : tensor<2x1xf32>) -> tensor<2x1xf32>
    %22 = linalg.generic {indexing_maps = [#map, #map3], iterator_types = ["parallel", "reduction"]} ins(%20 : tensor<2x10xf32>) outs(%21 : tensor<2x1xf32>) {
    ^bb0(%in: f32, %out: f32):
      %25 = arith.addf %in, %out : f32
      linalg.yield %25 : f32
    } -> tensor<2x1xf32>
    // log of the row sums.
    %23 = linalg.generic {indexing_maps = [#map3, #map], iterator_types = ["parallel", "parallel"]} ins(%22 : tensor<2x1xf32>) outs(%16 : tensor<2x1xf32>) {
    ^bb0(%in: f32, %out: f32):
      %25 = math.log %in : f32
      linalg.yield %25 : f32
    } -> tensor<2x1xf32>
    // Result: (x - max) - log(sum(exp(x - max))).
    %24 = linalg.generic {indexing_maps = [#map, #map3, #map], iterator_types = ["parallel", "parallel"]} ins(%19, %23 : tensor<2x10xf32>, tensor<2x1xf32>) outs(%9 : tensor<2x10xf32>) {
    ^bb0(%in: f32, %in_5: f32, %out: f32):
      %25 = arith.subf %in, %in_5 : f32
      linalg.yield %25 : f32
    } -> tensor<2x10xf32>
    return %24 : tensor<2x10xf32>
  }
}
127 | 
128 | tosa 
 module attributes {torch.debug_module_name = "Net"} {
  // TOSA form of @forward: two (matmul + bias + clamp-at-zero) layers followed by a
  // log-softmax along axis 1, mapping tensor<2x128xf32> to tensor<2x10xf32>.
  func.func @forward(%arg0: tensor<2x128xf32>) -> tensor<2x10xf32> {
    // %0 / %1: layer weights (elided in this dump); %3 / %4: biases already shaped 1xN
    // so tosa.add can broadcast them over the two rows.
    %0 = "tosa.const"() <{value = dense_resource<__elided__> : tensor<10x256xf32>}> : () -> tensor<10x256xf32>
    %1 = "tosa.const"() <{value = dense_resource<__elided__> : tensor<256x128xf32>}> : () -> tensor<256x128xf32>
    // Permutation [1, 0] used by both tosa.transpose ops below.
    %2 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
    %3 = "tosa.const"() <{value = dense<[[-0.00630987436, -0.0443928167, 0.0618280694, -0.0368138924, -0.0515485033, 0.00771782547, -0.0303224251, -0.0296016484, 0.0289968103, 0.0607223138]]> : tensor<1x10xf32>}> : () -> tensor<1x10xf32>
    %4 = "tosa.const"() <{value = dense_resource<__elided__> : tensor<1x256xf32>}> : () -> tensor<1x256xf32>
    // --- Layer 1. Operands are reshaped to rank 3 (leading dim 1) around tosa.matmul
    // and the result is reshaped back to rank 2.
    %5 = tosa.transpose %1, %2 : (tensor<256x128xf32>, tensor<2xi32>) -> tensor<128x256xf32>
    %6 = tosa.reshape %arg0 {new_shape = array<i64: 1, 2, 128>} : (tensor<2x128xf32>) -> tensor<1x2x128xf32>
    %7 = tosa.reshape %5 {new_shape = array<i64: 1, 128, 256>} : (tensor<128x256xf32>) -> tensor<1x128x256xf32>
    %8 = tosa.matmul %6, %7 : (tensor<1x2x128xf32>, tensor<1x128x256xf32>) -> tensor<1x2x256xf32>
    %9 = tosa.reshape %8 {new_shape = array<i64: 2, 256>} : (tensor<1x2x256xf32>) -> tensor<2x256xf32>
    %10 = tosa.add %9, %4 : (tensor<2x256xf32>, tensor<1x256xf32>) -> tensor<2x256xf32>
    // ReLU expressed as a clamp to [0, 3.40282347E+38].
    %11 = tosa.clamp %10 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<2x256xf32>) -> tensor<2x256xf32>
    // --- Layer 2: same pattern with the 10x256 weight and the 1x10 bias.
    %12 = tosa.transpose %0, %2 : (tensor<10x256xf32>, tensor<2xi32>) -> tensor<256x10xf32>
    %13 = tosa.reshape %11 {new_shape = array<i64: 1, 2, 256>} : (tensor<2x256xf32>) -> tensor<1x2x256xf32>
    %14 = tosa.reshape %12 {new_shape = array<i64: 1, 256, 10>} : (tensor<256x10xf32>) -> tensor<1x256x10xf32>
    %15 = tosa.matmul %13, %14 : (tensor<1x2x256xf32>, tensor<1x256x10xf32>) -> tensor<1x2x10xf32>
    %16 = tosa.reshape %15 {new_shape = array<i64: 2, 10>} : (tensor<1x2x10xf32>) -> tensor<2x10xf32>
    %17 = tosa.add %16, %3 : (tensor<2x10xf32>, tensor<1x10xf32>) -> tensor<2x10xf32>
    %18 = tosa.clamp %17 {max_fp = 3.40282347E+38 : f32, max_int = 2147483647 : i64, min_fp = 0.000000e+00 : f32, min_int = 0 : i64} : (tensor<2x10xf32>) -> tensor<2x10xf32>
    // --- Log-softmax along axis 1: (x - max) - log(sum(exp(x - max))).
    %19 = tosa.reduce_max %18 {axis = 1 : i32} : (tensor<2x10xf32>) -> tensor<2x1xf32>
    %20 = tosa.sub %18, %19 : (tensor<2x10xf32>, tensor<2x1xf32>) -> tensor<2x10xf32>
    %21 = tosa.exp %20 : (tensor<2x10xf32>) -> tensor<2x10xf32>
    %22 = tosa.reduce_sum %21 {axis = 1 : i32} : (tensor<2x10xf32>) -> tensor<2x1xf32>
    %23 = tosa.log %22 : (tensor<2x1xf32>) -> tensor<2x1xf32>
    %24 = tosa.sub %20, %23 : (tensor<2x10xf32>, tensor<2x1xf32>) -> tensor<2x10xf32>
    return %24 : tensor<2x10xf32>
  }
}
159 | 
160 | stablehlo 
 module attributes {torch.debug_module_name = "Net"} {
  // StableHLO form of @forward: two (dot + bias + max-with-zero) layers followed by a
  // log-softmax along dimension 1, mapping tensor<2x128xf32> to tensor<2x10xf32>.
  func.func @forward(%arg0: tensor<2x128xf32>) -> tensor<2x10xf32> {
    // %0 / %1: bias and weight of the second layer; %2 / %3: bias and weight of the first.
    // The large constants are elided in this dump.
    %0 = stablehlo.constant dense<[-0.00630987436, -0.0443928167, 0.0618280694, -0.0368138924, -0.0515485033, 0.00771782547, -0.0303224251, -0.0296016484, 0.0289968103, 0.0607223138]> : tensor<10xf32>
    %1 = stablehlo.constant dense_resource<__elided__> : tensor<10x256xf32>
    %2 = stablehlo.constant dense_resource<__elided__> : tensor<256xf32>
    %3 = stablehlo.constant dense_resource<__elided__> : tensor<256x128xf32>
    // Reduction seeds: -inf (bit pattern 0xFF800000) for the max, 0 for the index, 0.0 for the sum.
    %4 = stablehlo.constant dense<0xFF800000> : tensor<f32>
    %5 = stablehlo.constant dense<0> : tensor<i64>
    %6 = stablehlo.constant dense<0.000000e+00> : tensor<f32>
    // All-zero tensors used as the second operand of stablehlo.maximum (ReLU).
    %7 = chlo.constant dense<0.000000e+00> : tensor<2x256xf32>
    %8 = chlo.constant dense<0.000000e+00> : tensor<2x10xf32>
    // Shape operands for dynamic_iota ([2, 10]) and dynamic_reshape ([2, 1]).
    %cst = arith.constant dense<[2, 10]> : tensor<2xi64>
    %cst_0 = arith.constant dense<[2, 1]> : tensor<2xi64>
    // --- Layer 1.
    %9 = stablehlo.transpose %3, dims = [1, 0] : (tensor<256x128xf32>) -> tensor<128x256xf32>
    %10 = stablehlo.dot %arg0, %9 : (tensor<2x128xf32>, tensor<128x256xf32>) -> tensor<2x256xf32>
    %11 = chlo.broadcast_add %10, %2 : (tensor<2x256xf32>, tensor<256xf32>) -> tensor<2x256xf32>
    %12 = stablehlo.maximum %11, %7 : tensor<2x256xf32>
    // --- Layer 2.
    %13 = stablehlo.transpose %1, dims = [1, 0] : (tensor<10x256xf32>) -> tensor<256x10xf32>
    %14 = stablehlo.dot %12, %13 : (tensor<2x256xf32>, tensor<256x10xf32>) -> tensor<2x10xf32>
    %15 = chlo.broadcast_add %14, %0 : (tensor<2x10xf32>, tensor<10xf32>) -> tensor<2x10xf32>
    %16 = stablehlo.maximum %15, %8 : tensor<2x10xf32>
    // --- Log-softmax along dimension 1.
    // Index tensor along dim 1, reduced together with the values so the reduction yields
    // both the row max (%18#0) and its index (%18#1). %18#1 has no users in this function.
    %17 = stablehlo.dynamic_iota %cst, dim = 1 : (tensor<2xi64>) -> tensor<2x10xi64>
    %18:2 = stablehlo.reduce(%16 init: %4), (%17 init: %5) across dimensions = [1] : (tensor<2x10xf32>, tensor<2x10xi64>, tensor<f32>, tensor<i64>) -> (tensor<2xf32>, tensor<2xi64>)
     reducer(%arg1: tensor<f32>, %arg3: tensor<f32>) (%arg2: tensor<i64>, %arg4: tensor<i64>)  {
      // Value: pick %arg1 when %arg1 >= %arg3, else %arg3.
      %26 = stablehlo.compare  GE, %arg1, %arg3,  FLOAT : (tensor<f32>, tensor<f32>) -> tensor<i1>
      %27 = stablehlo.select %26, %arg1, %arg3 : tensor<i1>, tensor<f32>
      // Index: on equal values take the smaller index, otherwise the index of the chosen value.
      %28 = stablehlo.compare  EQ, %arg1, %arg3,  FLOAT : (tensor<f32>, tensor<f32>) -> tensor<i1>
      %29 = stablehlo.minimum %arg2, %arg4 : tensor<i64>
      %30 = stablehlo.select %26, %arg2, %arg4 : tensor<i1>, tensor<i64>
      %31 = stablehlo.select %28, %29, %30 : tensor<i1>, tensor<i64>
      stablehlo.return %27, %31 : tensor<f32>, tensor<i64>
    }
    // Reshape the rank-1 max to 2x1 so it broadcasts across each row in the subtraction.
    %19 = stablehlo.dynamic_reshape %18#0, %cst_0 : (tensor<2xf32>, tensor<2xi64>) -> tensor<2x1xf32>
    %20 = chlo.broadcast_subtract %16, %19 : (tensor<2x10xf32>, tensor<2x1xf32>) -> tensor<2x10xf32>
    %21 = stablehlo.exponential %20 : tensor<2x10xf32>
    %22 = stablehlo.reduce(%21 init: %6) applies stablehlo.add across dimensions = [1] : (tensor<2x10xf32>, tensor<f32>) -> tensor<2xf32>
    %23 = stablehlo.dynamic_reshape %22, %cst_0 : (tensor<2xf32>, tensor<2xi64>) -> tensor<2x1xf32>
    %24 = stablehlo.log %23 : tensor<2x1xf32>
    // Result: (x - max) - log(sum(exp(x - max))).
    %25 = chlo.broadcast_subtract %20, %24 : (tensor<2x10xf32>, tensor<2x1xf32>) -> tensor<2x10xf32>
    return %25 : tensor<2x10xf32>
  }
}
203 | 
204 | 


--------------------------------------------------------------------------------
/pytorch/torch-script/models/conv.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | import sys
 8 | sys.path.append('../../lib')
 9 | from torch_mlir_compile import torch_mlir_compile
10 | 
class Net(nn.Module):
    """Small convolutional network: conv -> ReLU -> conv -> ReLU -> log-softmax.

    Both convolutions use a 3x3 kernel with stride 3. The log-softmax is taken
    over dimension 1 of the second convolution's output.
    """

    def __init__(self):
        super().__init__()
        # conv1 is created before conv2, so parameter registration order is conv1, conv2.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=3)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=2, kernel_size=3, stride=3)

    def forward(self, x):
        """Return the log-softmax (over dim 1) of the two-stage conv/ReLU pipeline applied to x."""
        hidden = F.relu(self.conv1(x))
        activated = F.relu(self.conv2(hidden))
        return F.log_softmax(activated, dim=1)
24 | 
def main():
    """Build the network on the CPU and hand it to torch_mlir_compile with an all-ones 1x3x28x28 input."""
    model = Net().to(torch.device("cpu"))
    example_input = torch.ones(1, 3, 28, 28)
    # No third argument is passed here (resnet18.py passes a list of dialect names there);
    # original author's note: any other dialect segfaults.
    torch_mlir_compile(model, example_input)


if __name__ == '__main__':
    main()
33 | 


--------------------------------------------------------------------------------
/pytorch/torch-script/models/linear.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | import sys
 8 | sys.path.append('../../lib')
 9 | from torch_mlir_compile import torch_mlir_compile
10 | 
class Net(nn.Module):
    """Two-layer perceptron: linear -> ReLU -> linear -> ReLU -> log-softmax.

    Maps 128 input features to 256 hidden features and then to 10 outputs.
    The log-softmax is taken over dimension 1.
    """

    def __init__(self):
        super().__init__()
        # fc1 is created before fc2, so parameter registration order is fc1, fc2.
        self.fc1 = nn.Linear(in_features=128, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=10)

    def forward(self, x):
        """Return the log-softmax (over dim 1) of the two-stage linear/ReLU pipeline applied to x."""
        hidden = F.relu(self.fc1(x))
        activated = F.relu(self.fc2(hidden))
        return F.log_softmax(activated, dim=1)
24 | 
def main():
    """Build the network on the CPU and hand it to torch_mlir_compile with an all-ones 2x128 input."""
    model = Net().to(torch.device("cpu"))
    example_input = torch.ones(2, 128)
    torch_mlir_compile(model, example_input)


if __name__ == '__main__':
    main()
32 | 


--------------------------------------------------------------------------------
/pytorch/torch-script/models/resnet18.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torchvision
 3 | 
 4 | import sys
 5 | sys.path.append('../../lib')
 6 | from torch_mlir_compile import torch_mlir_compile
 7 | 
 8 | resnet18 = torchvision.models.resnet18(weights=torchvision.models.resnet.ResNet18_Weights.IMAGENET1K_V1)
 9 | resnet18.eval()
10 | 
11 | torch_mlir_compile(resnet18, torch.ones(1, 3, 224, 224),['torch', 'linalg-on-tensors', 'tosa'])


--------------------------------------------------------------------------------
/tensorflow/mlir/conv.mlir:
--------------------------------------------------------------------------------
module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1087 : i32}} {
  // TF-dialect graph: Conv2D + BiasAdd + Relu, Conv2D + BiasAdd + Relu, then MaxPool.
  // Only the batch dimension (2) of the result is static; every variable read is
  // unranked (tensor<*xf32>), so the remaining output dimensions are '?'.
  func @main(%arg0: tensor<2x28x28x3xf32>) -> tensor<2x?x?x?xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "args_0:0", outputs = "Identity:0"}} {
    // Every tf.ReadVariableOp below reads from this single placeholder resource.
    %0 = "tf.Placeholder"() {device = "", shape = #tf_type.shape<>} : () -> tensor<!tf_type.resource>
    // %1 is used as the bias and %2 as the filter of the first convolution.
    %1 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %2 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %3 = "tf.Conv2D"(%arg0, %2) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<2x28x28x3xf32>, tensor<*xf32>) -> tensor<2x?x?x?xf32>
    %4 = "tf.BiasAdd"(%3, %1) {data_format = "NHWC", device = ""} : (tensor<2x?x?x?xf32>, tensor<*xf32>) -> tensor<2x?x?x?xf32>
    %5 = "tf.Relu"(%4) {device = ""} : (tensor<2x?x?x?xf32>) -> tensor<2x?x?x?xf32>
    // %6 is used as the bias and %7 as the filter of the second convolution.
    %6 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %7 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %8 = "tf.Conv2D"(%5, %7) {data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<2x?x?x?xf32>, tensor<*xf32>) -> tensor<2x?x?x?xf32>
    %9 = "tf.BiasAdd"(%8, %6) {data_format = "NHWC", device = ""} : (tensor<2x?x?x?xf32>, tensor<*xf32>) -> tensor<2x?x?x?xf32>
    %10 = "tf.Relu"(%9) {device = ""} : (tensor<2x?x?x?xf32>) -> tensor<2x?x?x?xf32>
    // 23x23 max-pool window with stride 1 and VALID padding.
    %11 = "tf.MaxPool"(%10) {data_format = "NHWC", device = "", explicit_paddings = [], ksize = [1, 23, 23, 1], padding = "VALID", strides = [1, 1, 1, 1]} : (tensor<2x?x?x?xf32>) -> tensor<2x?x?x?xf32>
    %12 = "tf.Identity"(%11) {device = ""} : (tensor<2x?x?x?xf32>) -> tensor<2x?x?x?xf32>
    return %12 : tensor<2x?x?x?xf32>
  }
}
19 | 
20 | 


--------------------------------------------------------------------------------
/tensorflow/mlir/linear.mlir:
--------------------------------------------------------------------------------
module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1087 : i32}} {
  // TF-dialect graph: MatMul + Relu, MatMul + BiasAdd + Relu, then a final MatMul.
  // Only the batch dimension (2) of the result is static; every variable read is
  // unranked (tensor<*xf32>), so the feature dimension is '?'.
  func @main(%arg0: tensor<2x16xf32>) -> tensor<2x?xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "args_0:0", outputs = "Identity:0"}} {
    // Every tf.ReadVariableOp below reads from this single placeholder resource.
    %0 = "tf.Placeholder"() {device = "", shape = #tf_type.shape<>} : () -> tensor<!tf_type.resource>
    // First layer: matmul followed directly by Relu (no BiasAdd).
    %1 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %2 = "tf.MatMul"(%arg0, %1) {device = "", transpose_a = false, transpose_b = false} : (tensor<2x16xf32>, tensor<*xf32>) -> tensor<2x?xf32>
    %3 = "tf.Relu"(%2) {device = ""} : (tensor<2x?xf32>) -> tensor<2x?xf32>
    // Second layer: %5 is the matmul operand, %4 the bias.
    %4 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %5 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %6 = "tf.MatMul"(%3, %5) {device = "", transpose_a = false, transpose_b = false} : (tensor<2x?xf32>, tensor<*xf32>) -> tensor<2x?xf32>
    %7 = "tf.BiasAdd"(%6, %4) {data_format = "NHWC", device = ""} : (tensor<2x?xf32>, tensor<*xf32>) -> tensor<2x?xf32>
    %8 = "tf.Relu"(%7) {device = ""} : (tensor<2x?xf32>) -> tensor<2x?xf32>
    // Output layer: matmul only, no bias and no activation.
    %9 = "tf.ReadVariableOp"(%0) {device = ""} : (tensor<!tf_type.resource>) -> tensor<*xf32>
    %10 = "tf.MatMul"(%8, %9) {device = "", transpose_a = false, transpose_b = false} : (tensor<2x?xf32>, tensor<*xf32>) -> tensor<2x?xf32>
    %11 = "tf.Identity"(%10) {device = ""} : (tensor<2x?xf32>) -> tensor<2x?xf32>
    return %11 : tensor<2x?xf32>
  }
}
18 | 
19 | 


--------------------------------------------------------------------------------
/tensorflow/models/conv.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 3 | import tensorflow as tf
 4 | from tensorflow.python.pywrap_mlir import import_graphdef
 5 | 
 6 | class MyModel(tf.keras.Model):
 7 |     def build(self, input_shape):
 8 |         # https://www.tensorflow.org/api_docs/python/tf/keras/layers/Conv2D
 9 |         self.x = tf.keras.layers.InputLayer(input_shape=input_shape, batch_size=2)
10 |         self.c1 = tf.keras.layers.Conv2D(2, 3, activation='relu')
11 |         self.c2 = tf.keras.layers.Conv2D(2, 3, activation='relu', use_bias=True)
12 |         # https://www.tensorflow.org/api_docs/python/tf/keras/layers/MaxPool2D
13 |         self.out = tf.keras.layers.MaxPool2D(pool_size=(23, 23), strides=(1, 1))
14 | 
15 |     def call(self, x):
16 |         x = self.x(x)
17 |         x = self.c1(x)
18 |         x = self.c2(x)
19 |         x = self.out(x)
20 |         return x
21 | 
22 | 
if __name__ == "__main__":
    # Input is laid out as NHWC.
    input_shape = (2, 28, 28, 3)
    spec = tf.TensorSpec(shape=input_shape, dtype=tf.float32)

    model = MyModel(spec)

    # Disabled training smoke test, kept for reference:
    # model.compile(optimizer='sgd', loss='mse')
    # model.fit(
    #     tf.constant(tf.ones(shape=input_shape), dtype=tf.float32),
    #     tf.constant(tf.ones(shape=(2, 2), dtype=tf.float32)))

    # Trace the model once with an all-ones input to obtain a concrete graph.
    traced = tf.function(model, input_signature=[spec])
    example_input = tf.constant(tf.ones(shape=input_shape, dtype=tf.float32))
    concrete_func = traced.get_concrete_function(example_input)

    # Import the GraphDef into MLIR and print the resulting module text.
    graph = concrete_func.graph.as_graph_def()
    import_options = dict(
        input_names=["args_0:0"],
        input_data_types=["DT_FLOAT"],
        input_data_shapes=["2,28,28,3"],
        output_names=["Identity:0"],
    )
    mlir_tf = import_graphdef(graph, "tf-standard-pipeline", False, **import_options)
    print(mlir_tf)


--------------------------------------------------------------------------------
/tensorflow/models/linear.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
 3 | import tensorflow as tf
 4 | from tensorflow.python.pywrap_mlir import import_graphdef
 5 | 
 6 | class MyModel(tf.keras.Model):
 7 |     def build(self, input_shape):
 8 |         self.x = tf.keras.layers.InputLayer(input_shape=input_shape, batch_size=2)
 9 |         self.d1 = tf.keras.layers.Dense(32, activation='relu', use_bias=False)
10 |         self.d2 = tf.keras.layers.Dense(32, activation='relu', use_bias=True)
11 |         self.out = tf.keras.layers.Dense(10, use_bias=False)
12 | 
13 |     def call(self, x):
14 |         x = self.x(x)
15 |         x = self.d1(x)
16 |         x = self.d2(x)
17 |         x = self.out(x)
18 |         return x
19 | 
if __name__ == "__main__":
    # Shapes of the model input and of the (currently unused) training target.
    input_shape = (2, 16)
    output_shape = (2, 10)
    spec = tf.TensorSpec(shape=input_shape, dtype=tf.float32)

    # See: https://github.com/tensorflow/tensorflow/issues/50521
    model = MyModel(spec)

    # Disabled training smoke test, kept for reference:
    # print("First make sure that the model has the right shape, and input flows through output")
    # # https://www.tensorflow.org/api_docs/python/tf/keras/Sequential
    # model.compile(optimizer='sgd', loss='mse')
    # model.fit(
    #     tf.constant(tf.ones(shape=input_shape), dtype=tf.float32),
    #     tf.constant(tf.ones(shape=output_shape, dtype=tf.float32)))

    # Trace the model once with an all-ones input to obtain a concrete graph.
    # https://www.tensorflow.org/api_docs/python/tf/function#input_signatures_2
    traced = tf.function(model, input_signature=[spec])
    example_input = tf.constant(tf.ones(shape=input_shape, dtype=tf.float32))
    concrete_func = traced.get_concrete_function(example_input)

    # Basically what convert_graph_def should do: import the GraphDef into MLIR and print it.
    graph = concrete_func.graph.as_graph_def()
    import_options = dict(
        input_names=["args_0:0"],
        input_data_types=["DT_FLOAT"],
        input_data_shapes=["2,16"],
        output_names=["Identity:0"],
    )
    mlir_tf = import_graphdef(graph, "tf-standard-pipeline", False, **import_options)
    print(mlir_tf)
53 | 


--------------------------------------------------------------------------------
/tensorflow/prepare.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | # Installs TensorFlow on a virtual environment
 4 | 
 5 | # Run on container/remote directly, need to check
 6 | PROJECT="mlir-generator"
 7 | if [ -d "$PROJECT" ]; then
 8 |   cd "$PROJECT"
 9 | fi
10 | 
11 | # Go into tensorflow subrepo
12 | # This is created on the fly, not a submodule
13 | ROOT="$(git rev-parse --show-toplevel)/external/tensorflow"
14 | rm -rf "$ROOT"
15 | mkdir -p "$ROOT"
16 | pushd "$ROOT"
17 | 
18 | # Always grab a fresh env environment
19 | echo " + Creating a fresh venv"
20 | rm -rf mlir_venv
21 | python -m venv mlir_venv
22 | source mlir_venv/bin/activate
23 | 
24 | # Install Python dependencies
25 | echo " + Install Python dependencies"
26 | python -m pip install --upgrade pip
27 | python -m pip install tensorflow keras
28 | 
29 | popd
30 | 


--------------------------------------------------------------------------------