├── tests ├── __init__.py ├── ops │ ├── __init__.py │ ├── test_identity.py │ ├── test_softmax.py │ ├── test_elementwise.py │ ├── test_concat.py │ ├── test_transpose.py │ ├── test_conv.py │ ├── test_pooling.py │ ├── test_broadcastable.py │ └── test_gemm.py ├── util.py ├── conftest.py ├── zoo.py └── test_zoo.py ├── evaluation ├── __init__.py ├── .gitignore ├── results │ ├── 10th-ice-lake.png │ ├── 6th-skylake.png │ ├── 10th-comet-lake.png │ ├── 11th-tiger-lake.jpg │ └── analysis.Rmd ├── random_sample.py ├── setup.py ├── evaluate.py ├── results_conv │ ├── analysis.Rmd │ ├── 6th.csv │ └── 10th.csv ├── results_gemm │ ├── analysis.Rmd │ ├── 10th.csv │ └── 6th.csv ├── measure_gemm.py ├── measure_conv.py ├── measure_models.py ├── eval_tilings.py ├── find_best_tiling_params.py ├── measure.py └── results_gemm.csv ├── onnx2code ├── __init__.py ├── ops │ ├── __init__.py │ ├── gemm_tiling │ │ ├── microkernel_ref.cpp │ │ ├── GEMM.py │ │ ├── microkernel_test.cpp │ │ ├── gpackB.cpp │ │ ├── gpackA.cpp │ │ └── gemm.cpp │ ├── identity.py │ ├── transpose.py │ ├── concat.py │ ├── softmax.py │ ├── elementwise.py │ ├── broadcastable.py │ ├── pooling.py │ ├── operation.py │ ├── gemm.py │ └── conv.py ├── result.py ├── debugger.c ├── service.c ├── checker.py ├── __main__.py ├── tensor.py ├── memory.py ├── util.py ├── service.py └── generator.py ├── pyproject.toml ├── pytest.ini ├── preliminar ├── models-df.pkl └── build_dataset.ipynb ├── docs └── TP Final onnx2code.pdf ├── .flake8 ├── .gitattributes ├── .gitignore ├── Pipfile ├── Dockerfile ├── Makefile ├── .vscode └── settings.json ├── .github └── workflows │ └── ci.yml └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /onnx2code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | 4 | -------------------------------------------------------------------------------- /evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.html 3 | results_*.csv 4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --no-header -v -W ignore::DeprecationWarning --ignore=data 3 | -------------------------------------------------------------------------------- /preliminar/models-df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/preliminar/models-df.pkl -------------------------------------------------------------------------------- /docs/TP Final onnx2code.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/docs/TP Final onnx2code.pdf -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = E203, E266, E402, E501, W503, F841 4 | exclude = 5 | data 6 | -------------------------------------------------------------------------------- /evaluation/results/10th-ice-lake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/10th-ice-lake.png -------------------------------------------------------------------------------- /evaluation/results/6th-skylake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/6th-skylake.png -------------------------------------------------------------------------------- /evaluation/results/10th-comet-lake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/10th-comet-lake.png -------------------------------------------------------------------------------- /evaluation/results/11th-tiger-lake.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/11th-tiger-lake.jpg -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # ignore notebooks 5 | *.ipynb linguist-vendored 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache files 2 | .mypy_cache/ 3 | .ipynb_checkpoints/ 4 | .pytest_cache/ 5 | __pycache__/ 6 | 7 | # Data files 8 | **/tmp 9 | data 10 | output/ 11 | results.csv 12 | -------------------------------------------------------------------------------- /onnx2code/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from . 
import ( 4 | broadcastable, 5 | concat, 6 | conv, 7 | elementwise, 8 | gemm, 9 | identity, 10 | operation, 11 | pooling, 12 | softmax, 13 | transpose, 14 | ) 15 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tf2onnx 3 | 4 | from onnx2code.checker import check_model 5 | 6 | 7 | def check_keras(model: tf.keras.Model, variations: list[str] = []) -> None: 8 | model_proto, _ = tf2onnx.convert.from_keras(model) 9 | check_model(model_proto, variations) 10 | -------------------------------------------------------------------------------- /onnx2code/result.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from .tensor import TensorData 4 | from .util import ShapesMap 5 | 6 | 7 | @dataclass 8 | class ModelResult: 9 | input_shapes: ShapesMap 10 | output_shapes: ShapesMap 11 | source_c: str 12 | source_h: str 13 | source_asm: str 14 | weights: TensorData 15 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | 4 | def shapes_id(shape: list[int]) -> str: 5 | return f"""({",".join(map(str, shape))})""" 6 | 7 | 8 | def pytest_make_parametrize_id(config: Any, val: Any, argname: str) -> str | None: 9 | if argname.startswith("shape"): 10 | return shapes_id(val) 11 | return None 12 | -------------------------------------------------------------------------------- /evaluation/random_sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | 5 | # Read a single int from args 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("N", type=int) 8 | 9 | args = parser.parse_args() 10 | 11 | # Generate a random float array of size N 12 | arr = np.random.uniform(size=args.N).astype(np.float32) 13 | 14 | # Write the array to a file 15 | with open("random.bin", "wb") as f: 16 | f.write(arr.tobytes()) 17 | -------------------------------------------------------------------------------- /tests/ops/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [[1], [2, 3], [4, 5, 6]]) 8 | @pytest.mark.parametrize("variation", ["c"]) # , "asm" 9 | def test_identity(variation: str, shape: list[int]) -> None: 10 | input = tf.keras.Input(shape) 11 | out = tf.keras.layers.Lambda(lambda x: x)(input) 12 | model = tf.keras.Model(inputs=[input], outputs=[out]) 13 | check_keras(model, [variation]) 14 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | numpy = "*" 8 | onnx = "*" 9 | onnx-simplifier = "*" 10 | onnxruntime = "*" 11 | pytest = "*" 12 | tensorflow = "*" 13 | tf2onnx = ">=1.12.1" 14 | matplotlib = "*" 15 | pandas = "*" 16 | black = "==22.10.0" 17 | tqdm = "*" 18 | 19 | [dev-packages] 20 | mypy = "*" 21 | flake8 = "*" 22 | black = "*" 23 | ipykernel = "*" 24 | isort = "*" 25 | 26 | [requires] 27 | python_version = "3.10" 28 
| 29 | [pipenv] 30 | allow_prereleases = true 31 | -------------------------------------------------------------------------------- /tests/ops/test_softmax.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "shape", 9 | [[1], [2, 3], [4, 5, 6]], 10 | ) 11 | @pytest.mark.parametrize("axis", [-1, 1, 2]) 12 | def test_softmax(shape: list[int], axis: int) -> None: 13 | input = tf.keras.Input(shape) 14 | try: 15 | output = tf.keras.layers.Softmax(axis=axis)(input) 16 | model = tf.keras.Model(inputs=[input], outputs=[output]) 17 | except Exception: 18 | pytest.skip("incompatible configuration") 19 | 20 | check_keras(model) 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | # install deps 4 | RUN apt-get update && apt-get install -y --no-install-recommends gcc nasm 5 | RUN pip install pipenv 6 | 7 | # install libxsmm @ 4e1aa533 8 | RUN git clone https://github.com/libxsmm/libxsmm 9 | WORKDIR /libxsmm 10 | RUN git checkout 4e1aa5332123088916989651ae9b187ecba377dc 11 | RUN make generator 12 | ENV PATH="/libxsmm/bin:${PATH}" 13 | 14 | # install onnx2code 15 | WORKDIR /app 16 | COPY Pipfile . 17 | COPY Pipfile.lock . 18 | RUN pipenv install --deploy 19 | 20 | COPY onnx2code onnx2code 21 | 22 | ENTRYPOINT ["pipenv", "run", "python", "-m", "onnx2code", "input.onnx", "output"] 23 | -------------------------------------------------------------------------------- /tests/ops/test_elementwise.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "activation", 9 | [ 10 | tf.keras.layers.Activation("relu"), 11 | tf.keras.layers.Activation("tanh"), 12 | tf.keras.layers.Activation("sigmoid"), 13 | ], 14 | ids=lambda x: str(x.activation.__name__), 15 | ) 16 | def test_activations(activation: tf.keras.layers.Activation) -> None: 17 | input = tf.keras.Input(shape=(4, 5, 6)) 18 | output = activation(input) 19 | model = tf.keras.Model(inputs=[input], outputs=[output]) 20 | check_keras(model) 21 | -------------------------------------------------------------------------------- /tests/ops/test_concat.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "shapes", 9 | [ 10 | [[1, 2, 3, 4], [1, 2, 3, 4]], 11 | [[2, 2, 5, 1], [2, 1, 5, 1]], 12 | [[2, 2, 5, 1], [2, 1, 5, 1], [2, 3, 5, 1]], 13 | ], 14 | ) 15 | @pytest.mark.parametrize("axis", [0, 1, 2, 3]) 16 | @pytest.mark.parametrize("variation", ["c"]) 17 | def test_concat(shapes: list[list[int]], axis: int, variation: str) -> None: 18 | inputs = [tf.keras.Input(shape) for shape in shapes] 19 | 20 | try: 21 | out = tf.keras.layers.Concatenate(axis=1 + axis)(inputs) # +1 for batch dim 22 | model = tf.keras.Model(inputs=inputs, outputs=[out]) 23 | except Exception: 24 | pytest.skip("incompatible configuration") 25 | 26 | check_keras(model, [variation]) 27 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/microkernel_ref.cpp: 
-------------------------------------------------------------------------------- 1 | template< 2 | int mr, 3 | int nr, 4 | int kc, 5 | 6 | int CStrideRow 7 | > 8 | inline void ref_microkernel( 9 | const float* __restrict__ A_kernel, // (mr x kc) column major 10 | const float* __restrict__ B_kernel, // (kc x nr) row major 11 | float* __restrict__ C 12 | ) { 13 | float AB[mr * nr]; 14 | memset(AB, 0, mr * nr * sizeof(float)); 15 | 16 | for (int k = 0; k < kc; k++) { 17 | for (int n = 0; n < nr; n++) { 18 | for (int m = 0; m < mr; m++) { 19 | AB[n * mr + m] += 20 | A_kernel[k * mr + m] * 21 | B_kernel[k * nr + n]; 22 | } 23 | } 24 | } 25 | 26 | for (int j = 0; j < nr; j++) { 27 | for (int i = 0; i < mr; i++) { 28 | C[i * CStrideRow + j] += AB[mr * j + i]; 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /evaluation/setup.py: -------------------------------------------------------------------------------- 1 | # Makes sure the libraries are using only 1 CPU thread 2 | # and are optimized for inference. 3 | 4 | import os 5 | import sys 6 | 7 | # Silence TF 8 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 9 | # Do not use GPU 10 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 11 | 12 | # Make onnxruntime only use 1 CPU thread 13 | os.environ["OMP_NUM_THREADS"] = "1" 14 | os.environ["MKL_NUM_THREADS"] = "1" 15 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 16 | 17 | import tensorflow as tf 18 | 19 | # Make tensorflow only use 1 CPU thread 20 | tf.config.threading.set_inter_op_parallelism_threads(1) 21 | tf.config.threading.set_intra_op_parallelism_threads(1) 22 | 23 | # We don't need to disable eager execution, because we are using tf.function (I hope) 24 | # tf.compat.v1.disable_v2_behavior() 25 | # tf.compat.v1.disable_eager_execution() 26 | # tf.config.run_functions_eagerly(False) # this line does not work 🤡 27 | 28 | sys.path.append("../") 29 | -------------------------------------------------------------------------------- /tests/zoo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import urllib.request 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | 8 | def download_from_zoo(path: str, expected_size: int | None = None) -> Path: 9 | """ 10 | Download files from the repo https://github.com/onnx/models 11 | """ 12 | target = Path(os.path.dirname(__file__)) / "../data" / path 13 | 14 | if target.is_file(): 15 | # file already downloaded 16 | 17 | # check if the size is the expected one 18 | if expected_size is None or target.stat().st_size == expected_size: 19 | return target 20 | 21 | target.parent.mkdir(parents=True, exist_ok=True) 22 | 23 | urllib.request.urlretrieve( 24 | f"https://github.com/onnx/models/raw/main/{path}", target 25 | ) 26 | 27 | return target 28 | 29 | 30 | def zoo_manifest() -> Any: 31 | return json.loads(download_from_zoo("ONNX_HUB_MANIFEST.json").read_text()) 32 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | env TF_CPP_MIN_LOG_LEVEL=3 CUDA_VISIBLE_DEVICES=-1 pytest --durations=10 3 | 4 | mnist: 5 | env TF_CPP_MIN_LOG_LEVEL=3 CUDA_VISIBLE_DEVICES=-1 pytest -k mnist 6 | 7 | lint: 8 | flake8 . --count --statistics 9 | 10 | format: 11 | isort . --skip=data && \ 12 | black --verbose . 
--exclude=data 13 | 14 | precommit: lint format test 15 | 16 | debug: 17 | python -m onnx2code model.onnx output --variations=loop-tiling --checks=1 ; \ 18 | nasm -f elf64 output/model.asm -o output/model-asm.o -g && \ 19 | g++ output/model.cpp output/debugger.cpp output/model-asm.o -o output/main -g && \ 20 | gdb output/main output/model-asm.o -ex "b unit_update" -ex "r" 21 | 22 | profile: 23 | python -m onnx2code data/model.onnx output --variations=loop-tiling --checks=1; \ 24 | nasm -f elf64 output/model.asm -o output/model-asm.o -g && \ 25 | g++ -Ioutput/ output/model.cpp onnx2code/debugger.c output/model-asm.o -o output/main \ 26 | -g -O3 -march=native -mtune=native 27 | -------------------------------------------------------------------------------- /tests/ops/test_transpose.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [[2, 3], [3, 4, 5]]) 8 | def test_transpose_default(shape: list[int]) -> None: 9 | input = tf.keras.Input(shape=shape) 10 | out = tf.keras.backend.transpose(input) 11 | model = tf.keras.Model(inputs=[input], outputs=[out]) 12 | check_keras(model) 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "perm", 17 | [ 18 | # ([1,2,3]), # this gets optimized with Identity 19 | ([2, 3, 1]), 20 | ([3, 2, 1]), 21 | ([3, 1, 2]), 22 | ([1, 3, 2]), 23 | ([2, 1, 3]), 24 | ], 25 | ids=lambda x: ",".join(map(str, x)), 26 | ) 27 | def test_transpose_perm(perm: list[int]) -> None: 28 | input = tf.keras.Input(shape=[3, 4, 5]) 29 | out = tf.keras.layers.Permute(perm)(input) 30 | model = tf.keras.Model(inputs=[input], outputs=[out]) 31 | check_keras(model) 32 | -------------------------------------------------------------------------------- /evaluation/evaluate.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import keras 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | from keras import layers 7 | from measure import measure_all 8 | 9 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 10 | 11 | # Custom MNIST-like model 12 | input = tf.keras.Input([4096 * 64]) 13 | out = tf.keras.layers.Lambda(lambda x: x)(input) 14 | 15 | input_shape = (512, 512) 16 | 17 | M = 4 18 | K = 16 19 | N = 64009 20 | 21 | 22 | model = keras.Sequential( 23 | [ 24 | keras.Input(shape=(M, K)), 25 | layers.Dense(N, activation="relu"), 26 | ] 27 | ) 28 | 29 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=128, mr=4, nr=8, mv=4, nu=4)) 30 | 31 | # Measure models 32 | data = measure_all(model, variations=["loop-tiling", "gemm-naive"]) 33 | 34 | # Plot results 35 | plt.boxplot(data.values(), labels=data.keys()) 36 | plt.ylabel("Time (ms)") 37 | plt.title("Inference time of Identity model") 38 | 39 | plt.show() 40 | -------------------------------------------------------------------------------- /evaluation/results_conv/analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(ggplot2) 8 | library(extrafont) 9 | 10 | theme_set(theme(text=element_text(family="LM Roman 10"))) 11 | ``` 12 | 13 | ```{r} 14 | datos <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results_conv/6th.csv") 15 | #datos$runtime <- factor(datos$runtime, labels=c("naïve", "libxsmm", "onnx2code", "onnxruntime", 
"tensorflow")) 16 | 17 | datosCON <- datos[datos$runtime == "onnx2code",] 18 | datosSIN <- datos[datos$runtime != "onnx2code",] 19 | ``` 20 | 21 | 22 | ```{r} 23 | ggplot(NULL, aes(x=MNK,y=time_mean, colour=runtime)) + 24 | geom_line(data=datosSIN, size=0.4) + 25 | geom_line(data=datosCON, size=1) + 26 | geom_point(data=datosSIN, size=1) + 27 | geom_point(data=datosCON, size=1.3) + 28 | #geom_errorbar(data=datosSIN, aes(ymin=time_mean-time_std, ymax=time_mean+time_std)) + 29 | xlab("M=K=N") + 30 | ylab("Tiempo (ms, log scale)") + 31 | scale_y_log10() + labs(color='Runtime') + 32 | scale_x_continuous(breaks = pretty(datos$MNK, n = 15)) 33 | ggsave("conv.pdf", width = 8, height = 4, device=cairo_pdf) 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /evaluation/results_gemm/analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(ggplot2) 8 | library(extrafont) 9 | 10 | theme_set(theme(text=element_text(family="LM Roman 10"))) 11 | ``` 12 | 13 | ```{r} 14 | datos <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results_gemm/10th.csv") 15 | datos$runtime <- factor(datos$runtime, labels=c("naïve", "libxsmm", "onnx2code", "onnxruntime", "tensorflow")) 16 | 17 | datosCON <- datos[datos$runtime == "onnx2code",] 18 | datosSIN <- datos[datos$runtime != "onnx2code",] 19 | ``` 20 | 21 | 22 | ```{r} 23 | ggplot(NULL, aes(x=MNK,y=time_mean, colour=runtime)) + 24 | geom_line(data=datosSIN, size=0.4) + 25 | geom_line(data=datosCON, size=1) + 26 | geom_point(data=datosSIN, size=1) + 27 | geom_point(data=datosCON, size=1.3) + 28 | #geom_errorbar(data=datosSIN, aes(ymin=time_mean-time_std, ymax=time_mean+time_std)) + 29 | xlab("M=K=N") + 30 | ylab("Tiempo (ms, log scale)") + 31 | scale_y_log10() + labs(color='Runtime') + 32 | scale_x_continuous(breaks = pretty(datos$MNK, n = 15)) 33 | ggsave("gemm.pdf", width = 8, height = 4, device=cairo_pdf) 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.exclude": { 3 | ".git/": true, 4 | "**/__pycache__/": true, 5 | "**/.mypy_cache/": true, 6 | "**/.pytest_cache/": true, 7 | "**/.ipynb_checkpoints/": true 8 | }, 9 | "editor.tabSize": 4, 10 | "editor.rulers": [{ "column": 88, "color": "#575a57" }], 11 | "python.formatting.provider": "black", 12 | "python.linting.flake8Enabled": true, 13 | "python.linting.mypyEnabled": true, 14 | "python.linting.mypyArgs": [ 15 | "--follow-imports=silent", 16 | "--ignore-missing-imports", 17 | "--show-column-numbers", 18 | "--no-pretty", 19 | "--strict" 20 | ], 21 | "python.linting.enabled": true, 22 | "python.testing.pytestArgs": ["tests"], 23 | "python.testing.unittestEnabled": false, 24 | "python.testing.pytestEnabled": true, 25 | "python.linting.flake8Args": [ 26 | "--config=.flake8" 27 | ], 28 | "files.associations": { 29 | "*.desktop": "ini", 30 | "*.dbus": "ini", 31 | "*.systemd": "ini", 32 | ".env": "properties", 33 | "*.tcc": "c", 34 | "chrono": "cpp", 35 | "random": "cpp", 36 | "limits": "cpp", 37 | "valarray": "cpp", 38 | "algorithm": "cpp" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /onnx2code/debugger.c: -------------------------------------------------------------------------------- 1 | #include 2 | 
#include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "model.h" 9 | 10 | void* read_file(const char* filename, long* _size) { 11 | FILE* fp = fopen(filename, "rb"); 12 | assert(fp != NULL); 13 | 14 | fseek(fp, 0, SEEK_END); 15 | long size = ftell(fp); 16 | fseek(fp, 0, SEEK_SET); 17 | 18 | void* buffer = malloc(size); 19 | 20 | fread(buffer, sizeof(char), size, fp); 21 | fclose(fp); 22 | 23 | if (_size) 24 | *_size = size; 25 | 26 | return buffer; 27 | } 28 | 29 | int main(int argc, char** argv) { 30 | long outputs_size; 31 | 32 | const float* inputs = (const float*)read_file("./sample_inputs.bin", NULL); 33 | const float* weights = (const float*)read_file("./weights.bin", NULL); 34 | const float* truth_outputs = (const float*)read_file("./sample_outputs.bin", &outputs_size); 35 | float* outputs = (float*)malloc(outputs_size); 36 | 37 | float total = 0; 38 | for (int i = 0; i < 10000; i++) { 39 | inference(weights, inputs, outputs); 40 | 41 | total += outputs[0]; 42 | } 43 | 44 | printf("total: %f\n", total); 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /evaluation/measure_gemm.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import keras 4 | import numpy as np 5 | import pandas as pd 6 | from keras import layers 7 | from measure import measure_all 8 | from tqdm import tqdm 9 | 10 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 11 | 12 | # should be set to the best 13 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=64, mr=4, nr=32, mv=2, nu=4)) 14 | 15 | SIZES = 2 ** np.arange(8, 10) 16 | VARIATIONS = ["gemm-naive", "loop-tiling", "libxsmm"] 17 | 18 | 19 | results = pd.DataFrame(columns=["MNK", "runtime", "time_mean", "time_std"]) 20 | 21 | for x in tqdm(range(256, 1280, 32)): 22 | model = keras.Sequential( 23 | [ 24 | keras.Input(shape=(x, x)), 25 | layers.Dense(x, activation=None, use_bias=False), 26 | ] 27 | ) 28 | 29 | result = measure_all(model, variations=VARIATIONS, runs=300, tqdm_leave=False) 30 | 31 | for var, times in result.items(): 32 | entry = { 33 | "MNK": x, 34 | "runtime": var, 35 | "time_mean": np.mean(times), 36 | "time_std": np.std(times), 37 | } 38 | results = pd.concat( 39 | [ 40 | results, 41 | pd.DataFrame.from_records([entry]), 42 | ] 43 | ) 44 | results.to_csv("results_gemm.csv", index=False) 45 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/GEMM.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import math 3 | from pathlib import Path 4 | 5 | 6 | @dataclass 7 | class LoopTilingParams: 8 | nc: int # Columnas de panel de B 9 | kc: int # Filas de panel de B 10 | mc: int # Filas de bloque de A 11 | mr: int # Filas de microkernel 12 | nr: int # Columnas de microkernel 13 | mv: int # Filas de unit-update 14 | nu: int # Columnas de unit-update 15 | 16 | 17 | tiling_params = LoopTilingParams( 18 | nc=4096, 19 | kc=256, 20 | mc=256, 21 | mr=4, 22 | nr=8, 23 | mv=4, 24 | nu=4, 25 | ) 26 | 27 | 28 | def set_tiling_params(params: LoopTilingParams) -> None: 29 | global tiling_params 30 | tiling_params = params 31 | 32 | 33 | external_paths_GEMM = ( 34 | Path(__file__).parent / "gpackA.cpp", 35 | Path(__file__).parent / "gpackB.cpp", 36 | Path(__file__).parent / "microkernel_ref.cpp", 37 | Path(__file__).parent / 
"microkernel_test.cpp", 38 | Path(__file__).parent / "gemm.cpp", 39 | ) 40 | 41 | 42 | def call_GEMM(M: int, K: int, N: int, params: str) -> str: 43 | nc = min(2 ** math.ceil(math.log2(N)), tiling_params.nc) 44 | kc = tiling_params.kc 45 | mc = tiling_params.mc 46 | mr = tiling_params.mr 47 | nr = tiling_params.nr 48 | 49 | mv = tiling_params.mv 50 | nu = tiling_params.nu 51 | 52 | return f"gemm<{M},{K},{N},{nc},{kc},{mc},{mr},{nr},{mv},{nu}>({params});" 53 | -------------------------------------------------------------------------------- /tests/ops/test_conv.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [(4, 3, 1), (3, 4, 3), (10, 10, 5), (32, 32, 3)]) 8 | @pytest.mark.parametrize("kernel_size", [1, 2, 3]) 9 | @pytest.mark.parametrize("filters", [1, 2, 3, 10]) 10 | @pytest.mark.parametrize("padding", ["valid", "same"]) 11 | @pytest.mark.parametrize( 12 | "stride_and_dilation", 13 | [ 14 | (1, 1), 15 | (2, 1), 16 | # TODO: dilation? 17 | ], 18 | ids=lambda x: f"s{x[0]}d{x[1]}", 19 | ) 20 | @pytest.mark.parametrize("use_bias", [False, True], ids=["no_bias", "bias"]) 21 | def test_conv( 22 | shape: list[int], 23 | kernel_size: int, 24 | filters: int, 25 | padding: str, 26 | stride_and_dilation: tuple[int, int], 27 | use_bias: bool, 28 | ) -> None: 29 | try: 30 | input = tf.keras.Input(shape=shape) 31 | output = tf.keras.layers.Conv2D( 32 | filters=filters, 33 | padding=padding, 34 | kernel_size=kernel_size, 35 | strides=stride_and_dilation[0], 36 | dilation_rate=stride_and_dilation[1], 37 | use_bias=use_bias, 38 | bias_initializer="random_normal", 39 | )(input) 40 | model = tf.keras.Model(inputs=[input], outputs=[output]) 41 | except Exception: 42 | pytest.skip("incompatible configuration") 43 | 44 | check_keras(model) 45 | -------------------------------------------------------------------------------- /tests/ops/test_pooling.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "shape", 9 | [ 10 | # 1D 11 | *[shape for shape in [[1, 1], [5, 1], [10, 2], [16, 3]]], 12 | # 2D 13 | *[shape for shape in [[1, 1, 1], [5, 5, 1], [10, 8, 3], [16, 8, 8]]], 14 | ], 15 | ) 16 | @pytest.mark.parametrize("pool_size", [1, 2, 3]) 17 | @pytest.mark.parametrize("strides", [1, 2, 3]) 18 | @pytest.mark.parametrize("padding", ["valid", "same"]) 19 | @pytest.mark.parametrize("op", ["max", "average"]) 20 | def test_maxpool( 21 | shape: list[int], 22 | pool_size: int, 23 | strides: int, 24 | padding: str, 25 | op: str, 26 | ) -> None: 27 | impl = { 28 | "max": { 29 | 2: tf.keras.layers.MaxPooling1D, 30 | 3: tf.keras.layers.MaxPooling2D, 31 | }, 32 | "average": { 33 | 2: tf.keras.layers.AveragePooling1D, 34 | 3: tf.keras.layers.AveragePooling2D, 35 | }, 36 | }[op][len(shape)] 37 | input = tf.keras.Input(shape) 38 | try: 39 | pool = impl( 40 | pool_size=pool_size, 41 | strides=strides, 42 | padding=padding, 43 | )(input) 44 | except ValueError as e: 45 | pytest.skip("incompatible configuration: " + str(e)) 46 | 47 | model = tf.keras.Model(inputs=[input], outputs=[pool]) 48 | check_keras(model) 49 | -------------------------------------------------------------------------------- /evaluation/measure_conv.py: -------------------------------------------------------------------------------- 1 | 
import setup # noqa # isort:skip 2 | 3 | import keras 4 | import numpy as np 5 | import pandas as pd 6 | from keras import layers 7 | from measure import measure_all 8 | from tqdm import tqdm 9 | 10 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 11 | 12 | # should be set to the best 13 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=64, mr=4, nr=32, mv=2, nu=4)) 14 | 15 | SIZES = 2 ** np.arange(8, 10) 16 | VARIATIONS = ["conv-naive", "im2col"] 17 | 18 | 19 | results = pd.DataFrame(columns=["MNK", "runtime", "time_mean", "time_std"]) 20 | 21 | for x in tqdm(range(256, 1280, 32)): 22 | model = keras.Sequential( 23 | [ 24 | keras.Input(shape=(x, x, 1)), 25 | layers.Conv2D( 26 | filters=4, 27 | padding="valid", 28 | kernel_size=4, 29 | use_bias=False, 30 | ), 31 | ] 32 | ) 33 | 34 | result = measure_all(model, variations=VARIATIONS, runs=100, tqdm_leave=False) 35 | 36 | for var, times in result.items(): 37 | entry = { 38 | "MNK": x, 39 | "runtime": var, 40 | "time_mean": np.mean(times), 41 | "time_std": np.std(times), 42 | } 43 | results = pd.concat( 44 | [ 45 | results, 46 | pd.DataFrame.from_records([entry]), 47 | ] 48 | ) 49 | results.to_csv("results_conv.csv", index=False) 50 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/microkernel_test.cpp: -------------------------------------------------------------------------------- 1 | template < 2 | int mv, 3 | int nu, 4 | 5 | int CStrideRow, 6 | int CStrideCol> 7 | inline void unit_update( 8 | const float* __restrict__ a, // mv 9 | const float* __restrict__ b, // nu 10 | float* __restrict__ C // mv x nu 11 | ) { 12 | for (int i = 0; i < mv; i++) { 13 | for (int j = 0; j < nu; j++) { 14 | C[i * CStrideRow + j * CStrideCol] += a[i] * b[j]; 15 | } 16 | } 17 | } 18 | 19 | template < 20 | int mr, 21 | int nr, 22 | int mv, 23 | int nu> 24 | inline void test_microkernel( 25 | int kc, 26 | const float* __restrict__ A_kernel, // (mr x kc) column major 27 | const float* __restrict__ B_kernel, // (kc x nr) row major 28 | float* __restrict__ AB // (mr x nr) 29 | ) { 30 | static_assert(mr % mv == 0, "must be conforming"); 31 | static_assert(nr % nu == 0, "must be conforming"); 32 | 33 | for (int k = 0; k < kc; k++) { 34 | // single outer product 35 | // en una columna de A y una fila de B (del zigzag) 36 | 37 | // loop tiling 38 | for (int j = 0; j < nr; j += nu) { 39 | for (int i = 0; i < mr; i += mv) { 40 | // unit update (small outer product) 41 | unit_update( 42 | A_kernel + i, 43 | B_kernel + j, 44 | AB + i * nr + j); 45 | } 46 | } 47 | 48 | // advance one column of A and one row of B 49 | A_kernel += mr; 50 | B_kernel += nr; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /tests/ops/test_broadcastable.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | shapes = [ 7 | # same shape 8 | *[(s, s) for s in [[1], [2, 3], [4, 5, 6]]], # scalar, 2d, 3d 9 | # broadcasting 10 | # https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md 11 | ([2], [1]), 12 | ([2, 3, 4, 5], [1]), 13 | ([2, 3, 4, 5], [5]), 14 | ([4, 5], [2, 3, 4, 5]), 15 | ([1, 4, 5], [2, 3, 1, 1]), 16 | ([3, 4, 5], [2, 1, 1, 1]), 17 | ([3, 4, 5], [5]), 18 | ([3, 4, 5], [4, 5]), 19 | ([3, 4, 5, 6], [5, 6]), 20 | ([3, 4, 5, 6], [4, 5, 6]), 21 | ([1, 4, 1, 6], [3, 1, 5, 6]), 22 | ([3, 1, 1], [1, 3, 416, 416]), 23 | ] 
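# pairs are ordered (A, B); mixing which operand has fewer dimensions exercises both directions of multidirectional broadcasting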
24 | 25 | 26 | @pytest.mark.parametrize("shapeA,shapeB", shapes) 27 | @pytest.mark.parametrize( 28 | "operation", 29 | [ 30 | tf.keras.layers.Add, 31 | tf.keras.layers.Subtract, 32 | tf.keras.layers.Multiply, 33 | ], 34 | ids=lambda x: str(x.__name__), 35 | ) 36 | def test_basic_ops( 37 | operation: tf.keras.layers.Layer, shapeA: list[int], shapeB: list[int] 38 | ) -> None: 39 | inputA = tf.keras.Input(shapeA) 40 | inputB = tf.keras.Input(shapeB) 41 | result = operation()([inputA, inputB]) 42 | model = tf.keras.Model(inputs=[inputA, inputB], outputs=[result]) 43 | check_keras(model) 44 | 45 | 46 | @pytest.mark.parametrize("shapeA,shapeB", shapes) 47 | def test_div(shapeA: list[int], shapeB: list[int]) -> None: 48 | inputA = tf.keras.Input(shapeA) 49 | inputB = tf.keras.Input(shapeB) 50 | model = tf.keras.Model(inputs=[inputA, inputB], outputs=[inputA / inputB]) 51 | check_keras(model) 52 | -------------------------------------------------------------------------------- /onnx2code/service.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "model.h" 10 | 11 | void* map_shared_memory(const char* name) { 12 | int fd = shm_open(name, O_RDWR | O_CREAT, 0666); 13 | assert(fd != -1); 14 | 15 | // we query the size so we don't have to know it beforehand 16 | struct stat finfo; 17 | fstat(fd, &finfo); 18 | size_t size = finfo.st_size; 19 | assert(size > 0); 20 | 21 | void* shared = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 22 | assert(shared != MAP_FAILED); 23 | 24 | return shared; 25 | } 26 | 27 | void* read_file(const char* filename) { 28 | FILE* fp = fopen(filename, "rb"); 29 | assert(fp != NULL); 30 | 31 | fseek(fp, 0, SEEK_END); 32 | long size = ftell(fp); 33 | fseek(fp, 0, SEEK_SET); 34 | 35 | void* buffer = malloc(size); 36 | 37 | fread(buffer, sizeof(char), size, fp); 38 | fclose(fp); 39 | 40 | return buffer; 41 | } 42 | 43 | int main(int argc, char** argv) { 44 | const float* weights = (const float*)read_file(argv[1]); 45 | float* inputs = (float*)map_shared_memory("/o2c-inputs"); 46 | float* outputs = (float*)map_shared_memory("/o2c-outputs"); 47 | 48 | while (1) { 49 | // wait for data 50 | char signal; 51 | read(STDIN_FILENO, &signal, 1); 52 | 53 | // run inference 54 | inference(weights, inputs, outputs); 55 | 56 | // mark as ready 57 | write(STDOUT_FILENO, &signal, 1); 58 | fsync(STDOUT_FILENO); 59 | } 60 | 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /onnx2code/ops/identity.py: -------------------------------------------------------------------------------- 1 | from .operation import OpCall, Operation, OpImpl 2 | 3 | 4 | class Identity(Operation): 5 | """ 6 | Identity operator 7 | 8 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#identity 9 | """ 10 | 11 | node_types = {"Identity"} 12 | 13 | def parse(self) -> None: 14 | assert len(self.inputs) == 1, "expected one input" 15 | assert len(self.outputs) == 1, "expected one output" 16 | assert ( 17 | self.inputs[0].size == self.outputs[0].size 18 | ), "input and output tensors should have the same size" 19 | 20 | self.size = self.inputs[0].size 21 | 22 | def call(self) -> OpCall: 23 | return OpCall( 24 | sig_name="Identity", 25 | sig_params=[self.size], 26 | inputs=self.inputs, 27 | outputs=self.outputs, 28 | ) 29 | 30 | 31 | @Identity.variant("c", priority=1) 32 | class IdentityC(Identity): 33 | def 
impl(self) -> OpImpl: 34 | source = f""" 35 | for (int i = 0; i < {self.size}; i++) {{ 36 | OUT[i] = A[i]; 37 | }} 38 | """ 39 | 40 | return OpImpl(lang="c", source=source) 41 | 42 | 43 | @Identity.variant("asm", priority=0) 44 | class IdentityASM(Identity): 45 | def impl(self) -> OpImpl: 46 | source = ( 47 | f"mov rax, {self.size}", 48 | ".loop:", 49 | "movss xmm0, [rdi]", 50 | "add rdi, 4", 51 | "movss [rsi], xmm0", 52 | "add rsi, 4", 53 | "dec rax", 54 | "jnz .loop", 55 | "ret", 56 | ) 57 | 58 | return OpImpl(lang="asm", source=source) 59 | -------------------------------------------------------------------------------- /evaluation/measure_models.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import onnx 6 | from measure import measure_all 7 | from tqdm import tqdm 8 | 9 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 10 | 11 | # should be set to the best 12 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=64, mr=4, nr=32, mv=2, nu=4)) 13 | 14 | VARIATIONS = ["conv-naive", "im2col"] 15 | 16 | results = pd.DataFrame(columns=["model", "runtime", "time_mean", "time_std"]) 17 | 18 | models = [ 19 | "../data/vision/classification/mnist/model/mnist-12.onnx", 20 | "../data/vision/super_resolution/sub_pixel_cnn_2016/model/super-resolution-10.onnx", 21 | "../data/vision/classification/squeezenet/model/squeezenet1.1-7.onnx", 22 | "../data/vision/body_analysis/emotion_ferplus/model/emotion-ferplus-8.onnx", 23 | "../data/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx", 24 | "../data/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx", 25 | ] 26 | 27 | for model in tqdm(models): 28 | result = measure_all( 29 | None, 30 | variations=VARIATIONS, 31 | runs=15, 32 | tqdm_leave=False, 33 | onnx_model=onnx.load(model), 34 | ) 35 | 36 | for var, times in result.items(): 37 | entry = { 38 | "model": model, 39 | "runtime": var, 40 | "time_mean": np.mean(times), 41 | "time_std": np.std(times), 42 | } 43 | results = pd.concat( 44 | [ 45 | results, 46 | pd.DataFrame.from_records([entry]), 47 | ] 48 | ) 49 | results.to_csv("results_models.csv", index=False) 50 | -------------------------------------------------------------------------------- /tests/ops/test_gemm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [[1], [2, 2], [2, 3], [4, 5, 6]]) 8 | @pytest.mark.parametrize("units", [1, 2, 10, 100], ids=lambda x: f"{x}_units") 9 | @pytest.mark.parametrize("use_bias", [False, True], ids=["no_bias", "bias"]) 10 | def test_naive(shape: list[int], units: int, use_bias: bool) -> None: 11 | input = tf.keras.Input(shape) 12 | dense = tf.keras.layers.Dense(units, use_bias=use_bias, bias_initializer="uniform")( 13 | input 14 | ) 15 | model = tf.keras.Model(inputs=[input], outputs=[dense]) 16 | check_keras(model, variations=["gemm-naive"]) 17 | 18 | 19 | # @pytest.mark.parametrize("shape", [[1], [2, 2], [2, 3], [4, 5, 6]]) 20 | # @pytest.mark.parametrize("units", [1, 2, 10, 100], ids=lambda x: f"{x}_units") 21 | # @pytest.mark.parametrize("use_bias", [False, True], ids=["no_bias", "bias"]) 22 | # def test_libxsmm(shape: list[int], units: int, use_bias: bool) -> None: 23 | # input = tf.keras.Input(shape) 24 | # dense = 
tf.keras.layers.Dense(units, use_bias=use_bias, bias_initializer="uniform")( 25 | # input 26 | # ) 27 | # model = tf.keras.Model(inputs=[input], outputs=[dense]) 28 | # check_keras(model, variations=["libxsmm"]) 29 | 30 | 31 | @pytest.mark.parametrize("shape", [[64, 64]]) # , [19, 37] 32 | @pytest.mark.parametrize("units", [64], ids=lambda x: f"{x}_units") 33 | def test_tiling(shape: list[int], units: int) -> None: 34 | input = tf.keras.Input(shape) 35 | dense = tf.keras.layers.Dense(units, use_bias=False, bias_initializer="uniform")( 36 | input 37 | ) 38 | model = tf.keras.Model(inputs=[input], outputs=[dense]) 39 | check_keras(model, variations=["loop-tiling"]) 40 | -------------------------------------------------------------------------------- /onnx2code/ops/transpose.py: -------------------------------------------------------------------------------- 1 | from ..util import compute_strides, get_attribute 2 | from .operation import OpCall, Operation, OpImpl 3 | 4 | 5 | class Transpose(Operation): 6 | """ 7 | Transpose operator 8 | 9 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#transpose 10 | """ 11 | 12 | node_types = {"Transpose"} 13 | 14 | def parse(self) -> None: 15 | assert len(self.inputs) == 1, "expected one input" 16 | assert len(self.outputs) == 1, "expected one output" 17 | assert ( 18 | self.inputs[0].size == self.outputs[0].size 19 | ), "input and output tensors should have the same size" 20 | 21 | self.input_strides = compute_strides(self.inputs[0].shape) 22 | self.output_strides = compute_strides(self.outputs[0].shape) 23 | self.perm = get_attribute(self.node, "perm", []) 24 | 25 | def call(self) -> OpCall: 26 | return OpCall( 27 | sig_name="Transpose", 28 | sig_params=[self.inputs[0].shape, self.outputs[0].shape, self.perm], 29 | inputs=self.inputs, 30 | outputs=self.outputs, 31 | ) 32 | 33 | 34 | @Transpose.variant("c") 35 | class TransposeC(Transpose): 36 | def impl(self) -> OpImpl: 37 | output_shape = self.outputs[0].shape 38 | 39 | for_loops = [] 40 | out_index = [] 41 | in_index = [] 42 | 43 | for i in range(len(output_shape)): 44 | for_loops.append( 45 | f"""for (int d{i} = 0; d{i} < {output_shape[i]}; ++d{i})""" 46 | ) 47 | out_index.append(f"d{i}*{self.output_strides[i]}") 48 | in_index.append(f"d{i}*{self.input_strides[self.perm[i]]}") 49 | 50 | source = "\n".join([loop + "{" for loop in for_loops]) 51 | source += ( 52 | "\n\tOUT[" + "+".join(out_index) + "] = A[" + "+".join(in_index) + "];\n" 53 | ) 54 | source += "}" * len(for_loops) 55 | source += "\n" 56 | 57 | return OpImpl(lang="c", source=source) 58 | -------------------------------------------------------------------------------- /onnx2code/ops/concat.py: -------------------------------------------------------------------------------- 1 | from ..util import compute_strides, get_attribute 2 | from .operation import LETTERS, OpCall, Operation, OpImpl 3 | 4 | 5 | class Concat(Operation): 6 | """ 7 | Concat operator 8 | 9 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#concat 10 | """ 11 | 12 | node_types = {"Concat"} 13 | 14 | def parse(self) -> None: 15 | assert len(self.outputs) == 1, "expected one output" 16 | 17 | self.axis = get_attribute(self.node, "axis", None) 18 | 19 | assert self.axis is not None, "axis is not set" 20 | 21 | def call(self) -> OpCall: 22 | return OpCall( 23 | sig_name="Concat", 24 | sig_params=[inp.shape for inp in self.inputs], 25 | inputs=self.inputs, 26 | outputs=self.outputs, 27 | ) 28 | 29 | 30 | @Concat.variant("c") 31 | class ConcatC(Concat): 32 | def 
impl(self) -> OpImpl: 33 | source = "" 34 | 35 | output_strides = compute_strides(self.outputs[0].shape) 36 | 37 | def output_index(axis_offset: int) -> str: 38 | output_index = "" 39 | for i, stride in enumerate(output_strides): 40 | output_index += "+" 41 | if i == self.axis: 42 | output_index += f"({axis_offset}+d{i})" 43 | else: 44 | output_index += f"d{i}" 45 | output_index += f"*{stride}" 46 | return output_index 47 | 48 | axis_offset = 0 49 | for k, input in enumerate(self.inputs): 50 | index = "" 51 | input_strides = compute_strides(input.shape) 52 | 53 | for i, elems in enumerate(input.shape): 54 | source += f"for (int d{i} = 0; d{i} < {elems}; d{i}++) {{\n" 55 | index += f"+ d{i} * {input_strides[i]}" 56 | 57 | source += f"OUT[{output_index(axis_offset)}] = {LETTERS[k]}[{index}];\n" 58 | source += "}\n" * len(input.shape) 59 | 60 | axis_offset += input.shape[self.axis] 61 | 62 | return OpImpl(lang="c", source=source) 63 | -------------------------------------------------------------------------------- /evaluation/eval_tilings.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | from itertools import product 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from measure import measure_all 8 | 9 | from onnx2code.ops.gemm import LoopTilingParams, set_tiling_params 10 | 11 | FLOAT_SIZE = 4 12 | KB = 1024 13 | L1_SIZE = 32 * KB 14 | L2_SIZE = 256 * KB 15 | 16 | # Custom MNIST-like model 17 | input = tf.keras.Input([4096 * 64]) 18 | out = tf.keras.layers.Lambda(lambda x: x)(input) 19 | 20 | input_shape = (512, 512) 21 | 22 | model = tf.keras.Sequential( 23 | [ 24 | tf.keras.Input(shape=input_shape), 25 | tf.keras.layers.Dense(512, activation="relu"), 26 | ] 27 | ) 28 | 29 | # nc, kc, mc, mr, nr 30 | nc_options = [4096] 31 | kc_options = [256] 32 | mc_options = [256] 33 | mr_options = [4] 34 | nr_options = [8] 35 | mv_options = [4] 36 | nu_options = [4] 37 | 38 | for nc, kc, mc, mr, nr, mv, nu in product( 39 | nc_options, kc_options, mc_options, mr_options, nr_options, mv_options, nu_options 40 | ): 41 | set_tiling_params(LoopTilingParams(nc=nc, kc=kc, mc=mc, mr=mr, nr=nr, mv=mv, nu=nu)) 42 | print(f"\n## nc={nc}, kc={kc}, mc={mc}, mr={mr}, nr={nr}\n") 43 | 44 | B_sliver = nr * kc * FLOAT_SIZE 45 | A_sliver = mr * kc * FLOAT_SIZE 46 | AB = mr * nr * FLOAT_SIZE 47 | L1_total = A_sliver + B_sliver + AB 48 | L1_remaining = L1_SIZE - L1_total 49 | print("L1:") 50 | print(f"\t{A_sliver=}") 51 | print(f"\t{B_sliver=}") 52 | print(f"\t{AB=}") 53 | print(f"\t{L1_total=}") 54 | print(f"\t{L1_remaining=}") 55 | 56 | A_panel = mc * kc * FLOAT_SIZE 57 | C_writeback = AB 58 | L2_total = A_panel + B_sliver + C_writeback 59 | L2_remaining = L2_SIZE - L2_total 60 | print("\nL2:") 61 | print(f"\t{A_panel=}") 62 | print(f"\t{B_sliver=}") 63 | print(f"\t{C_writeback=}") 64 | print(f"\t{L2_total=}") 65 | print(f"\t{L2_remaining=}") 66 | 67 | data = measure_all(model, variations=["loop-tiling"], measure_base=False) 68 | 69 | assert len(data) == 1 70 | result = data[next(iter(data.keys()))] 71 | 72 | print(f"result: {np.mean(result):.2f}ms") 73 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/gpackB.cpp: -------------------------------------------------------------------------------- 1 | template 2 | inline void gpackB_panel( 3 | const float* __restrict__ B, 4 | float* __restrict__ B_panel // kc x nr 5 | ) { 6 | for (int r = 0; r < kc; r++) { 7 | // copy row of nr 
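// (B is read with stride StrideCol, so the packed panel is contiguous and the microkernel can stream it linearly)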
8 | for (int c = 0; c < nr; c++) { 9 | B_panel[c] = B[c * StrideCol]; 10 | } 11 | 12 | // advance row 13 | B_panel += nr; 14 | B += StrideRow; 15 | } 16 | } 17 | 18 | template 19 | inline void gpackB( 20 | const float* __restrict__ B, 21 | float* __restrict__ B_panel // kc x nc 22 | ) { 23 | for (int p = 0; p < nc; p += nr) { 24 | gpackB_panel(B, B_panel); 25 | 26 | // advance panel 27 | B_panel += kc * nr; 28 | B += nr * StrideCol; 29 | } 30 | } 31 | 32 | // Edge case 33 | 34 | template 35 | inline void gpackB_panel_edge( 36 | int _kc, 37 | int _nr, 38 | const float* __restrict__ B, 39 | float* __restrict__ B_panel // kc x nr 40 | ) { 41 | for (int r = 0; r < _kc; r++) { 42 | // copy row of _nr 43 | for (int c = 0; c < _nr; c++) { 44 | B_panel[c] = B[c * StrideCol]; 45 | } 46 | 47 | // advance row 48 | B_panel += nr; 49 | B += StrideRow; 50 | } 51 | } 52 | 53 | template 54 | inline void gpackB_edge( 55 | int _kc, 56 | int _nc, 57 | const float* __restrict__ B, 58 | float* __restrict__ B_panel // kc x nc 59 | ) { 60 | const int NP = _nc / nr; 61 | const int NPl = _nc % nr; 62 | 63 | memset(B_panel, 0, kc * nc * sizeof(float)); 64 | 65 | for (int p = 0; p < NP; p++) { 66 | gpackB_panel_edge(_kc, nr, B, B_panel); 67 | 68 | // advance panel 69 | B_panel += kc * nr; 70 | B += nr * StrideCol; 71 | } 72 | 73 | if (NPl != 0) { 74 | gpackB_panel_edge(_kc, NPl, B, B_panel); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /onnx2code/checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import onnx 7 | import onnxruntime 8 | 9 | from .generator import Generator 10 | from .result import ModelResult 11 | from .service import ModelService 12 | 13 | 14 | def check_model_result( 15 | model_proto: onnx.ModelProto, result: ModelResult, n_inputs: int = 1 16 | ) -> None: 17 | """ 18 | Checks if the generated output matches the reference runtime (ONNX Runtime) 19 | 20 | :param n_inputs: random inputs will be generated 21 | """ 22 | ort_sess = onnxruntime.InferenceSession(model_proto.SerializeToString()) 23 | 24 | with ModelService(result) as service: 25 | for _ in range(n_inputs): 26 | 27 | inputs = { 28 | name: np.random.uniform(-1.0, 1.0, shape).astype(np.float32) 29 | for name, shape in result.input_shapes.items() 30 | } 31 | 32 | out1 = service.inference(inputs) 33 | out2 = ort_sess.run(None, inputs) 34 | 35 | assert len(out1) == len(out2) 36 | 37 | output_matches = True 38 | 39 | for o1, o2 in zip(out1, out2): 40 | output_matches = output_matches and np.allclose(o1, o2, atol=1e-5) 41 | 42 | if not output_matches and os.getenv("ONNX2CODE_DEBUG", "0") == "1": 43 | temp_dir = Path(__file__).parent.parent / "tmp/" 44 | inputs_np = np.concatenate([inp.reshape(-1) for inp in inputs.values()]) 45 | outputs_np = np.concatenate([o.reshape(-1) for o in out2]) 46 | inputs_np.tofile(temp_dir / "sample_inputs.bin") 47 | outputs_np.tofile(temp_dir / "sample_outputs.bin") 48 | shutil.copyfile( 49 | Path(__file__).parent / "debugger.c", 50 | temp_dir / "debugger.c", 51 | ) 52 | 53 | if not output_matches: 54 | raise RuntimeError("output mismatch") 55 | 56 | 57 | def check_model( 58 | model_proto: onnx.ModelProto, variations: list[str] = [], n_inputs: int = 1 59 | ) -> None: 60 | """ 61 | Generates code for the given model and checks if the generated output matches the reference runtime (ONNX Runtime) 62 | 63 | :param n_inputs: random inputs 
will be generated 64 | """ 65 | result = Generator(model_proto, variations).generate() 66 | 67 | check_model_result(model_proto, result, n_inputs) 68 | -------------------------------------------------------------------------------- /onnx2code/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from pathlib import Path 4 | 5 | import onnx 6 | from rich import print 7 | 8 | from .checker import check_model_result 9 | from .generator import Generator 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser(prog="onnx2code") 14 | parser.add_argument("input_model", help="input .onnx file") 15 | parser.add_argument("output_folder", help="output folder to write files") 16 | parser.add_argument( 17 | "--variations", 18 | "--vars", 19 | type=str, 20 | help="variation priority", 21 | default="asm, c", 22 | action="store", 23 | ) 24 | parser.add_argument( 25 | "--checks", 26 | type=int, 27 | help="compile and test the model with the provided amount of inputs", 28 | default=0, 29 | action="store", 30 | ) 31 | 32 | args = parser.parse_args() 33 | 34 | try: 35 | model_proto = onnx.load(args.input_model) 36 | except Exception as e: 37 | print("Error loading ONNX model: ", e) 38 | sys.exit(1) 39 | 40 | variations = [v.strip() for v in args.variations.split(",")] 41 | 42 | try: 43 | result = Generator(model_proto, variations).generate() 44 | except Exception as e: 45 | print("Error generating code: ", e) 46 | sys.exit(2) 47 | 48 | print("Input shapes:", result.input_shapes) 49 | print("Output shapes:", result.output_shapes) 50 | print("Weights size (floats):", result.weights.size) 51 | 52 | path = Path(args.output_folder) 53 | print("Writing files to", path.resolve()) 54 | 55 | path.mkdir(parents=True, exist_ok=True) 56 | c_file = path / "model.cpp" 57 | h_file = path / "model.h" 58 | asm_file = path / "model.asm" 59 | weights_file = path / "weights.bin" 60 | result.weights.tofile(weights_file) 61 | 62 | for file, content in [ 63 | (c_file, result.source_c), 64 | (h_file, result.source_h), 65 | (asm_file, result.source_asm), 66 | ]: 67 | with open(file, "w") as f: 68 | f.write(content) 69 | 70 | if args.checks > 0: 71 | print("Checking model with", args.checks, "random inputs") 72 | 73 | try: 74 | check_model_result(model_proto, result, args.checks) 75 | except Exception as e: 76 | print("Error checking model: ", e) 77 | sys.exit(3) 78 | 79 | print("Done") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | style: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: "3.10" 19 | cache: "pipenv" 20 | 21 | - name: Install pipenv 22 | run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python 23 | 24 | - name: Install dependencies 25 | run: pipenv install --dev 26 | 27 | - name: Lint with flake8 28 | run: pipenv run flake8 . --count --statistics 29 | 30 | - name: Check formatting with black 31 | run: pipenv run black --check --verbose . 
32 | 33 | test: 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: Install libxsmm 37 | run: | 38 | git clone https://github.com/libxsmm/libxsmm 39 | cd libxsmm 40 | git checkout 4e1aa5332123088916989651ae9b187ecba377dc 41 | make generator 42 | echo "$(pwd)/bin/libxsmm_gemm_generator" 43 | cd .. 44 | 45 | - uses: actions/checkout@v3 46 | 47 | - name: Install gcc and nasm 48 | run: sudo apt-get install -y gcc nasm 49 | 50 | - name: Set up Python 51 | uses: actions/setup-python@v4 52 | with: 53 | python-version: "3.10" 54 | cache: "pipenv" 55 | 56 | - name: Install pipenv 57 | run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python 58 | 59 | - name: Install dependencies 60 | run: pipenv install --dev 61 | 62 | - name: Run tests 63 | run: | 64 | export PATH=$PATH:$(pwd)/libxsmm/bin 65 | pipenv run make test 66 | 67 | deploy_docker_image: 68 | needs: [style, test] 69 | if: github.ref == 'refs/heads/main' 70 | name: Push Docker image to Docker Hub 71 | runs-on: ubuntu-latest 72 | steps: 73 | - uses: actions/checkout@v3 74 | 75 | - name: Log in to Docker Hub 76 | uses: docker/login-action@v2 77 | with: 78 | username: ${{ secrets.DOCKERHUB_USERNAME }} 79 | password: ${{ secrets.DOCKERHUB_TOKEN }} 80 | 81 | - name: Build and push Docker image 82 | uses: docker/build-push-action@v3 83 | with: 84 | push: true 85 | tags: mlomb/onnx2code:latest 86 | -------------------------------------------------------------------------------- /evaluation/find_best_tiling_params.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from itertools import product 7 | 8 | import tensorflow as tf 9 | from measure import measure_all 10 | 11 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 12 | 13 | N = 512 14 | input_shape = (N, N) 15 | 16 | model = tf.keras.Sequential( 17 | [ 18 | tf.keras.Input(shape=input_shape), 19 | tf.keras.layers.Dense(N, activation=None, use_bias=False), 20 | ] 21 | ) 22 | 23 | # nc, kc, mc, mr, nr 24 | nc_options = [N] 25 | kc_options = [64, 128, 256, 512] 26 | mc_options = [64, 128, 256, 512] 27 | mr_options = [2, 4, 8, 16, 32] 28 | nr_options = [2, 4, 8, 16, 32] 29 | mv_options = [2, 4, 8, 16] 30 | nu_options = [2, 4, 8, 16] 31 | 32 | 33 | params = [ 34 | LoopTilingParams(nc=nc, kc=kc, mc=mc, mr=mr, nr=nr, mv=mv, nu=nu) 35 | for nc, kc, mc, mr, nr, mv, nu in product( 36 | nc_options, 37 | kc_options, 38 | mc_options, 39 | mr_options, 40 | nr_options, 41 | mv_options, 42 | nu_options, 43 | ) 44 | ] 45 | 46 | 47 | def is_valid_configuration(params: LoopTilingParams) -> bool: 48 | # dont blame me 49 | try: 50 | assert params.nr % params.nu == 0 51 | assert params.mr % params.mv == 0 52 | 53 | assert params.nc % params.nr == 0 54 | assert params.mc % params.mr == 0 55 | 56 | assert params.kc <= N 57 | 58 | return True 59 | except AssertionError: 60 | return False 61 | 62 | 63 | params = [p for p in params if is_valid_configuration(p)] 64 | results = pd.DataFrame(columns=["nc", "kc", "mc", "mr", "nr", "mv", "nu", "time"]) 65 | 66 | for p in tqdm(params, desc="Tiling params"): 67 | set_tiling_params(p) 68 | 69 | data = measure_all( 70 | model, 71 | variations=["loop-tiling"], 72 | measure_base=False, 73 | runs=300, 74 | tqdm_leave=False, 75 | ) 76 | 77 | assert len(data) == 1 78 | result = data[next(iter(data.keys()))] 79 | # print(f"result: {np.mean(result):.2f}ms") 80 | 81 | entry = { 82 
| "nc": int(p.nc), 83 | "kc": int(p.kc), 84 | "mc": int(p.mc), 85 | "mr": int(p.mr), 86 | "nr": int(p.nr), 87 | "mv": int(p.mv), 88 | "nu": int(p.nu), 89 | "time": np.mean(result), 90 | } 91 | results = pd.concat( 92 | [ 93 | results, 94 | pd.DataFrame.from_records([entry]), 95 | ] 96 | ) 97 | results.to_csv("results.csv", index=False) 98 | -------------------------------------------------------------------------------- /onnx2code/ops/softmax.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from onnx2code.util import compute_strides, get_attribute 4 | 5 | from .operation import OpCall, Operation, OpImpl 6 | 7 | 8 | class Softmax(Operation): 9 | """ 10 | Softmax operator 11 | 12 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#softmax 13 | """ 14 | 15 | node_types = {"Softmax"} 16 | 17 | def parse(self) -> None: 18 | assert len(self.inputs) == 1, "expected one input" 19 | assert len(self.outputs) == 1, "expected one output" 20 | 21 | self.X = self.inputs[0] 22 | self.Y = self.outputs[0] 23 | 24 | self.strides = compute_strides(self.X.shape) 25 | self.sizes = self.X.shape.copy() 26 | self.axis = get_attribute(self.node, "axis", -1) 27 | if self.axis < 0: 28 | self.axis += len(self.X.shape) 29 | 30 | def call(self) -> OpCall: 31 | return OpCall( 32 | sig_name="Softmax", 33 | sig_params=[], 34 | inputs=self.inputs, 35 | outputs=self.outputs, 36 | ) 37 | 38 | 39 | @Softmax.variant("c") 40 | class SoftmaxC(Softmax): 41 | def impl(self) -> OpImpl: 42 | strides, sizes, axis = self.strides, self.sizes, self.axis 43 | 44 | labels_size = sizes[axis] 45 | labels_stride = strides[axis] 46 | 47 | del sizes[axis] 48 | del strides[axis] 49 | 50 | NL = "\n" 51 | 52 | def iterate(predicate: Callable[[str], str]) -> str: 53 | iterators = [] 54 | offset = f"i * {labels_stride}" 55 | 56 | for i, size in enumerate(sizes): 57 | iterators.append(f"for (int d{i} = 0; d{i} < {size}; ++d{i}) {{") 58 | offset += f" + d{i} * {strides[i]}" 59 | 60 | return f""" 61 | {NL.join(iterators)} 62 | {predicate(offset)} 63 | {NL.join("}" for _ in iterators)} 64 | """ 65 | 66 | source = iterate( 67 | lambda offset: f""" 68 | float max = -INFINITY; 69 | float sum = 0.0f; 70 | 71 | for (int i = 0; i < {labels_size}; ++i) {{ 72 | max = fmax(max, A[{offset}]); 73 | }} 74 | for (int i = 0; i < {labels_size}; ++i) {{ 75 | OUT[{offset}] = exp(A[{offset}] - max); 76 | sum += OUT[{offset}]; 77 | }} 78 | for (int i = 0; i < {labels_size}; ++i) {{ 79 | OUT[{offset}] /= sum; 80 | }} 81 | """ 82 | ) 83 | 84 | return OpImpl(lang="c", source=source) 85 | -------------------------------------------------------------------------------- /onnx2code/ops/elementwise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..util import get_attribute 4 | from .operation import LETTERS, OpCall, Operation, OpImpl 5 | 6 | 7 | class Elementwise(Operation): 8 | """ 9 | Elementwise operators 10 | 11 | For example: ReLU, Tanh, Sigmoid, etc. 
12 | """ 13 | 14 | node_types = {"Relu", "Tanh", "Sigmoid", "Clip", "Sum"} 15 | 16 | def parse(self) -> None: 17 | assert len(self.outputs) == 1, "expected one output" 18 | 19 | for input in self.inputs: 20 | assert ( 21 | input.size == self.outputs[0].size 22 | ), "input and output tensors should have the same size" 23 | 24 | if self.node.op_type == "Clip": 25 | # Clip may have min and max as inputs 26 | # or as attributes (depending on ONNX opset) 27 | break 28 | 29 | self.op: str = self.node.op_type 30 | self.size = self.inputs[0].size 31 | 32 | def call(self) -> OpCall: 33 | return OpCall( 34 | sig_name=self.op, 35 | sig_params=[self.size], 36 | inputs=self.inputs, 37 | outputs=self.outputs, 38 | ) 39 | 40 | 41 | @Elementwise.variant("c") 42 | class ElementwiseC(Elementwise): 43 | def impl(self) -> OpImpl: 44 | impl: str 45 | match self.op: 46 | case "Sum": 47 | impl = "+".join([f"{LETTERS[i]}[i]" for i in range(len(self.inputs))]) 48 | case "Relu": 49 | impl = "A[i] > 0 ? A[i] : 0" 50 | case "Tanh": 51 | impl = "tanh(A[i])" 52 | case "Sigmoid": 53 | impl = "1.0f / (1.0f + exp(-A[i]))" 54 | case "Clip": 55 | if len(self.inputs) == 3: 56 | min_data = self.inputs[1].data 57 | max_data = self.inputs[2].data 58 | 59 | if min_data is None or max_data is None: 60 | raise ValueError("Clip: min and max should be constants") 61 | 62 | # "cast" the 0-dimensional arrays to numbers 63 | min = min_data + 0 64 | max = max_data + 0 65 | else: 66 | finfo = np.finfo(dtype=np.float32) 67 | min = get_attribute(self.node, "min", finfo.min) 68 | max = get_attribute(self.node, "max", finfo.max) 69 | 70 | impl = "A[i] < {} ? {} : (A[i] > {} ? {} : A[i])".format( 71 | min, min, max, max 72 | ) 73 | case _: 74 | raise NotImplementedError(f"ElementwiseC: {self.op}") 75 | 76 | source = f""" 77 | for(int i = 0; i < {self.size}; i++) {{ 78 | OUT[i] = {impl}; 79 | }} 80 | """ 81 | 82 | return OpImpl(lang="c", source=source) 83 | -------------------------------------------------------------------------------- /evaluation/results/analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | options(warn=-1) 8 | library(ggplot2) 9 | library(extrafont) 10 | 11 | theme_set(theme(text=element_text(family="LM Roman 10"))) 12 | ``` 13 | 14 | ```{r} 15 | datos_6th_skylake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/6th-skylake.csv") 16 | datos_6th_skylake$gen <- rep("Skylake (6th)", nrow(datos_6th_skylake)) 17 | 18 | datos_10th_comet_lake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/10th-comet-lake.csv") 19 | datos_10th_comet_lake$gen <- rep("Comet Lake (10th)", nrow(datos_10th_comet_lake)) 20 | 21 | datos_10th_ice_lake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/10th-ice-lake.csv") 22 | datos_10th_ice_lake$gen <- rep("Ice Lake (10th)", nrow(datos_10th_ice_lake)) 23 | 24 | datos_11th_tiger_lake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/11th-tiger-lake.csv") 25 | datos_11th_tiger_lake$gen <- rep("Tiger Lake (11th)", nrow(datos_11th_tiger_lake)) 26 | 27 | topN = 500 # nrow(datos_6th_skylake) 28 | 29 | datos <- rbind( 30 | datos_6th_skylake[order(datos_6th_skylake$time, decreasing=F),][1:topN,], 31 | datos_10th_comet_lake[order(datos_10th_comet_lake$time, decreasing=F),][1:topN,], 32 | datos_10th_ice_lake[order(datos_10th_ice_lake$time, decreasing=F),][1:topN,], 33 | datos_11th_tiger_lake[order(datos_11th_tiger_lake$time, 
decreasing=F),][1:topN,] 34 | ) 35 | datos$gen <- factor(datos$gen, levels=c("Skylake (6th)", "Comet Lake (10th)", "Ice Lake (10th)", "Tiger Lake (11th)")) 36 | datos$mrnr <- datos$mr * datos$nr 37 | datos$l1 <- datos$kc * datos$nr * 4 38 | datos$l2 <- datos$kc * datos$mc * 4 39 | datos$l3 <- datos$nc * datos$kc * 4 40 | 41 | filtrados_mrnr <- datos[ 42 | ((datos$gen == "Skylake (6th)" | datos$gen == "Comet Lake (10th)") & datos$mrnr == 64) | 43 | ((datos$gen == "Ice Lake (10th)" | datos$gen == "Tiger Lake (11th)") & datos$mrnr == 128) 44 | ,] 45 | ``` 46 | 47 | 48 | ```{r} 49 | ggplot(datos, aes(x=as.factor((mr*nr)),y=time))+ 50 | geom_boxplot() + 51 | ylab("Tiempo (ms)") + 52 | xlab(expression("m"[r] * " × n"[r])) + 53 | facet_grid(~gen) 54 | ggsave("mr_x_nr.pdf", width = 8, height = 4, device=cairo_pdf) 55 | ``` 56 | 57 | 58 | ```{r} 59 | ggplot(filtrados_mrnr, aes(x=factor(l1, labels=c("2KB","4KB","8KB","16KB","32KB","64KB")),y=time))+ 60 | geom_boxplot() + 61 | ylab("Tiempo (ms)") + 62 | xlab(expression("k"[c] * " × n"[r] * " × 4")) + 63 | facet_grid(~gen) 64 | ggsave("l1.pdf", width = 9, height = 4, device=cairo_pdf) 65 | ``` 66 | 67 | ```{r} 68 | ggplot(filtrados_mrnr, aes(x=factor(l2, labels=c("16KB","32KB","64KB","128KB","256KB","512KB","1MB")),y=time))+ 69 | geom_boxplot() + 70 | ylab("Tiempo (ms)") + 71 | xlab(expression("k"[c] * " × m"[c] * " × 4")) + 72 | facet_grid(~gen) 73 | ggsave("l2.pdf", width = 12, height = 4, device=cairo_pdf) 74 | ``` 75 | 76 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/gpackA.cpp: -------------------------------------------------------------------------------- 1 | // Referencias: 2 | // - [Automating the Last-Mile for High Performance Dense Linear Algebra] Figure 1 ~Ai (verde) 3 | // - https://github.com/michael-lehn/ulmBLAS/blob/191efa54ddb595760353a1ca557a886fa74a864a/ulmblas/level3/pack/gepack.tcc 4 | 5 | // for (1..mp) bloques de mr filas (dentro de las mc filas del panel) 6 | // for (1..kc) columnas del bloque de A (kc) 7 | // for (1..mr) filas de mr (dentro de las mc filas) 8 | 9 | // La matriz A se lee en forma ROW MAJOR (onnx) 10 | // El panel de A se puede pensar como un tensor de 3 dimensiones: (mp, kc, mr) 11 | 12 | // Target (los numeros son los indices originales de A que queremos en el packeado) 13 | // ----------------- -| -| 14 | // | 0 | 2 | 4 | 6 | | | 15 | // | 1 | 3 | 5 | 7 | | mr | 16 | // ----------------- -| | mc 17 | // | 8 | 10| 12| 14| | mr | 18 | // | 9 | 11| 13| 15| | | 19 | // ----------------- -| -| 20 | // |---------------| 21 | // kc 22 | // mp = 2 23 | 24 | template 25 | inline void gpackA_panel( 26 | const float* __restrict__ A, 27 | float* __restrict__ A_panel // mr x kc 28 | ) { 29 | for (int c = 0; c < kc; c++) { 30 | // copy column of mr 31 | for (int r = 0; r < mr; r++) { 32 | A_panel[r] = A[r * StrideRow]; 33 | } 34 | 35 | // advance column 36 | A_panel += mr; 37 | A += StrideCol; 38 | } 39 | } 40 | 41 | template 42 | inline void gpackA( 43 | const float* __restrict__ A, 44 | float* __restrict__ A_panel // mc x kc 45 | ) { 46 | for (int p = 0; p < mc; p += mr) { 47 | gpackA_panel(A, A_panel); 48 | 49 | // advance panel 50 | A_panel += mr * kc; 51 | A += mr * StrideRow; 52 | } 53 | } 54 | 55 | // Edge case 56 | 57 | template 58 | inline void gpackA_panel_edge( 59 | int _kc, 60 | int _mr, 61 | const float* __restrict__ A, 62 | float* __restrict__ A_panel // mr x kc 63 | ) { 64 | for (int c = 0; c < _kc; c++) { 65 | // copy column of mr 66 | for (int 
r = 0; r < _mr; r++) { 67 | A_panel[r] = A[r * StrideRow]; 68 | } 69 | 70 | // advance column 71 | A_panel += mr; 72 | A += StrideCol; 73 | } 74 | } 75 | template 76 | inline void gpackA_edge( 77 | int _kc, 78 | int _mc, 79 | const float* __restrict__ A, 80 | float* __restrict__ A_panel // mc x kc 81 | ) { 82 | const int MP = _mc / mr; 83 | const int MPl = _mc % mr; 84 | 85 | memset(A_panel, 0, mc * kc * sizeof(float)); 86 | 87 | for (int p = 0; p < MP; p++) { 88 | gpackA_panel_edge(_kc, mr, A, A_panel); 89 | 90 | // advance panel 91 | A_panel += mr * kc; 92 | A += mr * StrideRow; 93 | } 94 | 95 | if (MPl != 0) { 96 | gpackA_panel_edge(_kc, MPl, A, A_panel); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /onnx2code/ops/broadcastable.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | 5 | from .operation import OpCall, Operation, OpImpl 6 | 7 | 8 | class Broadcastable(Operation): 9 | """ 10 | Broadcastable operators like Add, Sub, etc. 11 | 12 | https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md 13 | """ 14 | 15 | node_types = {"Add", "Div", "Mul", "Sub"} 16 | 17 | def parse(self) -> None: 18 | assert len(self.inputs) == 2, "expected two inputs" 19 | assert len(self.outputs) == 1, "expected one output" 20 | 21 | self.op: str = self.node.op_type 22 | self.b_is_scalar = self.inputs[1].size == 1 23 | self.input_A = self.inputs[0] 24 | self.input_B = self.inputs[1] 25 | 26 | def call(self) -> OpCall: 27 | return OpCall( 28 | sig_name=self.op, 29 | sig_params=[self.input_A.shape, self.input_B.shape], 30 | inputs=self.inputs, 31 | outputs=self.outputs, 32 | ) 33 | 34 | 35 | @Broadcastable.variant("c") 36 | class BroadcastableC(Broadcastable): 37 | def impl(self) -> OpImpl: 38 | source = "" 39 | 40 | symbol = { 41 | "Add": "+", 42 | "Div": "/", 43 | "Mul": "*", 44 | "Sub": "-", 45 | }[self.op] 46 | 47 | if self.b_is_scalar: 48 | source += f""" 49 | const float D = B[0]; 50 | for (int i = 0; i < {self.inputs[0].size}; i++) {{ 51 | OUT[i] = A[i] {symbol} D; 52 | }} 53 | """ 54 | else: 55 | # since we are using the trick below, we can't tell beforehand if 56 | # implementations will differ for every pair of input shapes 57 | # so we add salt so implementations dont collide 58 | source += f"// broadcasting {self.input_A.shape_str()} with {self.input_B.shape_str()}\n" 59 | 60 | # we use nditer to generate the for loops for the broadcastable ops 61 | # it is a bit of a hack, but it works and it hides the complexity of 62 | # broadcasting :) 63 | 64 | a = np.arange(start=0, stop=self.input_A.size).reshape(self.input_A.shape) 65 | b = np.arange(start=0, stop=self.input_B.size).reshape(self.input_B.shape) 66 | offset = 0 67 | 68 | for x, y in np.nditer([a, b], flags=["external_loop"], order="C"): 69 | assert x.size == y.size, "nditer size expected to match" 70 | size = x.size 71 | 72 | # ARBITRARY ASSUMPTIONS I AM MAKING: 73 | def is_consecutive(z: Any) -> Any: 74 | return z[z.size - 1] - z[0] == z.size - 1 75 | 76 | x_is_consecutive = is_consecutive(x) 77 | x_is_all_equal = x[0] == x[x.size - 1] 78 | y_is_consecutive = is_consecutive(y) 79 | y_is_all_equal = y[0] == y[y.size - 1] 80 | assert ( 81 | x_is_all_equal or x_is_consecutive 82 | ), "nditer x expected to be all equal or consecutive" 83 | assert ( 84 | y_is_all_equal or y_is_consecutive 85 | ), "nditer y expected to be all equal or consecutive" 86 | 87 | A_index = f"{x[0]}" + (" + i" if 
x_is_consecutive else "") 88 | B_index = f"{y[0]}" + (" + i" if y_is_consecutive else "") 89 | 90 | source += f"for(int i = 0; i < {size}; i++) OUT[{offset} + i] = A[{A_index}] {symbol} B[{B_index}];\n" 91 | offset += size 92 | 93 | return OpImpl(lang="c", source=source) 94 | -------------------------------------------------------------------------------- /onnx2code/tensor.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from dataclasses import dataclass 3 | from functools import reduce 4 | from typing import Literal 5 | 6 | import numpy as np 7 | import onnx 8 | from numpy.typing import NDArray 9 | 10 | from .util import get_model_inputs, get_shape_from_value_info_proto 11 | 12 | TensorData = NDArray[np.float32] 13 | TensorTag = Literal["input", "output", "weight", "intermediate", "welded"] 14 | 15 | 16 | @dataclass 17 | class TensorInfo: 18 | name: str 19 | tag: TensorTag 20 | shape: list[int] 21 | size: int 22 | data: TensorData | None 23 | variable: str 24 | 25 | def shape_str(self, sep: str = "x") -> str: 26 | return sep.join(map(str, self.shape)) 27 | 28 | @staticmethod 29 | def from_value( 30 | value_info: onnx.ValueInfoProto, 31 | tag: TensorTag, 32 | var_index: int, 33 | model_proto: onnx.ModelProto, 34 | ) -> "TensorInfo": 35 | """ 36 | Parses a ValueInfo and returns the tensor 37 | """ 38 | name = value_info.name 39 | shape = get_shape_from_value_info_proto(value_info) 40 | data: TensorData | None = None 41 | 42 | for node in model_proto.graph.node: 43 | if node.op_type == "Constant" and node.output[0] == name: 44 | data = np.array(node.attribute[0].t.float_data) 45 | tag = "weight" 46 | 47 | return TensorInfo( 48 | name=name, 49 | tag=tag, 50 | shape=shape, 51 | size=reduce(operator.mul, shape, 1), 52 | data=data, 53 | variable=f"T{var_index}", 54 | ) 55 | 56 | @staticmethod 57 | def from_initializer(initializer: onnx.TensorProto, var_index: int) -> "TensorInfo": 58 | """ 59 | Parses a TensorProto and returns the tensor 60 | """ 61 | shape = [dim for dim in initializer.dims] 62 | data = onnx.numpy_helper.to_array(initializer) # type: ignore 63 | assert data is not None, "data should not be None" 64 | assert list(data.shape) == shape, "Tensor shape and data shape should match" 65 | return TensorInfo( 66 | name=initializer.name, 67 | tag="weight", 68 | shape=shape, 69 | size=reduce(operator.mul, shape, 1), 70 | data=data, 71 | variable=f"T{var_index}", 72 | ) 73 | 74 | 75 | def parse_tensors(model_proto: onnx.ModelProto) -> list[TensorInfo]: 76 | """ 77 | Reads ALL tensors and store them in a manegeable format 78 | input, output, intermediate and constant tensors 79 | """ 80 | tensors: list[TensorInfo] = [] 81 | 82 | # input 83 | tensors.extend( 84 | TensorInfo.from_value(vi, "input", i, model_proto) 85 | for i, vi in enumerate(get_model_inputs(model_proto), start=0) 86 | ) 87 | 88 | # output 89 | tensors.extend( 90 | TensorInfo.from_value(vi, "output", i, model_proto) 91 | for i, vi in enumerate(model_proto.graph.output, start=len(tensors)) 92 | ) 93 | 94 | # intermediate 95 | tensors.extend( 96 | TensorInfo.from_value(vi, "intermediate", i, model_proto) 97 | for i, vi in enumerate(model_proto.graph.value_info, start=len(tensors)) 98 | ) 99 | 100 | # constant 101 | tensors.extend( 102 | TensorInfo.from_initializer(initializer, i) 103 | for i, initializer in enumerate( 104 | model_proto.graph.initializer, start=len(tensors) 105 | ) 106 | ) 107 | 108 | return tensors 109 | 
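# Minimal usage sketch (illustrative; "model.onnx" is a hypothetical path):
# parse_tensors lists every input, output, intermediate and weight tensor of a
# loaded model together with its tag, shape and assigned variable name.
if __name__ == "__main__":
    model = onnx.load("model.onnx")
    for t in parse_tensors(model):
        print(f"{t.variable}: {t.name} ({t.tag}) shape={t.shape_str()} size={t.size}")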
-------------------------------------------------------------------------------- /onnx2code/ops/pooling.py: -------------------------------------------------------------------------------- 1 | from onnx2code.util import compute_strides, get_attribute 2 | 3 | from .operation import OpCall, Operation, OpImpl 4 | 5 | 6 | class Pooling(Operation): 7 | """ 8 | MaxPool, AveragePool operators 9 | 10 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#maxpool 11 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#averagepool 12 | """ 13 | 14 | node_types = {"MaxPool", "AveragePool"} 15 | 16 | def parse(self) -> None: 17 | assert len(self.inputs) == 1, "expected one input" 18 | assert len(self.outputs) == 1, "expected one output" 19 | 20 | count_include_pad = get_attribute(self.node, "count_include_pad", 0) 21 | if count_include_pad != 0: 22 | raise NotImplementedError("only support count_include_pad=0") 23 | 24 | self.op: str = self.node.op_type 25 | self.X = self.inputs[0] 26 | self.Y = self.outputs[0] 27 | 28 | self.pads = get_attribute(self.node, "pads", [0] * len(self.X.shape) * 2) 29 | self.strides = get_attribute(self.node, "strides", [1] * len(self.X.shape)) 30 | 31 | kernel_shape = get_attribute(self.node, "kernel_shape", [1] * len(self.X.shape)) 32 | 33 | self.KH = kernel_shape[0] 34 | self.KW = kernel_shape[1] 35 | 36 | def call(self) -> OpCall: 37 | return OpCall( 38 | sig_name=self.op, 39 | sig_params=[self.X.shape, [self.KW, self.KH], self.strides, self.pads], 40 | inputs=self.inputs, 41 | outputs=self.outputs, 42 | ) 43 | 44 | 45 | @Pooling.variant("c") 46 | class PoolingC(Pooling): 47 | def impl(self) -> OpImpl: 48 | KH, KW = self.KH, self.KW 49 | 50 | H = self.X.shape[2] 51 | W = self.X.shape[3] 52 | 53 | pads_start = [self.pads[0], self.pads[1]] 54 | # pads_end = [self.pads[2], self.pads[3]] 55 | 56 | input_strides = compute_strides(self.X.shape) 57 | output_strides = compute_strides(self.Y.shape) 58 | 59 | source = f""" 60 | // start position of kernel 61 | for(int c = 0; c < {self.Y.shape[1]}; c++) {{ 62 | for(int h = 0; h < {self.Y.shape[2]}; h++) {{ 63 | for(int w = 0; w < {self.Y.shape[3]}; w++) {{ 64 | float acc = {'-INFINITY' if self.op == "MaxPool" else "0.0f"}; 65 | int count = 0; 66 | 67 | // position in kernel 68 | for(int hh = 0; hh < {KH}; hh++) {{ 69 | for(int ww = 0; ww < {KW}; ww++) {{ 70 | const int ih = {-pads_start[0]} + (h * {self.strides[0]}) + hh; 71 | const int iw = {-pads_start[1]} + (w * {self.strides[1]}) + ww; 72 | if(ih >= 0 && ih < {H} && iw >= 0 && iw < {W}) {{ 73 | const float val = A[ 74 | c * {input_strides[1]} + 75 | ih * {input_strides[2]} + 76 | iw * {input_strides[3]} 77 | ]; 78 | acc = {'acc > val ? 
acc : val' if self.op == "MaxPool" else 'acc + val'}; 79 | count++; 80 | }} 81 | }} 82 | }} 83 | OUT[ 84 | c * {output_strides[1]} + 85 | h * {output_strides[2]} + 86 | w * {output_strides[3]} 87 | ] = acc{"" if self.op == "MaxPool" else "/(float)count"}; 88 | }} 89 | }} 90 | }} 91 | """ 92 | 93 | return OpImpl(lang="c", source=source) 94 | -------------------------------------------------------------------------------- /evaluation/results_conv/6th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-conv-naive,0.5335040000000001,0.13929143255778512 3 | 256,onnx2code-im2col,0.9039860100000002,0.3674661125795002 4 | 256,tensorflow,4.80589202,0.683442539188438 5 | 256,onnxruntime,0.507659,0.11556475508994946 6 | 288,onnx2code-conv-naive,0.5146580000000001,0.07770419059484503 7 | 288,onnx2code-im2col,1.7917450100000003,1.8639875966218524 8 | 288,tensorflow,6.35994402,2.949189984354053 9 | 288,onnxruntime,0.61280601,0.1191525637822783 10 | 320,onnx2code-conv-naive,0.634851,0.09292494659132175 11 | 320,onnx2code-im2col,1.9863670100000002,0.44807282247829977 12 | 320,tensorflow,7.29074003,1.0714042269534076 13 | 320,onnxruntime,0.7684680000000002,0.13833235693791962 14 | 352,onnx2code-conv-naive,0.807433,0.13366325452793673 15 | 352,onnx2code-im2col,2.51028901,0.38476206666098195 16 | 352,tensorflow,8.64401896,1.1966611925806563 17 | 352,onnxruntime,0.9582860000000001,0.18548641676413935 18 | 384,onnx2code-conv-naive,0.9626979899999999,0.17574935067296807 19 | 384,onnx2code-im2col,3.4803209899999996,0.4532894176918648 20 | 384,tensorflow,11.220825960000004,1.3980038878057952 21 | 384,onnxruntime,1.4973109899999997,0.4829659531221946 22 | 416,onnx2code-conv-naive,1.38213199,0.21410341375291028 23 | 416,onnx2code-im2col,3.6922599899999993,0.4935615735054846 24 | 416,tensorflow,11.50879395,1.3834223381706137 25 | 416,onnxruntime,1.41198999,0.26554929743422384 26 | 448,onnx2code-conv-naive,1.4111139900000003,0.2204240278968922 27 | 448,onnx2code-im2col,4.43189997,0.6188949241091165 28 | 448,tensorflow,13.2603729,1.8157711212087528 29 | 448,onnxruntime,1.73833399,0.3230000907905289 30 | 480,onnx2code-conv-naive,1.5975139900000002,0.24468945613223694 31 | 480,onnx2code-im2col,5.15567996,0.6334665079184837 32 | 480,tensorflow,15.11004689,1.854034459133766 33 | 480,onnxruntime,1.9667179799999996,0.3832841641820591 34 | 512,onnx2code-conv-naive,1.8798239899999998,0.23651916821367755 35 | 512,onnx2code-im2col,6.38169198,0.6658296137854636 36 | 512,tensorflow,17.81744493,4.1299519575648045 37 | 512,onnxruntime,2.5316899900000003,1.1681892520836725 38 | 544,onnx2code-conv-naive,2.0533539899999997,0.2816379399709668 39 | 544,onnx2code-im2col,6.51396297,0.5585122655670234 40 | 544,tensorflow,19.55571191,2.396831020647522 41 | 544,onnxruntime,2.6647249700000004,0.5108897333422635 42 | 576,onnx2code-conv-naive,2.3157639800000003,0.2005958408356953 43 | 576,onnx2code-im2col,7.40451691,0.6117355805461555 44 | 576,tensorflow,23.915504730000002,4.28320132070323 45 | 576,onnxruntime,5.273924940000001,2.711525277159713 46 | 608,onnx2code-conv-naive,2.9795849700000003,0.44997838777004523 47 | 608,onnx2code-im2col,9.95935502,1.315584263856489 48 | 608,tensorflow,26.85310105,3.405717858857798 49 | 608,onnxruntime,3.9157660100000005,0.8325832152179685 50 | 640,onnx2code-conv-naive,3.27329201,0.6007385518494464 51 | 640,onnx2code-im2col,10.205583019999999,0.776620449798806 52 | 640,tensorflow,28.46991395,2.150791806086179 53 | 
640,onnxruntime,3.9682809899999993,0.7603661038780134 54 | 672,onnx2code-conv-naive,3.2548249899999995,0.3024018106088486 55 | 672,onnx2code-im2col,10.301164980000001,0.667454295158567 56 | 672,tensorflow,29.82736495,2.841980785533137 57 | 672,onnxruntime,4.14581799,0.5737991852890434 58 | 704,onnx2code-conv-naive,3.8885129900000006,0.7703979854719701 59 | 704,onnx2code-im2col,12.37283298,0.7178972353382757 60 | 704,tensorflow,37.103216950000004,9.318760829180064 61 | 704,onnxruntime,5.105472,1.2238957019354224 62 | 736,onnx2code-conv-naive,3.9390469900000005,0.5133510699725383 63 | 736,onnx2code-im2col,12.53659398,1.2395910685729303 64 | 736,tensorflow,34.75010769000001,3.339327976403812 65 | 736,onnxruntime,4.972059880000001,0.629258674413397 66 | 768,onnx2code-conv-naive,4.302181900000001,0.46374701565885035 67 | 768,onnx2code-im2col,13.454682659999996,0.8720942521698353 68 | 768,tensorflow,38.50100595,4.069915104293922 69 | 768,onnxruntime,5.67756882,0.8204209666372427 70 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/gemm.cpp: -------------------------------------------------------------------------------- 1 | template < 2 | // matrix sizes 3 | int M, 4 | int K, 5 | int N, 6 | 7 | int nc, // Columnas de panel de B 8 | int kc, // Filas de panel de B 9 | int mc, // Filas de bloque de A 10 | 11 | int mr, // Filas de microkernel 12 | int nr, // Columnas de microkernel 13 | 14 | int mv, // Filas de unit update 15 | int nu // Columnas de unit update 16 | > 17 | void gemm( 18 | const float* __restrict__ A, // MxK 19 | const float* __restrict__ B, // KxN 20 | float* __restrict__ OUT // MxN 21 | ) { 22 | memset(OUT, 0, M * N * sizeof(float)); 23 | 24 | float A_block[(mc + mr) * kc]; 25 | float B_panel[(nc + nr) * kc]; 26 | 27 | float AB_microkernel[mr * nr]; 28 | 29 | for (int jc = 0; jc < N; jc += nc) { 30 | int _nc = min(N - jc, nc); // evitar que se pase "matrices grandes?" 31 | 32 | for (int pc = 0; pc < K; pc += kc) { 33 | int _kc = min(K - pc, kc); // evitar que se pase el panel 34 | 35 | if (_kc < kc || _nc < nc || true) { 36 | gpackB_edge(_kc, _nc, (float*)B + pc * N + jc, B_panel); 37 | } else { 38 | gpackB((float*)B + pc * N + jc, B_panel); 39 | } 40 | 41 | for (int ic = 0; ic < M; ic += mc) { 42 | int _mc = min(M - ic, mc); // evitar que se pase el panel 43 | 44 | if (_kc < kc || _mc < mc) { 45 | gpackA_edge(_kc, _mc, (float*)A + ic * K + pc, A_block); 46 | } else { 47 | gpackA((float*)A + ic * K + pc, A_block); 48 | } 49 | 50 | // fprintf(stderr, "jc=%d pc=%d ic=%d _kc=%d _nc=%d, _mc=%d\n", jc, pc, ic, _kc, _nc, _mc); 51 | 52 | for (int jr = 0; jr < _nc; jr += nr) { // jr es el offset del sliver de ancho nr (violeta) 53 | for (int ir = 0; ir < _mc; ir += mr) { // ir es el offset del sliver de ancho mr (verde) 54 | // (_mr x kc) * (kc x _nr) 55 | 56 | const float* A_kernel = A_block + ir * kc; // (mr x kc) column major 57 | const float* B_kernel = B_panel + jr * kc; // (kc x nr) row major 58 | 59 | // ref_microkernel(A_kernel, B_kernel, AB_microkernel); 60 | 61 | memset(AB_microkernel, 0, mr * nr * sizeof(float)); 62 | test_microkernel(_kc, A_kernel, B_kernel, AB_microkernel); 63 | 64 | // TODO: pasar _mr y _nr para evitar escribir fuera en C 65 | // quizas un branch entre optimized y ref? 
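// _mr/_nr below clamp the microkernel tile at the right/bottom edges of the
// current block, so the write-back loops never touch elements outside OUT.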
66 | int _nr = min(_nc - jr, nr); // evitar que se pase el bloque 67 | int _mr = min(_mc - ir, mr); // evitar que se pase el bloque 68 | 69 | // assert(_mr == mr); 70 | // assert(_nr == nr); 71 | 72 | float* C_writeback = (float*)OUT + (ic + ir) * N + (jc + jr); 73 | 74 | if (_mr == mr && _nr == nr) { 75 | // Versión optimizada 76 | for (int i = 0; i < mr; i++) { 77 | for (int j = 0; j < nr; j++) { 78 | C_writeback[i * N + j] += AB_microkernel[i * nr + j]; 79 | } 80 | } 81 | } else { 82 | // Edge case 83 | for (int i = 0; i < _mr; i++) { 84 | for (int j = 0; j < _nr; j++) { 85 | C_writeback[i * N + j] += AB_microkernel[i * nr + j]; 86 | } 87 | } 88 | } 89 | } 90 | } 91 | } 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /tests/test_zoo.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | import onnx 5 | import pytest 6 | 7 | from onnx2code.checker import check_model 8 | from tests.zoo import download_from_zoo, zoo_manifest 9 | 10 | # To avoid downloading big models we know are going to fail 11 | EXCLUDED_MODELS = { 12 | "Works but is too slow": ["ResNet101-DUC-7.onnx", "ResNet101-DUC-12.onnx"], 13 | "Operation GlobalAveragePool not implemented": [ 14 | "resnet101-v1-7.onnx", 15 | "resnet101-v2-7.onnx", 16 | "resnet152-v1-7.onnx", 17 | "resnet152-v2-7.onnx", 18 | "resnet50-v1-12.onnx", 19 | "resnet18-v1-7.onnx", 20 | "resnet18-v2-7.onnx", 21 | "resnet34-v1-7.onnx", 22 | "resnet34-v2-7.onnx", 23 | "resnet50-v1-7.onnx", 24 | "resnet50-v2-7.onnx", 25 | "densenet-12.onnx", 26 | "densenet-7.onnx", 27 | "densenet-8.onnx", 28 | "densenet-9.onnx", 29 | ], 30 | "Operation LRN not implemented": [ 31 | "rcnn-ilsvrc13-7.onnx", 32 | "rcnn-ilsvrc13-8.onnx", 33 | "rcnn-ilsvrc13-9.onnx", 34 | "bvlcalexnet-12.onnx", 35 | "bvlcalexnet-7.onnx", 36 | "bvlcalexnet-8.onnx", 37 | "bvlcalexnet-9.onnx", 38 | "caffenet-12.onnx", 39 | "caffenet-7.onnx", 40 | "caffenet-8.onnx", 41 | "caffenet-9.onnx", 42 | "googlenet-12.onnx", 43 | "googlenet-7.onnx", 44 | "googlenet-8.onnx", 45 | "googlenet-9.onnx", 46 | "inception-v1-12.onnx", 47 | "inception-v1-7.onnx", 48 | "inception-v1-8.onnx", 49 | "inception-v1-9.onnx", 50 | "zfnet512-12.onnx", 51 | "zfnet512-7.onnx", 52 | "zfnet512-8.onnx", 53 | "zfnet512-9.onnx", 54 | ], 55 | "Operation Pad not implemented": [ 56 | "candy-8.onnx", 57 | "candy-9.onnx", 58 | "mosaic-8.onnx", 59 | "mosaic-9.onnx", 60 | "pointilism-8.onnx", 61 | "pointilism-9.onnx", 62 | "rain-princess-8.onnx", 63 | "rain-princess-9.onnx", 64 | "udnie-8.onnx", 65 | "udnie-9.onnx", 66 | ], 67 | "Operation Resize not implemented": [ 68 | "FasterRCNN-10.onnx", 69 | "fcn-resnet101-11.onnx", 70 | "fcn-resnet50-11.onnx", 71 | "fcn-resnet50-12.onnx", 72 | "MaskRCNN-10.onnx", 73 | ], 74 | "Broken": ["FasterRCNN-12.onnx", "MaskRCNN-12.onnx"], 75 | } 76 | 77 | 78 | def idfn(model: Any) -> str: 79 | return Path(model["model_path"]).name 80 | 81 | 82 | def check_io_is_float(model: Any) -> None: 83 | io = model["metadata"]["io_ports"] 84 | 85 | for input in io["inputs"]: 86 | if input["type"] != "tensor(float)": 87 | raise NotImplementedError(f"No support for IO port type {input['type']}") 88 | 89 | 90 | @pytest.mark.parametrize("variation", ["c"]) 91 | @pytest.mark.parametrize("model", zoo_manifest(), ids=idfn) 92 | def test_zoo(model: Any, variation: str) -> None: 93 | # avoid downloading big models! 
94 | 95 | # manual exclusion 96 | for reason, models in EXCLUDED_MODELS.items(): 97 | if idfn(model) in models: 98 | pytest.skip(reason) 99 | 100 | # opset unsupported 101 | if model["opset_version"] < 7: 102 | pytest.skip("Opset version < 7") 103 | 104 | # incompatible I/O 105 | try: 106 | check_io_is_float(model) 107 | except NotImplementedError as e: 108 | pytest.skip(e.__str__()) 109 | 110 | # model is quantized 111 | if "int8" in model["model"] or "qdq" in model["model"]: 112 | pytest.skip("Quantized models are not supported") 113 | 114 | model_path = download_from_zoo( 115 | model["model_path"], model["metadata"]["model_bytes"] 116 | ) 117 | model_proto = onnx.load(model_path.__str__()) 118 | 119 | try: 120 | check_model(model_proto, [variation]) 121 | except NotImplementedError as e: 122 | pytest.skip(e.__str__()) 123 | -------------------------------------------------------------------------------- /evaluation/measure.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | from time import perf_counter_ns 4 | 5 | import numpy as np 6 | import onnx 7 | import onnxruntime 8 | import tensorflow as tf 9 | import tf2onnx 10 | from tqdm import tqdm 11 | 12 | from onnx2code.generator import Generator 13 | from onnx2code.result import ModelResult 14 | from onnx2code.service import ModelService, TensorsMap 15 | 16 | 17 | def measure_tf( 18 | tf_model: tf.keras.Model, 19 | inputs: TensorsMap, 20 | runs: int, 21 | tqdm_leave: bool = True, 22 | ) -> list[int]: 23 | times = [] 24 | 25 | # ⚠️ Make sure to use graph execution and NOT eager execution 26 | graph_model = tf.function(tf_model) 27 | 28 | for _ in tqdm(range(runs), desc="tensorflow", leave=tqdm_leave): 29 | start = perf_counter_ns() 30 | graph_model(inputs) 31 | end = perf_counter_ns() 32 | times.append(end - start) 33 | 34 | return times 35 | 36 | 37 | def measure_onnxruntime( 38 | model_proto: onnx.ModelProto, 39 | inputs: TensorsMap, 40 | runs: int, 41 | tqdm_leave: bool = True, 42 | ) -> list[int]: 43 | times = [] 44 | ort_sess = onnxruntime.InferenceSession(model_proto.SerializeToString()) 45 | 46 | for _ in tqdm(range(runs), desc="onnxruntime", leave=tqdm_leave): 47 | start = perf_counter_ns() 48 | ort_sess.run(None, inputs) 49 | end = perf_counter_ns() 50 | times.append(end - start) 51 | 52 | return times 53 | 54 | 55 | def measure_onnx2code( 56 | model_result: ModelResult, 57 | inputs: TensorsMap, 58 | runs: int, 59 | variation_name: str = "", 60 | tqdm_leave: bool = True, 61 | ) -> list[int]: 62 | times = [] 63 | 64 | with ModelService(model_result) as service: 65 | for _ in tqdm( 66 | range(runs), 67 | desc="onnx2code" if not variation_name else f"onnx2code-{variation_name}", 68 | leave=tqdm_leave, 69 | ): 70 | start = perf_counter_ns() 71 | service.inference(inputs) 72 | end = perf_counter_ns() 73 | times.append(end - start) 74 | 75 | return times 76 | 77 | 78 | def measure_all( 79 | tf_model: tf.keras.Model, 80 | runs: int = 300, 81 | variations: list[str] = [], 82 | *, 83 | measure_base: bool = True, 84 | tqdm_leave: bool = True, 85 | onnx_model: onnx.ModelProto | None = None, 86 | ) -> dict[str, list[float]]: 87 | """ 88 | Measure the inference time of the given model in tf, onnxruntime and onnx2code. 89 | 90 | Time in milliseconds. 
91 | """ 92 | if tf_model is not None: 93 | model_proto, _ = tf2onnx.convert.from_keras(tf_model) 94 | # onnx.save(model_proto, "debug.onnx") 95 | else: 96 | model_proto = onnx_model 97 | 98 | warmup_runs = int(min(100, max(5, runs * 0.1))) 99 | total = runs + warmup_runs 100 | 101 | def postprocess(times_in_ns: list[int]) -> list[float]: 102 | return [t / 1_000_000 for t in times_in_ns[warmup_runs:]] 103 | 104 | results: dict[str, list[float]] = {} 105 | 106 | for variation in variations: 107 | model_variation = Generator(model_proto, variations=[variation]).generate() 108 | # print(model_variation.source_c) 109 | 110 | inputs = { 111 | name: np.random.random_sample(shape).astype(np.float32) * 2 - 1 112 | for name, shape in model_variation.input_shapes.items() 113 | } 114 | 115 | results[f"onnx2code-{variation}"] = postprocess( 116 | measure_onnx2code( 117 | model_variation, inputs, total, variation, tqdm_leave=tqdm_leave 118 | ) 119 | ) 120 | 121 | return results | ( 122 | { 123 | "tensorflow": postprocess( 124 | measure_tf(tf_model, inputs, total, tqdm_leave=tqdm_leave) 125 | ), 126 | "onnxruntime": postprocess( 127 | measure_onnxruntime(model_proto, inputs, total, tqdm_leave=tqdm_leave) 128 | ), 129 | } 130 | if measure_base 131 | else {} 132 | ) 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # onnx2code 2 | 3 | Generate plain C++ code for inference of ONNX models without dependencies 4 | 5 | This project was made as an alternative to a final exam for the assignment "Computer Organization II". You can read the writeup in [docs/TP Final onnx2code.pdf](docs/TP%20Final%20onnx2code.pdf) (in Spanish). 6 | 7 | ## Model support 8 | 9 | The following models have been tested and work as expected. 10 | 11 | | Model | Size | 12 | |---|---| 13 | | [mnist](https://github.com/onnx/models/tree/main/vision/classification/mnist) | 26 KB | 14 | | [Super_Resolution](https://github.com/onnx/models/tree/main/vision/super_resolution/sub_pixel_cnn_2016) | 240 KB | 15 | | [squeezenet1.1](https://github.com/onnx/models/tree/main/vision/classification/squeezenet) | 9 MB | 16 | | [emotion_ferplus](https://github.com/onnx/models/tree/main/vision/body_analysis/emotion_ferplus) | 34 MB | 17 | | [inception-v2](https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/inception_v2) | 44 MB | 18 | | [resnet50-caffe2-v1](https://github.com/onnx/models/tree/main/vision/classification/resnet) | 98 MB | 19 | | [VGG 16 and VGG 16-bn](https://github.com/onnx/models/tree/main/vision/classification/vgg) | 527 MB | 20 | | [VGG 19 and VGG 19-bn](https://github.com/onnx/models/tree/main/vision/classification/vgg) | 548 MB | 21 | | [VGG 19-caffe2](https://github.com/onnx/models/tree/main/vision/classification/vgg) | 561 MB | 22 | 23 | * Minimum ONNX opset version: **7** 24 | * Quantized models are not supported 25 | 26 | ## Operator support 27 | 28 | Only `float` data type is supported. 29 | 30 | | Operator | Attribute support | 31 | |---|---| 32 | | [Add](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Add), [Div](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Div), [Mul](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Mul), [Sub](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sub) | ✅ with broadcasting | 33 | | [Concat](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Concat) | ✅ with multiple inputs
✅ axis | 34 | | [Conv](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Conv) | ✅ bias
✅ stride
✅ padding (and `auto_pad`)
❌ dilations
❌ depthwise (group != 1) | 35 | | [Sum](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sum) | ✅ with multiple inputs
❌ with broadcasting | 36 | | [Relu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Relu), [Tanh](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Tanh), [Sigmoid](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sigmoid), [Clip](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Clip) | ✅ | 37 | | [Gemm](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gemm) | ✅ with bias
❌ transpose A
✅ transpose B
❌ alpha != 1
❌ beta != 1 | 38 | | [Identity](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity) | ✅ | 39 | | [MaxPool](https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool), [AveragePool](https://github.com/onnx/onnx/blob/main/docs/Operators.md#AveragePool) | ✅ stride
✅ padding (and `auto_pad`)
❌ dilations
❌ storage_order != 0
❌ count_include_pad != 0 | 40 | [Softmax](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Softmax) |
✅ axis | 41 | | [Transpose](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Transpose) | ✅ perm | 42 | 43 | 44 | ## Setting up with Docker 45 | 46 | We provide a ready to use [Docker image](https://hub.docker.com/r/mlomb/onnx2code): 47 | 48 | ```sh 49 | docker run --rm -it -v $pwd/mnist.onnx:/app/input.onnx:ro -v $pwd/output:/app/output:rw mlomb/onnx2code:latest --variations=im2col,loop-tiling --checks=3 50 | ``` 51 | 52 | The command above will generate C++ code for the `mnist.onnx` model in the `output` folder. 53 | 54 | ## Setting up locally 55 | 56 | ### Prerequisites 57 | 58 | * gcc (required if checking models) 59 | * Python 3.10 60 | * [pipenv](https://pypi.org/project/pipenv/) 61 | 62 | Clone and install dependencies with `pipenv install`. 63 | 64 | ### Run 65 | 66 | To generate code from an ONNX model, run the following command inside a pipenv shell: 67 | 68 | ```sh 69 | python -m onnx2code --variation=im2col,loop-tiling mnist.onnx output_folder --checks=3 70 | ``` 71 | -------------------------------------------------------------------------------- /onnx2code/memory.py: -------------------------------------------------------------------------------- 1 | import math 2 | from dataclasses import dataclass 3 | 4 | # We implement different memory strategies used in TFLite 5 | # Since we are using CPU we aim to the Memory Offset Calculation approach 6 | # 7 | # See: 8 | # * https://arxiv.org/pdf/2001.03288.pdf (main paper) 9 | # * https://blog.tensorflow.org/2020/10/optimizing-tensorflow-lite-runtime.html (blog post) 10 | # * https://github.com/tensorflow/tensorflow/blob/1b36c9fb27ce899e19ddf65da3c0920861210472/tensorflow/lite/delegates/gpu/common/memory_management (ref code) 11 | 12 | 13 | @dataclass 14 | class TensorUsageRecord: 15 | first_op: int 16 | last_op: int 17 | size: int 18 | index: int = -1 # used to store the original index after sorting 19 | 20 | 21 | Records = list[TensorUsageRecord] 22 | Offsets = list[int | None] 23 | Result = tuple[int, Offsets] 24 | 25 | 26 | ########################## 27 | # Naive 28 | ########################## 29 | def naive(records: Records) -> Result: 30 | total_consumption = 0 31 | offsets: Offsets = [None] * len(records) 32 | 33 | for i, r in enumerate(records): 34 | offsets[i] = total_consumption 35 | total_consumption += r.size 36 | 37 | return total_consumption, offsets 38 | 39 | 40 | ########################## 41 | # Greed by Size 42 | # 43 | # TFLite C impl: https://github.com/tensorflow/tensorflow/blob/1b36c9fb27ce899e19ddf65da3c0920861210472/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.cc#L69 44 | ########################## 45 | def greedy_by_size(records: Records) -> Result: 46 | # save original indexes 47 | for i, r in enumerate(records): 48 | r.index = i 49 | 50 | # sort records in decreasing order of size 51 | records.sort(key=lambda r: r.size, reverse=True) 52 | 53 | # result 54 | total_consumption = 0 55 | offsets: Offsets = [None] * len(records) 56 | 57 | # indexes already allocated, ordered by offset 58 | ordered_allocs: list[int] = [] 59 | 60 | for t_i, t in enumerate(records): 61 | prev_offset = 0 62 | best_offset = None 63 | smallest_gap = math.inf 64 | 65 | for allocated_id in ordered_allocs: 66 | rec = records[allocated_id] 67 | 68 | if rec.last_op < t.first_op or rec.first_op > t.last_op: 69 | # no overlap, skip 70 | continue 71 | 72 | cur_offset = offsets[rec.index] 73 | assert cur_offset is not None 74 | 75 | if cur_offset >= prev_offset: 76 | gap = cur_offset - prev_offset 77 
| 78 | if gap >= t.size and gap < smallest_gap: 79 | smallest_gap = gap 80 | best_offset = prev_offset 81 | 82 | prev_offset = max(prev_offset, cur_offset + rec.size) 83 | 84 | # if no suitable gap found, allocate at the end 85 | if best_offset is None: 86 | best_offset = prev_offset 87 | 88 | offsets[t.index] = best_offset 89 | total_consumption = max(total_consumption, best_offset + t.size) 90 | 91 | ordered_allocs.append(t_i) 92 | 93 | # sort by offset 94 | ordered_allocs.sort(key=lambda i: offsets[records[i].index]) # type: ignore 95 | 96 | return total_consumption, offsets 97 | 98 | 99 | ########################## 100 | # Greed by Breadth 101 | ########################## 102 | def greedy_by_breadth(records: Records) -> Result: 103 | raise NotImplementedError() 104 | 105 | 106 | def find_best_layout(records: Records) -> Result: 107 | """ 108 | Find the best memory layout using different strategies. 109 | """ 110 | alternatives = [ 111 | naive(records), 112 | greedy_by_size(records), 113 | # greedy_by_breadth(records), 114 | ] 115 | 116 | return min(alternatives, key=lambda r: r[0]) 117 | 118 | 119 | if __name__ == "__main__": 120 | test = [ 121 | TensorUsageRecord(0, 1, 32), 122 | TensorUsageRecord(1, 4, 28), 123 | TensorUsageRecord(2, 5, 36), 124 | TensorUsageRecord(3, 5, 16), 125 | TensorUsageRecord(4, 5, 8), 126 | TensorUsageRecord(5, 7, 64), 127 | TensorUsageRecord(6, 8, 10), 128 | TensorUsageRecord(7, 8, 40), 129 | ] 130 | 131 | print(find_best_layout(test)) 132 | -------------------------------------------------------------------------------- /onnx2code/ops/operation.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import defaultdict 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from textwrap import dedent 6 | from typing import Any, Callable, Literal 7 | 8 | import onnx 9 | 10 | from ..tensor import TensorInfo 11 | 12 | # used as tensor names 13 | LETTERS = ( 14 | "A", 15 | "B", 16 | "C", 17 | "D", 18 | "E", 19 | "F", 20 | "G", 21 | "H", 22 | "I", 23 | "J", 24 | "K", 25 | "L", 26 | "M", 27 | "N", 28 | "O", 29 | "P", 30 | "Q", 31 | "R", 32 | "S", 33 | "T", 34 | "U", 35 | "V", 36 | "W", 37 | "X", 38 | "Y", 39 | "Z", 40 | ) 41 | 42 | 43 | @dataclass 44 | class OpCall: 45 | sig_name: str 46 | sig_params: list[int | str | list[int] | list[str]] 47 | inputs: list[TensorInfo] 48 | outputs: list[TensorInfo] 49 | input_names: tuple[str, ...] = LETTERS 50 | output_names: tuple[str, ...] 
= ("OUT",) 51 | 52 | def fn_name(self) -> str: 53 | str_sig_params = [] 54 | for sig_param in self.sig_params: 55 | if isinstance(sig_param, list): 56 | str_sig_params.append("x".join(map(str, sig_param))) 57 | else: 58 | str_sig_params.append(str(sig_param)) 59 | 60 | return f"{self.sig_name}{'_' if len(str_sig_params) > 0 else ''}" + "_".join( 61 | str_sig_params 62 | ) 63 | 64 | def signature(self) -> str: 65 | params = [] 66 | for i in range(len(self.inputs)): 67 | params.append(f"const float* __restrict__ {self.input_names[i]}") 68 | for i in range(len(self.outputs)): 69 | params.append(f"float* __restrict__ {self.output_names[i]}") 70 | 71 | return f"void {self.fn_name()}({', '.join(params)})" 72 | 73 | def invocation(self) -> str: 74 | return ( 75 | self.fn_name() 76 | + f"({', '.join(t.variable for t in self.inputs + self.outputs)})" 77 | ) 78 | 79 | 80 | @dataclass(frozen=True) 81 | class ASMAuxFunction: 82 | signature: str 83 | source: str 84 | 85 | 86 | @dataclass(frozen=True) 87 | class OpImpl: 88 | lang: Literal["c", "asm"] 89 | source: str | tuple[str, ...] 90 | cpp_aux_functions: tuple[str, ...] = () 91 | asm_aux_functions: tuple[ASMAuxFunction, ...] = () 92 | external_paths: tuple[Path, ...] = () 93 | 94 | def full_source(self) -> str: 95 | code = self.source if isinstance(self.source, str) else "\n".join(self.source) 96 | return dedent(code).strip().strip("\n") 97 | 98 | 99 | @dataclass(frozen=True) 100 | class RegistryEntry: 101 | variant_tags: list[str] 102 | priority: int 103 | klass: type["Operation"] 104 | 105 | def __lt__(self, other: Any) -> bool: 106 | return self.priority < other.priority # type: ignore 107 | 108 | 109 | class Operation(ABC): 110 | node_types: set[str] 111 | _registry: defaultdict[str, list[RegistryEntry]] = defaultdict(list) 112 | 113 | def __init__( 114 | self, 115 | node: onnx.NodeProto, 116 | inputs: list[TensorInfo], 117 | outputs: list[TensorInfo], 118 | ): 119 | self.node = node 120 | self.inputs = inputs 121 | self.outputs = outputs 122 | self.parse() 123 | 124 | @abstractmethod 125 | def parse(self) -> None: 126 | pass 127 | 128 | @abstractmethod 129 | def call(self) -> OpCall | None: 130 | return None 131 | 132 | @abstractmethod 133 | def impl(self) -> OpImpl | None: 134 | pass 135 | 136 | @classmethod 137 | def variant( 138 | cls, var: str | list[str], priority: int = 0 139 | ) -> Callable[[type["Operation"]], type["Operation"]]: 140 | vars = [var] if isinstance(var, str) else var 141 | 142 | def decorator(newcls: type[Operation]) -> type[Operation]: 143 | for node_type in newcls.node_types: 144 | cls._registry[node_type].append( 145 | RegistryEntry(variant_tags=vars, priority=priority, klass=newcls) 146 | ) 147 | # always keep sorted 148 | cls._registry[node_type].sort() 149 | 150 | return newcls 151 | 152 | return decorator 153 | 154 | @staticmethod 155 | def get(node_type: str, variant_order: list[str]) -> list[type["Operation"]]: 156 | if node_type not in Operation._registry: 157 | raise NotImplementedError(f"Operation {node_type} not implemented") 158 | 159 | variants = [] 160 | 161 | for variant_tag in variant_order: 162 | for entry in Operation._registry[node_type]: 163 | if variant_tag in entry.variant_tags: 164 | variants.append(entry.klass) 165 | 166 | if len(variants) == 0: 167 | raise ValueError(f"No valid variant found for {node_type}") 168 | else: 169 | return list(dict.fromkeys(variants)) 170 | -------------------------------------------------------------------------------- /onnx2code/util.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any, Literal, Optional 2 | 3 | import numpy as np 4 | import onnx 5 | 6 | TensorShape = list[int] 7 | ShapesMap = dict[str, TensorShape] 8 | 9 | 10 | # taken from onnx_simplifier.get_inputs 11 | def get_model_inputs(model: onnx.ModelProto) -> list[onnx.ValueInfoProto]: 12 | initializer_names = [x.name for x in model.graph.initializer] 13 | return [ipt for ipt in model.graph.input if ipt.name not in initializer_names] 14 | 15 | 16 | # taken from onnx_simplifier.get_shape_from_value_info_proto 17 | def get_shape_from_value_info_proto(v: onnx.ValueInfoProto) -> TensorShape: 18 | return [dim.dim_value for dim in v.type.tensor_type.shape.dim] 19 | 20 | 21 | # taken from onnx_simplifier.get_value_info_all 22 | def get_value_info_all(m: onnx.ModelProto, name: str) -> Optional[onnx.ValueInfoProto]: 23 | for v in m.graph.value_info: 24 | if v.name == name: 25 | return v # type: ignore 26 | 27 | for v in m.graph.input: 28 | if v.name == name: 29 | return v # type: ignore 30 | 31 | for v in m.graph.output: 32 | if v.name == name: 33 | return v # type: ignore 34 | 35 | return None 36 | 37 | 38 | # taken from onnx_simplifier.get_shape 39 | def get_shape(m: onnx.ModelProto, name: str) -> TensorShape: 40 | v = get_value_info_all(m, name) 41 | if v is not None: 42 | return get_shape_from_value_info_proto(v) 43 | raise RuntimeError('Cannot get shape of "{}"'.format(name)) 44 | 45 | 46 | def get_fixed_input_shapes(onnx_model: onnx.ModelProto) -> ShapesMap: 47 | """ 48 | Returns a map with the input name as key and the shape of the input 49 | fixed to one batch. 50 | 51 | For example, if one of the inputs of the model is [None, 32, 32, 3], 52 | the resulting shape for that input will be [1, 32, 32, 3]. 53 | """ 54 | 55 | def fix_shape(shape: list[int]) -> list[int]: 56 | return [1 if (d == 0 or d is None) else d for d in shape] 57 | 58 | return { 59 | tensor.name: fix_shape(get_shape(onnx_model, tensor.name)) 60 | for tensor in get_model_inputs(onnx_model) 61 | } 62 | 63 | 64 | def get_attribute(node: onnx.NodeProto, name: str, default: Any = None) -> Any: 65 | """ 66 | Returns the value of the attribute with the given name. 67 | If the attribute is not found, returns the default value. 68 | """ 69 | for attr in node.attribute: 70 | if attr.name == name: 71 | return onnx.helper.get_attribute_value(attr) 72 | return default 73 | 74 | 75 | def compute_strides(shape: list[int]) -> list[int]: 76 | """ 77 | Returns the strides of the given shape. 78 | 79 | For example, compute_strides([1, 2, 3]) returns [6, 3, 1]. 
80 | """ 81 | strides = [] 82 | for i in range(len(shape)): 83 | after = shape[i + 1 :] 84 | if len(after) == 0: 85 | strides.append(1) 86 | else: 87 | strides.append(int(np.prod(after))) 88 | return strides 89 | 90 | 91 | def resolve_stride_attribute(node: onnx.NodeProto) -> list[int]: 92 | """ 93 | Retrieves the strides attribute from a node or returns the default value 94 | """ 95 | strides: list[int] = get_attribute(node, "strides", [1] * 2) 96 | return strides 97 | 98 | 99 | def compute_pad_in_dimension( 100 | in_dim: int, 101 | stride: int, 102 | kernel: int, 103 | pad_type: Literal[b"SAME_UPPER", b"SAME_LOWER", b"VALID", b"NOTSET"], 104 | ) -> tuple[int, int]: 105 | """ 106 | https://github.com/microsoft/onnxruntime/blob/9ec1ed42a809170b87474f5822c4557101812399/onnxruntime/core/providers/common.h#L73 107 | """ 108 | pad_head = 0 109 | pad_tail = 0 110 | 111 | if pad_type == b"VALID" or pad_type == b"NOTSET": 112 | pass 113 | elif pad_type == b"SAME_UPPER" or pad_type == b"SAME_LOWER": 114 | legacy_target_size = (in_dim + stride - 1) // stride 115 | pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim 116 | 117 | if pad_type == b"SAME_LOWER": 118 | pad_head = (pad_needed + 1) // 2 119 | else: 120 | pad_head = pad_needed // 2 121 | 122 | pad_tail = pad_needed - pad_head 123 | else: 124 | raise NotImplementedError(f"Pad type {pad_type} not implemented") 125 | 126 | return pad_head, pad_tail 127 | 128 | 129 | def resolve_padding_attribute( 130 | node: onnx.NodeProto, X: TensorShape, W: TensorShape 131 | ) -> list[int]: 132 | """ 133 | Retrieves the padding attribute from a node or returns the default value 134 | """ 135 | ndims = len(X) - 2 # number of spatial dimensions (excluding batch and channel) 136 | pads: list[int] = get_attribute(node, "pads", None) 137 | auto_pad = get_attribute(node, "auto_pad", b"NOTSET") 138 | stride = resolve_stride_attribute(node) 139 | 140 | if pads is not None: 141 | assert auto_pad == b"NOTSET", "Cannot specify both pads and auto_pad" 142 | return pads 143 | 144 | pads = [0] * ndims * 2 145 | for i in range(ndims): 146 | pad_head, pad_tail = compute_pad_in_dimension( 147 | X[i + 2], stride[i], W[i + 2], auto_pad 148 | ) 149 | pads[i] = pad_head 150 | pads[i + ndims] = pad_tail 151 | 152 | return pads 153 | -------------------------------------------------------------------------------- /onnx2code/service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import tempfile 4 | from multiprocessing import shared_memory 5 | from pathlib import Path 6 | from subprocess import PIPE, run 7 | from typing import Any 8 | 9 | import numpy as np 10 | 11 | from .result import ModelResult 12 | from .tensor import TensorData 13 | from .util import ShapesMap 14 | 15 | TensorsMap = dict[str, TensorData] 16 | TensorsList = list[TensorData] 17 | 18 | 19 | def _run_compilation_command(cmd: list[str]) -> None: 20 | """ 21 | Runs a given compilation command as a subprocess 22 | 23 | :param cmd: A list containing the command and its CLI args 24 | :raises SyntaxError: If the process return code is non-zero 25 | """ 26 | compilation_process = run(cmd, stderr=PIPE) 27 | if compilation_process.returncode != 0: 28 | raise SyntaxError(compilation_process.stderr.decode("utf8")) 29 | 30 | 31 | class ModelService: 32 | """ 33 | Allows using a model generated by onnx2code in a convenient way 34 | 35 | Used for testing and evaluation 36 | """ 37 | 38 | def __init__(self, result: ModelResult): 39 | 
self.result = result 40 | 41 | def __enter__(self) -> "ModelService": 42 | """ 43 | Compiles the model and starts a subprocess 44 | """ 45 | self.temp_dir = tempfile.TemporaryDirectory() 46 | 47 | self._compile() 48 | self._boot() 49 | 50 | return self 51 | 52 | def _compile(self) -> None: 53 | debug = os.getenv("ONNX2CODE_DEBUG", "0") == "1" 54 | 55 | if debug: 56 | # save for later inspection 57 | temp_dir = Path(__file__).parent.parent / "tmp/" 58 | else: 59 | temp_dir = Path(self.temp_dir.name) 60 | 61 | temp_dir.mkdir(exist_ok=True) 62 | 63 | c_file = temp_dir / "model.cpp" 64 | h_file = temp_dir / "model.h" 65 | asm_file = temp_dir / "model.asm" 66 | asm_object = temp_dir / "model-asm.o" 67 | svc_file = Path(__file__).parent / "service.c" 68 | self.weights_file = temp_dir / "weights.bin" 69 | self.service_executable = temp_dir / "service" 70 | 71 | self.result.weights.tofile(self.weights_file) 72 | 73 | for file, content in [ 74 | (c_file, self.result.source_c), 75 | (h_file, self.result.source_h), 76 | (asm_file, self.result.source_asm), 77 | ]: 78 | with open(file, "w") as f: 79 | f.write(content) 80 | 81 | _run_compilation_command( 82 | [ 83 | "nasm", 84 | "-f", 85 | "elf64", 86 | str(asm_file), 87 | "-o", 88 | str(asm_object), 89 | ] 90 | + (["-g", "-w+all", "-w+error"] if debug else []) 91 | ) 92 | 93 | _run_compilation_command( 94 | [ 95 | "g++", 96 | "-m64", # 64 bit env 97 | str(asm_object), 98 | str(h_file), 99 | str(c_file), 100 | str(svc_file), 101 | "-o", 102 | str(self.service_executable), 103 | "-I", 104 | temp_dir.__str__(), 105 | "-lrt", # for shm 106 | "-lm", # for math 107 | "-march=native", 108 | "-mtune=native", 109 | "-O3", 110 | ] 111 | + ( 112 | [ 113 | "-g", 114 | "-fsanitize=address", 115 | "-Wall", 116 | "-Werror", 117 | "-Wno-unused-result", 118 | "-Wno-unused-but-set-variable", 119 | "-Wno-unused-variable", 120 | ] 121 | if debug 122 | else [] 123 | ) 124 | ) 125 | 126 | def _boot(self) -> None: 127 | """ 128 | Creates the shared memory buffers and starts the service subprocess 129 | """ 130 | self.inputs_buffer = SharedNDArrays("/o2c-inputs", self.result.input_shapes) 131 | self.outputs_buffer = SharedNDArrays("/o2c-outputs", self.result.output_shapes) 132 | 133 | self.process = subprocess.Popen( 134 | [self.service_executable, self.weights_file], 135 | stdin=subprocess.PIPE, 136 | stdout=subprocess.PIPE, 137 | ) 138 | 139 | def inference(self, inputs: TensorsMap) -> TensorsList: 140 | """ 141 | Runs the model with the given inputs 142 | """ 143 | assert len(inputs) == len(self.result.input_shapes) 144 | 145 | # load inputs into shared memory 146 | self.inputs_buffer.set(inputs) 147 | 148 | # signal service that inputs are ready 149 | assert self.process.stdin and self.process.stdout 150 | self.process.stdin.write("1".encode()) 151 | self.process.stdin.flush() 152 | # wait for service to finish inference 153 | self.process.stdout.read(1) 154 | 155 | # read outputs from shared memory 156 | return self.outputs_buffer.get() 157 | 158 | def __exit__(self, _1: Any, _2: Any, _3: Any) -> None: 159 | # exit service 160 | self.process.terminate() 161 | 162 | # release shared memory 163 | self.inputs_buffer.cleanup() 164 | self.outputs_buffer.cleanup() 165 | 166 | # remove compilation files 167 | self.temp_dir.cleanup() 168 | 169 | 170 | class SharedNDArrays: 171 | """ 172 | List of NDArray[float32]'s backed by shared memory 173 | """ 174 | 175 | def __init__(self, name: str, shapes: ShapesMap): 176 | self.shapes = shapes 177 | self.offsets = np.cumsum([0, 
*[np.prod(s) for s in shapes.values()]]) 178 | self.elems = self.offsets[-1] 179 | self.size = self.elems * 4 180 | 181 | try: 182 | shm = shared_memory.SharedMemory(name, create=False) 183 | shm.unlink() 184 | except FileNotFoundError: 185 | pass 186 | 187 | self.shm = shared_memory.SharedMemory(name, create=True, size=self.size) 188 | self.buffer: TensorData = np.ndarray( 189 | self.elems, dtype=np.float32, buffer=self.shm.buf 190 | ) 191 | 192 | def set(self, inputs: TensorsMap) -> None: 193 | self.buffer[:] = np.concatenate([inp.reshape(-1) for inp in inputs.values()]) 194 | 195 | def get(self) -> TensorsList: 196 | return [ 197 | self.buffer[self.offsets[i] : self.offsets[i + 1]].reshape(self.shapes[n]) 198 | for i, n in enumerate(self.shapes) 199 | ] 200 | 201 | def cleanup(self) -> None: 202 | del self.buffer 203 | self.shm.close() 204 | self.shm.unlink() 205 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from typing import Iterable 3 | 4 | from onnx2code.util import get_attribute 5 | 6 | from .gemm_tiling.GEMM import call_GEMM, external_paths_GEMM 7 | from .operation import OpCall, Operation, OpImpl 8 | 9 | 10 | class GEMM(Operation): 11 | """ 12 | GEneral Matrix Multiplication operator 13 | 14 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#gemm 15 | """ 16 | 17 | node_types = {"Gemm", "MatMul"} 18 | 19 | def parse(self) -> None: 20 | assert ( 21 | len(self.inputs) == 2 or len(self.inputs) == 3 22 | ), "expected two or three inputs" 23 | assert len(self.outputs) == 1, "expected one output" 24 | 25 | self.hasC = len(self.inputs) == 3 26 | self.transA = get_attribute(self.node, "transA", 0) > 0.5 27 | self.transB = get_attribute(self.node, "transB", 0) > 0.5 28 | self.alpha = get_attribute(self.node, "alpha", 1.0) 29 | self.beta = get_attribute(self.node, "beta", 1.0) 30 | 31 | # normalize 32 | self.alpha = None if self.alpha == 1.0 else self.alpha 33 | self.beta = None if self.beta == 1.0 else self.beta 34 | 35 | if self.transA: 36 | raise NotImplementedError("transA not supported") 37 | if self.alpha is not None: 38 | raise NotImplementedError("alpha not supported") 39 | if self.beta is not None: 40 | raise NotImplementedError("beta not supported") 41 | 42 | A = self.inputs[0] 43 | B = self.inputs[1] 44 | Y = self.outputs[0] 45 | 46 | self.N = A.shape[0] 47 | self.M = B.shape[1] if self.transB else B.shape[0] 48 | self.K = B.shape[0] if self.transB else B.shape[1] 49 | 50 | assert Y.shape[0] == self.N 51 | assert Y.shape[1] == self.K 52 | 53 | def call(self) -> OpCall: 54 | return OpCall( 55 | sig_name="GEMM", 56 | sig_params=[ 57 | self.hasC, 58 | self.N, 59 | self.M, 60 | self.K, 61 | self.transB, 62 | ], 63 | inputs=self.inputs, 64 | outputs=self.outputs, 65 | ) 66 | 67 | 68 | @GEMM.variant(["c", "gemm-naive"], priority=2) 69 | class GEMMC(GEMM): 70 | def impl(self) -> OpImpl: 71 | N, M, K = self.N, self.M, self.K 72 | 73 | index_B = f"i * {K} + col" if not self.transB else f"col * {M} + i" 74 | 75 | source = f""" 76 | for(int row = 0; row < {N}; row++) {{ 77 | for(int col = 0; col < {K}; col++) {{ 78 | float sum = 0; 79 | for(int i = 0; i < {M}; i++) {{ 80 | sum += A[row * {M} + i] * B[{index_B}]; 81 | }} 82 | OUT[row * {K} + col] = sum{f' + C[row * {K} + col]' if self.hasC else ''}; 83 | }} 84 | }} 85 | """ 86 | 87 | return OpImpl(lang="c", source=source) 88 | 89 | 90 | # Make sure this executable is 
in your PATH 91 | LIBXSMM_PATH = "libxsmm_gemm_generator" 92 | 93 | 94 | @GEMM.variant(["asm", "libxsmm"], priority=0) 95 | class GEMMAsm(GEMM): 96 | def impl(self) -> OpImpl: 97 | N, M, K = self.N, self.M, self.K 98 | 99 | aux_fn_name = f"libxsmm_GEMM_{N}_{M}_{K}" 100 | 101 | # Reference: https://scalable.uni-jena.de/opt/hpc/chapters/assignment_small_gemms.html 102 | generator_args = [ 103 | LIBXSMM_PATH, 104 | # matrix type 105 | "dense", 106 | # output file name 107 | "/dev/stdout", 108 | # function name 109 | aux_fn_name, 110 | # matrix size 111 | str(K), 112 | str(N), 113 | str(M), 114 | # lda, ldb, ldc 115 | str(K), 116 | str(M), 117 | str(K), 118 | # alpha beta 119 | # C := alpha*A*B + beta*C 120 | "1", 121 | "0", 122 | # 0: unaligned A, C 123 | "0", 124 | "0", 125 | # arch 126 | "hsw", # haswell, targets AVX2 127 | # prefetch 128 | "nopf", # no prefetch 129 | # precision 130 | "SP", # single precision (f32) 131 | ] 132 | 133 | try: 134 | libxsmm_generator_process = subprocess.run( 135 | " ".join(generator_args), 136 | capture_output=True, 137 | encoding="utf-8", 138 | shell=True, 139 | ) 140 | except PermissionError: 141 | raise RuntimeError(f"libxsmm not found at '{LIBXSMM_PATH}'") 142 | 143 | if ( 144 | libxsmm_generator_process.returncode != 0 145 | or libxsmm_generator_process.stderr != "" 146 | ): 147 | raise RuntimeError(f"libxsmm: {libxsmm_generator_process.stderr}") 148 | 149 | lines: Iterable[str] = libxsmm_generator_process.stdout.splitlines() 150 | 151 | aux_fn = "\n".join( 152 | filter( 153 | # Filter out the flops line 154 | lambda line: not ( 155 | line.startswith("libxsmm_num_total_flops") or line == "" 156 | ), 157 | lines, 158 | ) 159 | ) 160 | 161 | if aux_fn == "": 162 | raise RuntimeError("libxsmm: no output") 163 | 164 | # tensors MUST be reversed since libxsmm uses BLAS' column-major order 165 | # and we use onnx's row-major order 166 | source = f""" 167 | {aux_fn_name}(B, A, OUT); 168 | """ + ( 169 | f""" 170 | for(int i = 0; i < {N * K}; i++) {{ 171 | OUT[i] += C[i]; 172 | }} 173 | """ 174 | if self.hasC 175 | else "" 176 | ) 177 | 178 | return OpImpl(lang="c", source=source, cpp_aux_functions=(aux_fn,)) 179 | 180 | 181 | @GEMM.variant(["c", "loop-tiling"], priority=1) 182 | class GEMMLoopTiling(GEMM): 183 | def impl(self) -> OpImpl: 184 | M, K, N = self.N, self.M, self.K 185 | 186 | if self.hasC: 187 | raise NotImplementedError("hasC not supported") 188 | 189 | # unit_update_asm = ASMAuxFunction( 190 | # signature="void unit_update(const float*, const float*, float*)", 191 | # source=""" 192 | # vbroadcastss ymm0, [rsi] 193 | # vmovups ymm1, [rdi] 194 | # vfmadd213ps ymm0, ymm1, [rdx] 195 | # vmovups [rdx], ymm0 196 | # vzeroupper 197 | # ret 198 | # """, 199 | # ) 200 | 201 | return OpImpl( 202 | lang="c", 203 | source=call_GEMM(M, K, N, "A, B, OUT"), 204 | external_paths=external_paths_GEMM, 205 | # asm_aux_functions=(unit_update_asm,), 206 | ) 207 | -------------------------------------------------------------------------------- /evaluation/results_conv/10th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-conv-naive,0.5491880600000001,0.13237336197376118 3 | 256,onnx2code-im2col,0.9188441900000001,0.1298197832873476 4 | 256,tensorflow,1.3908916,0.2866774618994664 5 | 256,onnxruntime,0.4345319399999999,0.1870622537994141 6 | 288,onnx2code-conv-naive,0.48715349999999996,0.08764745483840361 7 | 288,onnx2code-im2col,1.34965215,0.26676209362540154 8 | 
288,tensorflow,1.4971754700000002,0.2615000408917924 9 | 288,onnxruntime,0.66567734,0.13843684378489854 10 | 320,onnx2code-conv-naive,0.69172044,0.13837388211142448 11 | 320,onnx2code-im2col,1.5007587900000001,0.27390226756988684 12 | 320,tensorflow,1.6004319200000001,0.24421334188048285 13 | 320,onnxruntime,0.83764132,0.18599264028895768 14 | 352,onnx2code-conv-naive,0.7370404599999999,0.13744646944053673 15 | 352,onnx2code-im2col,1.9929681699999995,0.3946627409351447 16 | 352,tensorflow,1.7411530500000003,0.2345129273474439 17 | 352,onnxruntime,1.0316308899999997,0.16029263369212543 18 | 384,onnx2code-conv-naive,0.85087959,0.14158904427695632 19 | 384,onnx2code-im2col,2.24507743,0.38668566384204767 20 | 384,tensorflow,1.8663835800000002,0.3699921724628017 21 | 384,onnxruntime,1.34216344,0.2132365995996616 22 | 416,onnx2code-conv-naive,1.07306232,0.14262480010565343 23 | 416,onnx2code-im2col,3.00455171,0.7940435426173719 24 | 416,tensorflow,1.9588059400000004,0.2563426713220731 25 | 416,onnxruntime,1.5810998100000004,0.24184807849080361 26 | 448,onnx2code-conv-naive,1.2683012500000002,0.18853181540193026 27 | 448,onnx2code-im2col,3.25650464,0.6449111779435757 28 | 448,tensorflow,2.41006099,0.3340333789623874 29 | 448,onnxruntime,1.85145789,0.2536602645151934 30 | 480,onnx2code-conv-naive,1.4698558999999998,0.17367458305287506 31 | 480,onnx2code-im2col,3.86520041,0.9520036125981465 32 | 480,tensorflow,2.3632088700000002,0.28686510795984427 33 | 480,onnxruntime,2.07489152,0.28566567101177837 34 | 512,onnx2code-conv-naive,1.59486367,0.198567874527178 35 | 512,onnx2code-im2col,4.39524337,0.7469946433111774 36 | 512,tensorflow,2.51543401,0.26654362903001433 37 | 512,onnxruntime,2.4066684400000002,0.29880642444644057 38 | 544,onnx2code-conv-naive,1.59447175,0.20297294414607947 39 | 544,onnx2code-im2col,4.877493419999999,0.946721760463772 40 | 544,tensorflow,2.78587158,0.4026182149866093 41 | 544,onnxruntime,2.64123508,0.30412804938264676 42 | 576,onnx2code-conv-naive,1.8584777100000003,0.2965443570897715 43 | 576,onnx2code-im2col,5.182136000000001,0.613396774608165 44 | 576,tensorflow,2.94818244,0.34298876794677463 45 | 576,onnxruntime,3.0138453599999995,0.32912679608830153 46 | 608,onnx2code-conv-naive,2.1042586300000004,0.33962333469941824 47 | 608,onnx2code-im2col,5.90957882,0.7201904177597808 48 | 608,tensorflow,2.9772878200000004,0.38372111270959225 49 | 608,onnxruntime,3.3932347099999993,0.4371578633363993 50 | 640,onnx2code-conv-naive,2.2751119600000003,0.2228727057489508 51 | 640,onnx2code-im2col,6.65117641,0.8332841357658395 52 | 640,tensorflow,3.4733946799999997,0.4854518077448446 53 | 640,onnxruntime,3.6088097100000005,0.37096904414304693 54 | 672,onnx2code-conv-naive,2.38564787,0.30105978589790616 55 | 672,onnx2code-im2col,7.230346799999999,0.779337072554604 56 | 672,tensorflow,3.72211326,0.3315343707700189 57 | 672,onnxruntime,4.015681740000001,0.36337057032155534 58 | 704,onnx2code-conv-naive,2.73222686,0.3305703471703722 59 | 704,onnx2code-im2col,7.684625929999999,0.7507740755702911 60 | 704,tensorflow,3.9292453199999993,0.36010634476584497 61 | 704,onnxruntime,4.47966224,0.4367974261954189 62 | 736,onnx2code-conv-naive,3.0278000099999995,0.4229891265275384 63 | 736,onnx2code-im2col,8.640412600000001,0.9262444881498297 64 | 736,tensorflow,4.10335983,0.42943891032949155 65 | 736,onnxruntime,4.78642897,0.4368799570415529 66 | 768,onnx2code-conv-naive,3.1074968299999997,0.35935848197272474 67 | 768,onnx2code-im2col,9.42943564,1.0549493285844636 68 | 
768,tensorflow,4.35549105,0.4705102456384 69 | 768,onnxruntime,5.20144776,0.48165631481254595 70 | 800,onnx2code-conv-naive,3.40349017,0.4410177443325167 71 | 800,onnx2code-im2col,10.087175249999998,1.0337052905505357 72 | 800,tensorflow,4.58121696,0.515248672096919 73 | 800,onnxruntime,5.688211319999999,0.5391705471565167 74 | 832,onnx2code-conv-naive,3.74884378,0.4504297513641739 75 | 832,onnx2code-im2col,11.362020870000004,1.3332305096315464 76 | 832,tensorflow,4.743467050000001,0.627452077184997 77 | 832,onnxruntime,6.235891240000001,0.5067022519198059 78 | 864,onnx2code-conv-naive,4.14272667,0.5105069073467872 79 | 864,onnx2code-im2col,12.0510654,1.139568936617281 80 | 864,tensorflow,5.09580038,0.509759947245226 81 | 864,onnxruntime,6.721285499999999,0.5030100573284891 82 | 896,onnx2code-conv-naive,4.175582160000001,0.4371373430177001 83 | 896,onnx2code-im2col,12.419930509999999,0.899210697129538 84 | 896,tensorflow,5.326666319999998,0.4760109094783623 85 | 896,onnxruntime,7.1844892499999995,0.6275378377775538 86 | 928,onnx2code-conv-naive,4.54842397,0.46409270056854923 87 | 928,onnx2code-im2col,13.424788929999998,1.0794657054235974 88 | 928,tensorflow,5.974092810000001,0.569689363140259 89 | 928,onnxruntime,7.64295418,0.611710935315078 90 | 960,onnx2code-conv-naive,4.86105023,0.5581146653470925 91 | 960,onnx2code-im2col,14.558563390000003,1.1082483519696646 92 | 960,tensorflow,6.5112397500000005,1.460229401409055 93 | 960,onnxruntime,8.19947209,0.6584829936922305 94 | 992,onnx2code-conv-naive,5.120063300000001,0.544389909767209 95 | 992,onnx2code-im2col,15.694976389999997,1.2412264189912563 96 | 992,tensorflow,6.476902399999999,0.6904489808260419 97 | 992,onnxruntime,8.69421666,0.6245109155897632 98 | 1024,onnx2code-conv-naive,5.590155300000001,0.6496297329966895 99 | 1024,onnx2code-im2col,17.33253985,1.077605242483038 100 | 1024,tensorflow,6.79198762,0.5675002324853758 101 | 1024,onnxruntime,9.496853179999999,1.333272052480006 102 | 1056,onnx2code-conv-naive,5.772926100000001,0.6658795291482311 103 | 1056,onnx2code-im2col,17.946812150000003,1.2365768010534273 104 | 1056,tensorflow,7.02846319,0.6944497910994529 105 | 1056,onnxruntime,9.9349942,0.6898595990707819 106 | 1088,onnx2code-conv-naive,6.290629489999999,0.8784981131715252 107 | 1088,onnx2code-im2col,19.00194285,1.3439142948857516 108 | 1088,tensorflow,7.45540793,0.4714124189516067 109 | 1088,onnxruntime,10.528197050000001,0.7615252199567178 110 | 1120,onnx2code-conv-naive,6.617438369999999,0.9501307714331607 111 | 1120,onnx2code-im2col,20.147991229999995,1.2449445515221622 112 | 1120,tensorflow,7.864878350000001,0.665257063734995 113 | 1120,onnxruntime,11.101594520000003,0.8159476296610155 114 | 1152,onnx2code-conv-naive,6.831439400000001,0.6716892841138081 115 | 1152,onnx2code-im2col,21.205939569999995,1.212284878937721 116 | 1152,tensorflow,7.92708685,0.46568353623677466 117 | 1152,onnxruntime,11.954561950000002,0.6959551411048039 118 | 1184,onnx2code-conv-naive,7.328971719999998,0.8108132758963938 119 | 1184,onnx2code-im2col,22.756164560000002,1.4384047279126229 120 | 1184,tensorflow,8.509471740000002,0.5682869313954813 121 | 1184,onnxruntime,12.447344249999999,0.8514161123153985 122 | 1216,onnx2code-conv-naive,7.66072625,0.8296190286507341 123 | 1216,onnx2code-im2col,23.729592410000006,1.2839267518061153 124 | 1216,tensorflow,9.0151338,0.7025805359415104 125 | 1216,onnxruntime,13.358221110000004,0.9357839590492228 126 | 1248,onnx2code-conv-naive,7.97544042,1.0038998031313302 127 | 
1248,onnx2code-im2col,25.231953579999995,1.592579618776789 128 | 1248,tensorflow,9.393306319999999,0.6601032833059064 129 | 1248,onnxruntime,14.070093029999999,0.8027447679136805 130 | -------------------------------------------------------------------------------- /onnx2code/ops/conv.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | 3 | import numpy as np 4 | 5 | from onnx2code.ops.gemm_tiling.GEMM import call_GEMM, external_paths_GEMM 6 | from onnx2code.util import ( 7 | compute_strides, 8 | get_attribute, 9 | resolve_padding_attribute, 10 | resolve_stride_attribute, 11 | ) 12 | 13 | from .operation import OpCall, Operation, OpImpl 14 | 15 | 16 | class Conv(Operation): 17 | """ 18 | Conv operator 19 | 20 | Only 2D convolutions are supported 21 | 22 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#conv 23 | """ 24 | 25 | node_types = {"Conv"} 26 | 27 | def parse(self) -> None: 28 | assert ( 29 | len(self.inputs) == 2 or len(self.inputs) == 3 30 | ), "expected two or three inputs" 31 | assert len(self.outputs) == 1, "expected one output" 32 | 33 | group = get_attribute(self.node, "group", 1) 34 | if group != 1: 35 | raise NotImplementedError("depthwise is not supported (group != 1)") 36 | 37 | self.X = self.inputs[0] 38 | self.W = self.inputs[1] 39 | self.B = self.inputs[2] if len(self.inputs) == 3 else None 40 | self.Y = self.outputs[0] 41 | 42 | self.dilations = get_attribute(self.node, "dilations", [1] * 2) 43 | self.strides = resolve_stride_attribute(self.node) 44 | self.pads = resolve_padding_attribute(self.node, self.X.shape, self.W.shape) 45 | 46 | def call(self) -> OpCall: 47 | return OpCall( 48 | sig_name="Conv", 49 | sig_params=[self.X.shape, self.W.shape, self.strides, self.pads], 50 | inputs=self.inputs, 51 | outputs=self.outputs, 52 | input_names=("X", "W", "B"), 53 | ) 54 | 55 | 56 | @Conv.variant(["c", "conv-naive"], priority=1) 57 | class ConvC(Conv): 58 | def impl(self) -> OpImpl: 59 | # onnx is NCHW 60 | # N = self.X.shape[0] 61 | # C = self.X.shape[1] 62 | H = self.X.shape[2] 63 | W = self.X.shape[3] 64 | F = self.W.shape[0] # filters 65 | KC = self.W.shape[1] 66 | KH = self.W.shape[2] 67 | KW = self.W.shape[3] 68 | 69 | pads_start = [self.pads[0], self.pads[1]] 70 | # pads_end = [self.pads[2], self.pads[3]] 71 | 72 | input_strides = compute_strides(self.X.shape) 73 | output_strides = compute_strides(self.Y.shape) 74 | kernel_strides = compute_strides(self.W.shape) 75 | 76 | source = "" 77 | 78 | source += f""" 79 | for(int f = 0; f < {F}; f++) {{ 80 | // start position of kernel 81 | for(int h = 0; h < {self.Y.shape[2]}; h++) {{ 82 | for(int w = 0; w < {self.Y.shape[3]}; w++) {{ 83 | float accum = {"0.0f" if self.B is None else "B[f]" }; 84 | 85 | // position in kernel 86 | for(int cc = 0; cc < {KC}; cc++) {{ 87 | for(int hh = 0; hh < {KH}; hh++) {{ 88 | for(int ww = 0; ww < {KW}; ww++) {{ 89 | const int ih = {-pads_start[0]} + (h * {self.strides[0]}) + hh; 90 | const int iw = {-pads_start[1]} + (w * {self.strides[1]}) + ww; 91 | if(ih >= 0 && ih < {H} && iw >= 0 && iw < {W}) {{ 92 | accum += X[ 93 | cc * {input_strides[1]} + 94 | ih * {input_strides[2]} + 95 | iw * {input_strides[3]} 96 | ] * W[ 97 | f * {kernel_strides[0]} + 98 | cc * {kernel_strides[1]} + 99 | hh * {kernel_strides[2]} + 100 | ww * {kernel_strides[3]} 101 | ]; 102 | }} 103 | }} 104 | }} 105 | }} 106 | 107 | OUT[ 108 | f * {output_strides[1]} + 109 | h * {output_strides[2]} + 110 | w * {output_strides[3]} 111 | ] = accum; 112 | 
}} 113 | }} 114 | }} 115 | """ 116 | 117 | return OpImpl(lang="c", source=source) 118 | 119 | 120 | @Conv.variant(["im2col", "loop-tiling"], priority=0) 121 | class ConvIm2col(Conv): 122 | def impl(self) -> OpImpl: 123 | input_shape = self.X.shape 124 | weight_shape = self.W.shape 125 | has_bias = self.B is not None 126 | pads, dilations, strides = self.pads, self.dilations, self.strides 127 | 128 | assert len(pads) == 4 or np.allclose( 129 | pads, 0 130 | ), "expected padding only in two dimensions" 131 | 132 | # onnx is NCHW 133 | N = input_shape[0] 134 | C = input_shape[1] 135 | H = input_shape[2] 136 | W = input_shape[3] 137 | F = weight_shape[0] # filters 138 | KC = weight_shape[1] 139 | KH = weight_shape[2] 140 | KW = weight_shape[3] 141 | 142 | input_strides = compute_strides(input_shape) 143 | kernel_strides = compute_strides([KC, KH, KW]) 144 | pads_start = [pads[0], pads[1]] 145 | pads_end = [pads[2], pads[3]] 146 | patch_stride = KC * KH * KW 147 | num_patches = ceil( 148 | (H - KH + 1 + pads_start[0] + pads_end[0] - (dilations[0] - 1) * (KH - 1)) 149 | / strides[0] 150 | ) * ceil( 151 | (W - KW + 1 + pads_start[1] + pads_end[1] - (dilations[1] - 1) * (KW - 1)) 152 | / strides[0] 153 | ) 154 | im2col_shape = [patch_stride, num_patches] 155 | 156 | bias_code = ( 157 | f""" 158 | // bias 159 | for (int f = 0; f < {F}; f++) {{ 160 | for (int i = 0; i < {num_patches}; i++) {{ 161 | OUT[f * {num_patches} + i] += B[f]; 162 | }} 163 | }} 164 | """ 165 | if has_bias 166 | else "" 167 | ) 168 | 169 | _N = F # weight_shape[0] 170 | _M = patch_stride # weight_shape[1] 171 | _K = im2col_shape[1] 172 | 173 | source = f""" 174 | // padding, dilations, strides 175 | // im2col 176 | // float im2col[{np.prod(im2col_shape)}]; 177 | int patch = 0; 178 | for(int c = 0; c < {C - KC + 1}; c++) {{ 179 | for(int h = {-pads_start[0]}; h < {H - KH + 1 + pads_end[0] - (dilations[0] - 1) * (KH - 1)}; h += {strides[0]}) {{ 180 | for(int w = {-pads_start[1]}; w < {W - KW + 1 + pads_end[1] - (dilations[1] - 1) * (KW - 1)}; w += {strides[1]}) {{ 181 | // copy patch 182 | for(int cc = 0; cc < {KC}; cc++) {{ 183 | for(int hh = 0; hh < {KH}; hh++) {{ 184 | for(int ww = 0; ww < {KW}; ww++) {{ 185 | const int ih = h + hh * {dilations[0]}; 186 | const int iw = w + ww * {dilations[1]}; 187 | float value; 188 | if(ih < 0 || ih >= {H} || iw < 0 || iw >= {W}) {{ 189 | value = 0.0f; 190 | }} else {{ 191 | value = X[ 192 | (c + cc) * {input_strides[1]} + 193 | ih * {input_strides[2]} + 194 | iw * {input_strides[3]} 195 | ]; 196 | }} 197 | im2col[ 198 | (cc * {kernel_strides[0]} + 199 | hh * {kernel_strides[1]} + 200 | ww * {kernel_strides[2]}) * {num_patches} + 201 | patch 202 | ] = value; 203 | }} 204 | }} 205 | }} 206 | patch++; 207 | }} 208 | }} 209 | }} 210 | // gemm ({self.Y.shape}) 211 | for(int row = 0; row < {_N}; row++) {{ 212 | for(int col = 0; col < {_K}; col++) {{ 213 | float sum = 0; 214 | for(int i = 0; i < {_M}; i++) {{ 215 | sum += W[row * {_M} + i] * im2col[i * {_K} + col]; 216 | }} 217 | OUT[row * {_K} + col] = sum; 218 | }} 219 | }} 220 | //{call_GEMM(_N, _M, _K,"W, im2col, OUT")} 221 | {bias_code} 222 | """ 223 | 224 | return OpImpl(lang="c", source=source, external_paths=external_paths_GEMM) 225 | -------------------------------------------------------------------------------- /evaluation/results_gemm/10th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-gemm-naive,3.032503173333333,0.3963114916526603 3 | 
256,onnx2code-loop-tiling,0.6197000233333334,0.10369838811603 4 | 256,onnx2code-libxsmm,0.7196239033333334,0.10866221183291207 5 | 256,tensorflow,0.8746342333333333,0.1925181370934322 6 | 256,onnxruntime,0.20683064666666667,0.03900188116251773 7 | 288,onnx2code-gemm-naive,4.024115666666667,0.5194440929148348 8 | 288,onnx2code-loop-tiling,0.80517784,0.13400563886996597 9 | 288,onnx2code-libxsmm,0.93845815,0.14355573278716818 10 | 288,tensorflow,0.9673922599999999,0.1858061522799655 11 | 288,onnxruntime,0.2888854766666667,0.024172225820890434 12 | 320,onnx2code-gemm-naive,6.331535613333334,0.5509802762758547 13 | 320,onnx2code-loop-tiling,0.9538450333333333,0.13810027369501804 14 | 320,onnx2code-libxsmm,1.2905169033333335,0.16415241675008624 15 | 320,tensorflow,1.2187085466666667,0.2594019197373344 16 | 320,onnxruntime,0.3680314866666667,0.057412645192644524 17 | 352,onnx2code-gemm-naive,7.8856523366666655,0.6560213203748767 18 | 352,onnx2code-loop-tiling,1.27658195,0.20538526113336575 19 | 352,onnx2code-libxsmm,1.7584370266666671,0.21596043915740515 20 | 352,tensorflow,1.4465645866666668,0.221910820228568 21 | 352,onnxruntime,0.46340960333333325,0.11418972732270034 22 | 384,onnx2code-gemm-naive,11.550311796666666,1.0403060429752689 23 | 384,onnx2code-loop-tiling,1.5058879933333333,0.20728834380472683 24 | 384,onnx2code-libxsmm,2.2619934833333333,0.32138281326548307 25 | 384,tensorflow,1.76205783,0.22727102523365014 26 | 384,onnxruntime,0.6222279099999999,0.04984593144796507 27 | 416,onnx2code-gemm-naive,14.391191236666668,0.7851056160948203 28 | 416,onnx2code-loop-tiling,1.8807214333333335,0.23426841520328534 29 | 416,onnx2code-libxsmm,2.9243486800000005,0.39641411833061346 30 | 416,tensorflow,2.1911857866666664,0.26867450219499395 31 | 416,onnxruntime,0.78398472,0.07031717088934679 32 | 448,onnx2code-gemm-naive,18.20398146333333,0.9198346380517328 33 | 448,onnx2code-loop-tiling,2.31714408,0.25164997428852165 34 | 448,onnx2code-libxsmm,3.5859564033333338,0.4675003390828581 35 | 448,tensorflow,2.68536728,0.27450274408191555 36 | 448,onnxruntime,1.0722129066666668,0.05506651126305917 37 | 480,onnx2code-gemm-naive,22.210734633333335,0.8697894314733723 38 | 480,onnx2code-loop-tiling,2.82071765,0.3375718366785962 39 | 480,onnx2code-libxsmm,4.63586339,0.525838292893875 40 | 480,tensorflow,3.28927851,0.3330544670676804 41 | 480,onnxruntime,1.12585362,0.2527811935213976 42 | 512,onnx2code-gemm-naive,37.82177006333333,1.7695390811710392 43 | 512,onnx2code-loop-tiling,3.387950926666667,0.36442807124270943 44 | 512,onnx2code-libxsmm,7.333030716666667,0.6106197815637948 45 | 512,tensorflow,3.90125637,0.37029067510700264 46 | 512,onnxruntime,1.392510446666667,0.33206850390999215 47 | 544,onnx2code-gemm-naive,32.94581162666667,1.4858811647637689 48 | 544,onnx2code-loop-tiling,4.131661676666667,0.656923921812614 49 | 544,onnx2code-libxsmm,6.612609873333334,0.5843372144030853 50 | 544,tensorflow,4.522499783333333,0.39981151880847937 51 | 544,onnxruntime,1.8869457600000001,0.10023161830857434 52 | 576,onnx2code-gemm-naive,38.92624099333334,1.3423770376674533 53 | 576,onnx2code-loop-tiling,4.74182933,0.5772705808930342 54 | 576,onnx2code-libxsmm,7.704783719999999,0.9048697276132155 55 | 576,tensorflow,5.206054753333333,0.3989466387608822 56 | 576,onnxruntime,2.1019028766666663,0.38584045712099896 57 | 608,onnx2code-gemm-naive,45.28636947333334,1.2418235486734779 58 | 608,onnx2code-loop-tiling,5.535877246666667,0.68465347702584 59 | 608,onnx2code-libxsmm,9.329710803333334,0.6821022386722522 60 | 
608,tensorflow,6.038077293333333,0.4066848204053199 61 | 608,onnxruntime,2.3380995933333333,0.5150944124831821 62 | 640,onnx2code-gemm-naive,53.979821606666675,1.5732381435772353 63 | 640,onnx2code-loop-tiling,6.528255413333333,0.7382412954780949 64 | 640,onnx2code-libxsmm,11.105261486666668,0.6702049593273811 65 | 640,tensorflow,6.867506293333332,0.33479984655450573 66 | 640,onnxruntime,2.8644599933333335,0.44012363370755264 67 | 672,onnx2code-gemm-naive,61.797607606666666,2.0206490238507837 68 | 672,onnx2code-loop-tiling,7.335917296666667,0.48413999592309587 69 | 672,onnx2code-libxsmm,12.526532143333334,0.7823617295629833 70 | 672,tensorflow,7.629762806666666,0.3806301394771867 71 | 672,onnxruntime,3.3707401600000004,0.4525826643422036 72 | 704,onnx2code-gemm-naive,70.44579822666667,1.8285376918963692 73 | 704,onnx2code-loop-tiling,8.239208566666667,0.689555869604747 74 | 704,onnx2code-libxsmm,14.119220446666665,0.845159265942012 75 | 704,tensorflow,8.895071413333334,0.34419289040192147 76 | 704,onnxruntime,3.5427747,0.7727582562802354 77 | 736,onnx2code-gemm-naive,79.81430747333333,1.7413606458959807 78 | 736,onnx2code-loop-tiling,9.634022176666667,0.6114007260691514 79 | 736,onnx2code-libxsmm,16.302030413333334,1.007909693988049 80 | 736,tensorflow,9.830212666666666,0.4567337411348858 81 | 736,onnxruntime,3.7305446866666663,1.0182194903883912 82 | 768,onnx2code-gemm-naive,122.00736332333331,2.1067141418493156 83 | 768,onnx2code-loop-tiling,10.923692083333334,0.5637555127495397 84 | 768,onnx2code-libxsmm,25.253630956666665,2.3042468585120885 85 | 768,tensorflow,11.239431676666667,0.4128013705502185 86 | 768,onnxruntime,5.13070866,0.47881221418431297 87 | 800,onnx2code-gemm-naive,104.17442842,1.6762800916184926 88 | 800,onnx2code-loop-tiling,12.447658143333335,1.4992577998217194 89 | 800,onnx2code-libxsmm,21.749256313333333,1.0426478679011155 90 | 800,tensorflow,12.379860203333335,0.7496123154140425 91 | 800,onnxruntime,5.28670129,1.1996524549301877 92 | 832,onnx2code-gemm-naive,116.08897560333335,2.0299157631398157 93 | 832,onnx2code-loop-tiling,13.754875326666667,0.6784085029189272 94 | 832,onnx2code-libxsmm,24.601902156666664,2.062709708078124 95 | 832,tensorflow,13.695079760000002,0.3706590754333004 96 | 832,onnxruntime,6.141160866666667,1.1566683448185664 97 | 864,onnx2code-gemm-naive,132.48205046333334,3.008030542358048 98 | 864,onnx2code-loop-tiling,15.861842113333331,0.821118800358304 99 | 864,onnx2code-libxsmm,26.891542533333336,1.554060923792188 100 | 864,tensorflow,15.222112973333333,0.6294818808192755 101 | 864,onnxruntime,6.738747506666667,1.3725968574633352 102 | 896,onnx2code-gemm-naive,152.92541574333333,1.9424146789867993 103 | 896,onnx2code-loop-tiling,17.463121376666663,1.4855081415663267 104 | 896,onnx2code-libxsmm,31.479977646666665,1.7297347422679645 105 | 896,tensorflow,16.688846016666666,1.38853294476488 106 | 896,onnxruntime,7.102040583333334,1.7228627994158145 107 | 928,onnx2code-gemm-naive,161.55684296666666,1.4504057033650344 108 | 928,onnx2code-loop-tiling,19.451436123333334,1.7354361387323423 109 | 928,onnx2code-libxsmm,33.236639976666666,1.796100522607714 110 | 928,tensorflow,18.249888,0.45181684454396637 111 | 928,onnxruntime,8.673702366666667,1.3608428073642937 112 | 960,onnx2code-gemm-naive,179.54494265,2.20539800179465 113 | 960,onnx2code-loop-tiling,21.493540293333332,0.8316535835294889 114 | 960,onnx2code-libxsmm,37.78634877,1.8713101890560713 115 | 960,tensorflow,20.094944476666665,1.540206518013887 116 | 
960,onnxruntime,9.514956133333333,1.5622842941740966 117 | 992,onnx2code-gemm-naive,199.19028681,2.460440855555875 118 | 992,onnx2code-loop-tiling,22.859786763333336,1.1154943400971729 119 | 992,onnx2code-libxsmm,40.267736176666666,1.4405508791241584 120 | 992,tensorflow,22.12998395,1.6512399342279749 121 | 992,onnxruntime,9.939381456666666,2.22546314880591 122 | 1024,onnx2code-gemm-naive,302.44768983999995,16.24694655356355 123 | 1024,onnx2code-loop-tiling,25.54218734,0.7281526073349307 124 | 1024,onnx2code-libxsmm,71.78343102333332,2.9600656701989223 125 | 1024,tensorflow,24.584593536666667,0.8282364742632156 126 | 1024,onnxruntime,11.35924066,2.136863288270128 127 | 1056,onnx2code-gemm-naive,243.8744718566667,2.4895748810929565 128 | 1056,onnx2code-loop-tiling,28.703636126666666,0.8993973898595125 129 | 1056,onnx2code-libxsmm,56.09443612333334,2.1420622768167505 130 | 1056,tensorflow,26.570579916666667,0.6059130481320978 131 | 1056,onnxruntime,12.543689193333334,2.1933724458049437 132 | 1088,onnx2code-gemm-naive,266.70388051333333,2.330864442781358 133 | 1088,onnx2code-loop-tiling,30.41403294,0.8988800675524125 134 | 1088,onnx2code-libxsmm,58.41972651333332,2.797254098747936 135 | 1088,tensorflow,29.04282031666667,1.0215327865643453 136 | 1088,onnxruntime,14.144601546666665,2.299014144461868 137 | 1120,onnx2code-gemm-naive,289.6749728433333,4.023731298223004 138 | 1120,onnx2code-loop-tiling,34.07599961333333,0.8902777108117541 139 | 1120,onnx2code-libxsmm,65.24443668333333,3.581385816881316 140 | 1120,tensorflow,31.45872342,0.5698554580779852 141 | 1120,onnxruntime,15.405860513333334,1.828299272079097 142 | 1152,onnx2code-gemm-naive,343.24906158666664,3.7112365451323703 143 | 1152,onnx2code-loop-tiling,36.349583179999996,0.9711783836167112 144 | 1152,onnx2code-libxsmm,91.12284834666666,2.9376563991947924 145 | 1152,tensorflow,34.17130621333334,0.7050820407134677 146 | 1152,onnxruntime,15.90146292,3.121942005048774 147 | 1184,onnx2code-gemm-naive,348.46309920333334,4.062900327154884 148 | 1184,onnx2code-loop-tiling,38.731178416666665,1.2522460082867726 149 | 1184,onnx2code-libxsmm,84.33077342666667,3.665204177579058 150 | 1184,tensorflow,37.23460804,1.1237919123701319 151 | 1184,onnxruntime,17.54617040666667,3.289721269978182 152 | 1216,onnx2code-gemm-naive,397.16544792,14.727324074322256 153 | 1216,onnx2code-loop-tiling,43.77018736,0.9206742296714242 154 | 1216,onnx2code-libxsmm,107.27316728666668,2.795479668821692 155 | 1216,tensorflow,40.17801400333334,0.6536839254514244 156 | 1216,onnxruntime,19.293283923333334,3.392666877767465 157 | 1248,onnx2code-gemm-naive,410.9592286366667,6.704456245496791 158 | 1248,onnx2code-loop-tiling,45.56106598333333,0.9108245501411985 159 | 1248,onnx2code-libxsmm,108.07985238333332,2.3859204441571227 160 | 1248,tensorflow,42.84209676333333,0.6350184200923578 161 | 1248,onnxruntime,19.562509316666663,4.389890480237362 162 | -------------------------------------------------------------------------------- /evaluation/results_gemm/6th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-gemm-naive,2.93099801,0.21223372618581754 3 | 256,onnx2code-loop-tiling,0.6115743366666666,0.06599749912481524 4 | 256,onnx2code-libxsmm,0.6862000033333334,0.09256254610444041 5 | 256,tensorflow,0.83879467,0.2111202157956325 6 | 256,onnxruntime,0.17502199999999998,0.042453192844198025 7 | 288,onnx2code-gemm-naive,4.013051343333333,0.1922353570042171 8 | 
288,onnx2code-loop-tiling,0.8613543366666666,0.08961090262274632 9 | 288,onnx2code-libxsmm,0.9549310066666667,0.115530932781052 10 | 288,tensorflow,0.9110693400000001,0.15713004746802695 11 | 288,onnxruntime,0.242907,0.06730584212632165 12 | 320,onnx2code-gemm-naive,5.73331939,0.27483905993038527 13 | 320,onnx2code-loop-tiling,1.1365396766666667,0.13217515031366067 14 | 320,onnx2code-libxsmm,1.2718516766666665,0.13979089496648958 15 | 320,tensorflow,1.1791876766666667,0.20658304444656664 16 | 320,onnxruntime,0.3266730033333333,0.05591295871724779 17 | 352,onnx2code-gemm-naive,7.659213810000001,0.31697362694935244 18 | 352,onnx2code-loop-tiling,1.4359520333333333,0.13807107627317253 19 | 352,onnx2code-libxsmm,1.7002643733333334,0.19519460629189345 20 | 352,tensorflow,1.4036383666666665,0.27107338265993375 21 | 352,onnxruntime,0.4308296766666666,0.09773607186690145 22 | 384,onnx2code-gemm-naive,11.107058206666666,1.4121811533334494 23 | 384,onnx2code-loop-tiling,1.8372090099999998,0.20677005859850672 24 | 384,onnx2code-libxsmm,2.311948676666667,0.25727827390834773 25 | 384,tensorflow,1.6829836733333334,0.2696682622089221 26 | 384,onnxruntime,0.538148,0.11062774047528344 27 | 416,onnx2code-gemm-naive,14.297038139999998,2.729676676434938 28 | 416,onnx2code-loop-tiling,2.4032359733333326,0.43364645287365833 29 | 416,onnx2code-libxsmm,2.9514771200000003,0.3241932617549172 30 | 416,tensorflow,2.00626849,0.322824547564065 31 | 416,onnxruntime,0.6981953366666667,0.14003255632503223 32 | 448,onnx2code-gemm-naive,17.374831833333335,0.6144123549041031 33 | 448,onnx2code-loop-tiling,2.872743363333333,0.2822886150100956 34 | 448,onnx2code-libxsmm,3.7628733766666667,0.41442373074013844 35 | 448,tensorflow,2.3588540266666667,0.3072846262026125 36 | 448,onnxruntime,0.8693563433333332,0.1435663590984048 37 | 480,onnx2code-gemm-naive,21.50225923,0.7819840128890342 38 | 480,onnx2code-loop-tiling,3.50825207,0.31527052039920433 39 | 480,onnx2code-libxsmm,4.428268086666667,0.42702809932715097 40 | 480,tensorflow,2.62927905,0.3265735363913425 41 | 480,onnxruntime,1.0414200200000001,0.19804321140402564 42 | 512,onnx2code-gemm-naive,34.906725013333336,2.4372404714513394 43 | 512,onnx2code-loop-tiling,4.392616076666666,0.34323584249067596 44 | 512,onnx2code-libxsmm,6.58534778,0.5006035942198693 45 | 512,tensorflow,3.917376396666667,0.6595506638626123 46 | 512,onnxruntime,1.2402080100000001,0.16426971820915917 47 | 544,onnx2code-gemm-naive,30.517709726666666,1.4655694722498689 48 | 544,onnx2code-loop-tiling,5.456461906666667,0.47872229454143655 49 | 544,onnx2code-libxsmm,6.595671356666667,0.6647671986123534 50 | 544,tensorflow,3.88902766,0.46820230109173716 51 | 544,onnxruntime,1.482888666666667,0.17757750778619336 52 | 576,onnx2code-gemm-naive,36.852594706666665,2.328737123134206 53 | 576,onnx2code-loop-tiling,6.3206076200000005,0.4519496564714359 54 | 576,onnx2code-libxsmm,7.905415273333334,0.6909014338298326 55 | 576,tensorflow,4.415751966666667,0.5194307622421865 56 | 576,onnxruntime,1.73132532,0.1933088669553924 57 | 608,onnx2code-gemm-naive,42.66949377666666,1.2014143562159671 58 | 608,onnx2code-loop-tiling,7.304635963333335,0.7251682138322634 59 | 608,onnx2code-libxsmm,9.214234646666666,0.778034726474529 60 | 608,tensorflow,5.144455656666667,0.5631055999529266 61 | 608,onnxruntime,2.047746663333333,0.1852386781655944 62 | 640,onnx2code-gemm-naive,55.89334453333334,2.273516001627766 63 | 640,onnx2code-loop-tiling,8.369001973333333,0.5048504896278362 64 | 
640,onnx2code-libxsmm,11.598928710000001,0.9318555946571332 65 | 640,tensorflow,6.01065204,0.7255272663531527 66 | 640,onnxruntime,2.3855636833333334,0.2919221096476188 67 | 672,onnx2code-gemm-naive,58.554538380000004,2.8564249775566233 68 | 672,onnx2code-loop-tiling,9.613490160000001,0.6311975762601341 69 | 672,onnx2code-libxsmm,13.277355443333333,1.5214488973740832 70 | 672,tensorflow,6.44044367,0.6637398145560762 71 | 672,onnxruntime,2.745889,0.3140661292026463 72 | 704,onnx2code-gemm-naive,67.34064747000001,2.9076649778401698 73 | 704,onnx2code-loop-tiling,11.017596743333334,0.6963415985621837 74 | 704,onnx2code-libxsmm,14.665297196666664,1.2571258474802771 75 | 704,tensorflow,7.532525746666667,0.8175024675900715 76 | 704,onnxruntime,3.1209063366666667,0.30571192055678276 77 | 736,onnx2code-gemm-naive,77.81776146333333,3.994680527831363 78 | 736,onnx2code-loop-tiling,12.602153343333333,0.7199006589244326 79 | 736,onnx2code-libxsmm,17.260685663333334,1.3552461221698675 80 | 736,tensorflow,8.18517155,0.8955714304885909 81 | 736,onnxruntime,3.4846302700000003,0.29393547805562775 82 | 768,onnx2code-gemm-naive,114.22778595999999,8.203709334359631 83 | 768,onnx2code-loop-tiling,14.643412720000002,0.770094425828583 84 | 768,onnx2code-libxsmm,21.88221372,2.3626316578265025 85 | 768,tensorflow,9.402519983333335,1.0571307250323347 86 | 768,onnxruntime,3.9847046133333333,0.35216399742038346 87 | 800,onnx2code-gemm-naive,102.52447922666666,4.841998191757737 88 | 800,onnx2code-loop-tiling,16.721700436666666,0.7321350723210706 89 | 800,onnx2code-libxsmm,22.56168299333333,1.5385892624864095 90 | 800,tensorflow,10.610878073333334,1.2925829745715158 91 | 800,onnxruntime,4.526122559999999,0.394388227185878 92 | 832,onnx2code-gemm-naive,118.56218002333335,3.939953692180838 93 | 832,onnx2code-loop-tiling,18.922631803333335,1.4425692216179269 94 | 832,onnx2code-libxsmm,25.1037305,3.1527200230313914 95 | 832,tensorflow,11.518043606666668,1.147024838134664 96 | 832,onnxruntime,5.080414686666666,0.47586088191467807 97 | 864,onnx2code-gemm-naive,130.09689659666668,5.189265473084857 98 | 864,onnx2code-loop-tiling,20.914084600000002,0.9556391532755657 99 | 864,onnx2code-libxsmm,29.258843553333328,1.9307758241843858 100 | 864,tensorflow,13.143729653333333,2.0231548745389447 101 | 864,onnxruntime,5.760477336666666,1.477134492618634 102 | 896,onnx2code-gemm-naive,168.89975199666668,5.312927351664043 103 | 896,onnx2code-loop-tiling,23.185644933333332,1.0045551482742974 104 | 896,onnx2code-libxsmm,34.172522423333334,1.9982843256822413 105 | 896,tensorflow,14.184290020000002,1.8862201389964413 106 | 896,onnxruntime,6.194402336666667,0.5051867525439248 107 | 928,onnx2code-gemm-naive,165.13455939666667,3.6329771619842033 108 | 928,onnx2code-loop-tiling,25.830488803333335,1.6850606511951327 109 | 928,onnx2code-libxsmm,37.66786822666667,3.800566116722251 110 | 928,tensorflow,15.979504666666665,1.5224369501662662 111 | 928,onnxruntime,6.822730663333334,0.8137015051617086 112 | 960,onnx2code-gemm-naive,183.91971043000004,3.7863765477388824 113 | 960,onnx2code-loop-tiling,27.901448323333334,1.140211974885468 114 | 960,onnx2code-libxsmm,40.32194981999999,4.608950302465135 115 | 960,tensorflow,17.55200400333333,3.1222178969760126 116 | 960,onnxruntime,7.582337316666665,1.3798306560698488 117 | 992,onnx2code-gemm-naive,202.18115277666666,4.9461951688977575 118 | 992,onnx2code-loop-tiling,30.831881576666667,1.2410191207267294 119 | 992,onnx2code-libxsmm,46.472281063333334,3.5206640553037913 120 | 
992,tensorflow,19.609973956666668,1.9594430432812078 121 | 992,onnxruntime,8.354810469999999,1.224331820426381 122 | 1024,onnx2code-gemm-naive,311.7934103333333,6.877924793102408 123 | 1024,onnx2code-loop-tiling,34.00422953666667,1.65053201542282 124 | 1024,onnx2code-libxsmm,56.044168510000006,4.313170511153236 125 | 1024,tensorflow,21.53622228,3.0112313016520313 126 | 1024,onnxruntime,9.185121386666667,1.5104097708390252 127 | 1056,onnx2code-gemm-naive,254.14210583000002,6.641041915904256 128 | 1056,onnx2code-loop-tiling,38.359650099999996,1.305478721310599 129 | 1056,onnx2code-libxsmm,59.21067334000001,4.997676490874754 130 | 1056,tensorflow,22.382056223333333,2.0500099528156066 131 | 1056,onnxruntime,9.777039993333332,0.7107724979235902 132 | 1088,onnx2code-gemm-naive,292.2926318666667,9.702008574620628 133 | 1088,onnx2code-loop-tiling,42.027352369999996,1.749770538149051 134 | 1088,onnx2code-libxsmm,67.41834954333333,4.8664865627514065 135 | 1088,tensorflow,26.594176163333334,2.0905433452247073 136 | 1088,onnxruntime,10.497412073333331,0.8809854735765934 137 | 1120,onnx2code-gemm-naive,338.5233844933333,10.164223333108087 138 | 1120,onnx2code-loop-tiling,45.16896703,2.3841331282466762 139 | 1120,onnx2code-libxsmm,76.79927917333333,5.7278331098052515 140 | 1120,tensorflow,28.83353773333333,2.563366119458253 141 | 1120,onnxruntime,11.659171516666666,1.8744377435576556 142 | 1152,onnx2code-gemm-naive,403.66976146333326,9.52804313250809 143 | 1152,onnx2code-loop-tiling,49.305075323333334,3.73440690467644 144 | 1152,onnx2code-libxsmm,86.43004374333333,6.511510521143248 145 | 1152,tensorflow,30.32093784666667,2.2242678975026324 146 | 1152,onnxruntime,12.255513253333332,0.9498209524135391 147 | 1184,onnx2code-gemm-naive,416.8149896066667,16.73399445325064 148 | 1184,onnx2code-loop-tiling,53.818254503333335,4.412354834290176 149 | 1184,onnx2code-libxsmm,95.98643558,6.660626441719355 150 | 1184,tensorflow,32.997734083333334,3.954762346023894 151 | 1184,onnxruntime,13.61072438,0.9931182019533201 152 | 1216,onnx2code-gemm-naive,449.9982161666666,11.71757127478388 153 | 1216,onnx2code-loop-tiling,57.79504892,3.78510541217508 154 | 1216,onnx2code-libxsmm,113.02410599,6.211236515543298 155 | 1216,tensorflow,33.57050304333334,3.4069599306920897 156 | 1216,onnxruntime,14.462370676666664,1.1351487701683944 157 | 1248,onnx2code-gemm-naive,533.9181180966667,19.01682602485039 158 | 1248,onnx2code-loop-tiling,61.468259516666656,2.3679392876943126 159 | 1248,onnx2code-libxsmm,128.34663120999997,6.3470097444951215 160 | 1248,tensorflow,36.713297913333335,2.8748869668129955 161 | 1248,onnxruntime,15.35723634,1.0219832136925788 162 | -------------------------------------------------------------------------------- /evaluation/results_gemm.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-gemm-naive,3.3703890066666666,0.26234957107102974 3 | 256,onnx2code-loop-tiling,0.4769373333333333,0.11200040330086118 4 | 256,onnx2code-libxsmm,0.7685216666666665,0.1347120594597562 5 | 256,tensorflow,0.8927413366666667,0.21565150082934734 6 | 256,onnxruntime,0.19556233333333334,0.10832180150469352 7 | 288,onnx2code-gemm-naive,4.213650009999999,0.2412096531602261 8 | 288,onnx2code-loop-tiling,0.699732,0.1757824672030746 9 | 288,onnx2code-libxsmm,1.0847346666666666,0.1939209450563697 10 | 288,tensorflow,0.9772826666666666,0.22258221978905282 11 | 288,onnxruntime,0.26156667,0.13187345940122966 12 | 
320,onnx2code-gemm-naive,6.044232376666667,0.3223155900942348 13 | 320,onnx2code-loop-tiling,0.9018526733333332,0.21311764860484914 14 | 320,onnx2code-libxsmm,1.38776568,0.2599831027845546 15 | 320,tensorflow,1.24392801,0.27125448357114745 16 | 320,onnxruntime,0.40802100333333335,0.2441359848681263 17 | 352,onnx2code-gemm-naive,7.893988396666667,0.367508493608097 18 | 352,onnx2code-loop-tiling,1.3560120133333333,0.7348494896233467 19 | 352,onnx2code-libxsmm,1.79440235,0.2837481352934221 20 | 352,tensorflow,1.49494068,0.3057390412554759 21 | 352,onnxruntime,0.46045267000000006,0.20686798562821274 22 | 384,onnx2code-gemm-naive,11.524608453333334,0.5201314323649308 23 | 384,onnx2code-loop-tiling,1.4596343366666666,0.2841688073669065 24 | 384,onnx2code-libxsmm,2.4336043433333336,0.32883703427758676 25 | 384,tensorflow,1.72980867,0.33966093636865935 26 | 384,onnxruntime,0.5861753366666667,0.1912006903240414 27 | 416,onnx2code-gemm-naive,14.006902730000002,0.48876344668009947 28 | 416,onnx2code-loop-tiling,1.830579023333333,0.33124209183315273 29 | 416,onnx2code-libxsmm,3.1111713733333333,0.37648462074904726 30 | 416,tensorflow,2.12371503,0.3163907697054321 31 | 416,onnxruntime,0.7455333433333334,0.25015353024405923 32 | 448,onnx2code-gemm-naive,18.138151503333333,0.9224213063871142 33 | 448,onnx2code-loop-tiling,2.343757016666667,0.44236630097742496 34 | 448,onnx2code-libxsmm,3.98062436,0.4661748076261562 35 | 448,tensorflow,2.5604050199999997,0.3854315060294798 36 | 448,onnxruntime,0.93208034,0.26121660448528994 37 | 480,onnx2code-gemm-naive,22.125118099999998,2.159088927144462 38 | 480,onnx2code-loop-tiling,2.8662290200000005,0.5107667355572988 39 | 480,onnx2code-libxsmm,4.9646453699999995,0.5687715558677898 40 | 480,tensorflow,3.0218440199999996,0.4649805323483191 41 | 480,onnxruntime,1.1394096733333334,0.2579315916047637 42 | 512,onnx2code-gemm-naive,35.79122658,1.4332680004341185 43 | 512,onnx2code-loop-tiling,3.3537350266666666,0.5237410889073206 44 | 512,onnx2code-libxsmm,6.718171383333333,0.6435436307087524 45 | 512,tensorflow,4.010103033333333,0.3728290496767013 46 | 512,onnxruntime,1.3507263299999999,0.29469939469919926 47 | 544,onnx2code-gemm-naive,31.681916779999998,1.4659728759093686 48 | 544,onnx2code-loop-tiling,4.23727131,0.6526400838989082 49 | 544,onnx2code-libxsmm,6.9747556433333315,0.6103129442386007 50 | 544,tensorflow,4.051602676666667,0.4291065866271986 51 | 544,onnxruntime,1.6376596700000001,0.35580045226994084 52 | 576,onnx2code-gemm-naive,37.99052364,1.0874292602662623 53 | 576,onnx2code-loop-tiling,5.001330343333333,0.7604748176479122 54 | 576,onnx2code-libxsmm,8.230971016666667,0.7198893260045716 55 | 576,tensorflow,4.502336343333333,0.586047683102526 56 | 576,onnxruntime,1.94110367,0.3945149318786313 57 | 608,onnx2code-gemm-naive,44.638641256666666,1.215741187119763 58 | 608,onnx2code-loop-tiling,5.787086216666666,0.8303883343886781 59 | 608,onnx2code-libxsmm,9.801832463333334,0.7914461282502653 60 | 608,tensorflow,5.3438368899999995,0.6125494605928336 61 | 608,onnxruntime,2.1587026366666664,0.3883709388781272 62 | 640,onnx2code-gemm-naive,58.05911371,2.694715017241604 63 | 640,onnx2code-loop-tiling,6.634438333333333,0.8345216391226107 64 | 640,onnx2code-libxsmm,12.054989756666666,1.5028348949102772 65 | 640,tensorflow,5.99051355,0.7330592824266312 66 | 640,onnxruntime,2.5221542833333332,0.4573304520249769 67 | 672,onnx2code-gemm-naive,60.40745225999999,1.4315425880311647 68 | 672,onnx2code-loop-tiling,7.762591996666667,0.9044419656465619 69 | 
672,onnx2code-libxsmm,13.663411669999999,0.9982044028997674 70 | 672,tensorflow,6.9099103333333325,0.8983987900295478 71 | 672,onnxruntime,2.9145486666666667,0.3943143716607629 72 | 704,onnx2code-gemm-naive,71.40485638000001,2.114370430434871 73 | 704,onnx2code-loop-tiling,8.846936676666667,1.0275345510924836 74 | 704,onnx2code-libxsmm,15.719211346666665,1.1871685584464966 75 | 704,tensorflow,7.95713067,0.7963579281085575 76 | 704,onnxruntime,3.3527723366666664,0.4527388447792796 77 | 736,onnx2code-gemm-naive,81.41985087,2.243184882505461 78 | 736,onnx2code-loop-tiling,10.761460323333335,1.799618396172303 79 | 736,onnx2code-libxsmm,18.336470986666665,1.1059623489511625 80 | 736,tensorflow,8.837726993333332,1.3609673276859735 81 | 736,onnxruntime,3.77466933,0.4777523148543477 82 | 768,onnx2code-gemm-naive,116.2303691,3.6316318818898234 83 | 768,onnx2code-loop-tiling,11.265402053333334,1.2191776120246236 84 | 768,onnx2code-libxsmm,22.767142543333332,2.118667472431719 85 | 768,tensorflow,10.311439233333333,1.0654921718879553 86 | 768,onnxruntime,4.192945856666667,0.4530428803391309 87 | 800,onnx2code-gemm-naive,108.29446495,2.314271972973089 88 | 800,onnx2code-loop-tiling,13.067479950000001,1.2708659041292807 89 | 800,onnx2code-libxsmm,23.835922146666665,1.3408881566847868 90 | 800,tensorflow,11.30956228,1.0025326876216665 91 | 800,onnxruntime,4.919521963333334,1.3622212610655369 92 | 832,onnx2code-gemm-naive,124.03364023333333,3.910260580697564 93 | 832,onnx2code-loop-tiling,14.569716833333334,1.2364054854243702 94 | 832,onnx2code-libxsmm,26.099881983333333,1.4702241860694976 95 | 832,tensorflow,12.40567943,1.0726871141114627 96 | 832,onnxruntime,5.281970426666667,0.5238231226909117 97 | 864,onnx2code-gemm-naive,138.47043131333334,4.4882056117959355 98 | 864,onnx2code-loop-tiling,16.571924443333334,1.3590672205781729 99 | 864,onnx2code-libxsmm,31.187495433333332,2.3485493769686947 100 | 864,tensorflow,13.782888929999999,1.1931003490375507 101 | 864,onnxruntime,5.9023876333333325,0.5151141892303416 102 | 896,onnx2code-gemm-naive,177.52883427333333,3.7420385927551454 103 | 896,onnx2code-loop-tiling,18.290402886666666,1.4539367671765442 104 | 896,onnx2code-libxsmm,36.61725613,2.8889872890932375 105 | 896,tensorflow,15.695179583333335,1.7116501377966362 106 | 896,onnxruntime,6.5521933,0.6331403943611638 107 | 928,onnx2code-gemm-naive,176.81010817,4.868150399428317 108 | 928,onnx2code-loop-tiling,20.290687633333334,1.536577118268534 109 | 928,onnx2code-libxsmm,39.80128498,2.0321174807328157 110 | 928,tensorflow,16.660500926666664,1.2334493721870932 111 | 928,onnxruntime,7.147731570000001,0.5679753831886013 112 | 960,onnx2code-gemm-naive,206.60193386333333,6.203880944471459 113 | 960,onnx2code-loop-tiling,22.565377373333334,2.748720403237248 114 | 960,onnx2code-libxsmm,44.72559841333334,2.207069747547238 115 | 960,tensorflow,18.77272903,1.831075804896297 116 | 960,onnxruntime,7.780628350000001,0.6423223296151973 117 | 992,onnx2code-gemm-naive,224.6577771866667,5.341322166873297 118 | 992,onnx2code-loop-tiling,24.939347386666668,1.644355930024445 119 | 992,onnx2code-libxsmm,47.02947108666667,2.8481977420198152 120 | 992,tensorflow,20.788774706666665,1.7194106633212887 121 | 992,onnxruntime,8.619736683333334,0.7481155531326805 122 | 1024,onnx2code-gemm-naive,333.87419574,5.810992600601528 123 | 1024,onnx2code-loop-tiling,27.680156300000004,1.742366038172522 124 | 1024,onnx2code-libxsmm,64.00978282,4.674831253523885 125 | 1024,tensorflow,22.40708031,3.0494422123010727 126 | 
1024,onnxruntime,10.346731420000001,1.7225040124799624 127 | 1056,onnx2code-gemm-naive,301.18202313666666,12.973498944847293 128 | 1056,onnx2code-loop-tiling,31.540917860000004,2.5664617374331913 129 | 1056,onnx2code-libxsmm,69.36479905,6.49757857067899 130 | 1056,tensorflow,23.45093885333333,2.0352275444335666 131 | 1056,onnxruntime,10.495233546666666,2.052902058341192 132 | 1088,onnx2code-gemm-naive,330.22071242000004,12.747447046267114 133 | 1088,onnx2code-loop-tiling,33.63197182333333,3.225165893332042 134 | 1088,onnx2code-libxsmm,76.57936520666668,6.61680822684353 135 | 1088,tensorflow,26.416019476666666,2.408813440294934 136 | 1088,onnxruntime,11.387930626666666,1.1355637328804493 137 | 1120,onnx2code-gemm-naive,371.04348910333334,14.914355425716268 138 | 1120,onnx2code-loop-tiling,36.63864172666666,3.522334801326663 139 | 1120,onnx2code-libxsmm,85.83883283666667,4.976313906974456 140 | 1120,tensorflow,28.497101723333333,2.975764558625859 141 | 1120,onnxruntime,12.064457016666667,0.9149140815299451 142 | 1152,onnx2code-gemm-naive,428.82500173333335,11.09360301945586 143 | 1152,onnx2code-loop-tiling,38.82765699666666,1.980450791346449 144 | 1152,onnx2code-libxsmm,106.76464112,4.759159336346207 145 | 1152,tensorflow,30.691326269999998,3.6969038110381796 146 | 1152,onnxruntime,13.085166249999999,0.9673633238490873 147 | 1184,onnx2code-gemm-naive,453.3688591733334,12.117019376194405 148 | 1184,onnx2code-loop-tiling,50.478685796666674,10.577480851929527 149 | 1184,onnx2code-libxsmm,133.68964269333335,29.239221574794968 150 | 1184,tensorflow,34.335070269999996,2.725595283464426 151 | 1184,onnxruntime,14.219474640000001,1.209578083563947 152 | 1216,onnx2code-gemm-naive,564.1206916233333,61.78638406885801 153 | 1216,onnx2code-loop-tiling,47.49215579333333,4.236138557224631 154 | 1216,onnx2code-libxsmm,157.28695468,39.71233912090336 155 | 1216,tensorflow,40.785947220000004,6.183618098802194 156 | 1216,onnxruntime,18.38131386,4.049771021552129 157 | 1248,onnx2code-gemm-naive,696.7071773133334,114.50236946627746 158 | 1248,onnx2code-loop-tiling,50.30315509666666,4.852534986694046 159 | 1248,onnx2code-libxsmm,152.39833104666667,25.41443230058589 160 | 1248,tensorflow,39.194375599999994,2.5711971444149073 161 | 1248,onnxruntime,16.34784598,1.2673714435523076 162 | 1280,onnx2code-gemm-naive,709.38608777,94.8956988090892 163 | 1280,onnx2code-loop-tiling,54.26350280999999,4.968193434869949 164 | 1280,onnx2code-libxsmm,190.09006949333335,32.978411602973054 165 | 1280,tensorflow,45.37788102,6.081976934479801 166 | 1280,onnxruntime,18.46193592666667,3.544112383502158 167 | 1312,onnx2code-gemm-naive,866.76174079,102.58889044736809 168 | 1312,onnx2code-loop-tiling,61.21315318333334,5.923573162301649 169 | 1312,onnx2code-libxsmm,213.33526460000002,26.399091248320676 170 | 1312,tensorflow,47.08061651,5.576627014049706 171 | 1312,onnxruntime,21.336600666666666,4.77412035508444 172 | -------------------------------------------------------------------------------- /preliminar/build_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "82cc7d46", 6 | "metadata": {}, 7 | "source": [ 8 | "Clone `onnx/models` and then pull all ONNX models to some folder (~50GB at the time of writing)\n", 9 | "\n", 10 | "`git clone https://github.com/onnx/models.git`\n", 11 | "\n", 12 | "`git lfs pull --include=\"*.onnx\" --exclude=\"\"`\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | 
"id": "7b7587b1", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "from tqdm import tqdm\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from pathlib import Path\n", 27 | "import matplotlib\n", 28 | "import onnx\n", 29 | "import matplotlib.pyplot as plt" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "18e30425", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "PATH = Path(\"C:/ONNX/models\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "id": "8c2f56a8", 46 | "metadata": { 47 | "scrolled": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "models = []\n", 52 | "\n", 53 | "for area in [\"text\", \"vision\"]:\n", 54 | " for problem in os.scandir(PATH / area):\n", 55 | " for model in os.scandir(PATH / area / problem):\n", 56 | " for root, _, files in os.walk(PATH / area / problem / model):\n", 57 | " for file in files:\n", 58 | " if file.endswith('.onnx'):\n", 59 | " file_path = os.path.join(root, file)\n", 60 | " models.append({\n", 61 | " \"area\": area,\n", 62 | " \"problem\": problem.name,\n", 63 | " \"model\": model.name,\n", 64 | " \"version\": file,\n", 65 | " \"size\": os.path.getsize(file_path),\n", 66 | " \"path\": file_path\n", 67 | " })\n", 68 | " \n", 69 | "models = pd.DataFrame(models)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "id": "b3ccff1c", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "168it [00:37, 4.44it/s]\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "model_ops = []\n", 88 | "\n", 89 | "for index, row in tqdm(models.iterrows()):\n", 90 | " model = onnx.load(row[\"path\"])\n", 91 | " ops_present = set()\n", 92 | "\n", 93 | " for node in model.graph.node:\n", 94 | " ops_present.add(node.op_type)\n", 95 | " \n", 96 | " model_ops.append(ops_present)\n", 97 | "\n", 98 | "models[\"ops\"] = model_ops" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "id": "fc97c123", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/html": [ 110 | "
\n", 111 | "\n", 124 | "\n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | "
areaproblemmodelversionsizepathops
0textmachine_comprehensionbert-squadbertsquad-10.onnx435852734C:\\ONNX\\models\\text\\machine_comprehension\\bert...{ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas...
1textmachine_comprehensionbert-squadbertsquad-12-int8.onnx124565601C:\\ONNX\\models\\text\\machine_comprehension\\bert...{ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas...
2textmachine_comprehensionbert-squadbertsquad-12.onnx435852736C:\\ONNX\\models\\text\\machine_comprehension\\bert...{ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas...
3textmachine_comprehensionbert-squadbertsquad-8.onnx435882893C:\\ONNX\\models\\text\\machine_comprehension\\bert...{Tanh, Sub, Softmax, Tile, Mul, Cast, Gather, ...
4textmachine_comprehensionbidirectional_attention_flowbidaf-9.onnx43522228C:\\ONNX\\models\\text\\machine_comprehension\\bidi...{ConstantOfShape, Relu, CategoryMapper, Sub, S...
........................
163visionstyle_transferfast_neural_stylerain-princess-8.onnx6726529C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Add, Relu, Upsample, Conv, Pad, InstanceNorma...
164visionstyle_transferfast_neural_stylerain-princess-9.onnx6728029C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Shape, Add, Cast, Div, Relu, Constant, Gather...
165visionstyle_transferfast_neural_styleudnie-8.onnx6726529C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Add, Relu, Upsample, Conv, Pad, InstanceNorma...
166visionstyle_transferfast_neural_styleudnie-9.onnx6728029C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Shape, Add, Cast, Div, Relu, Constant, Gather...
167visionsuper_resolutionsub_pixel_cnn_2016super-resolution-10.onnx240078C:\\ONNX\\models\\vision\\super_resolution\\sub_pix...{Relu, Constant, Conv, Reshape, Transpose}
\n", 250 | "

168 rows × 7 columns

\n", 251 | "
" 252 | ], 253 | "text/plain": [ 254 | " area problem model \\\n", 255 | "0 text machine_comprehension bert-squad \n", 256 | "1 text machine_comprehension bert-squad \n", 257 | "2 text machine_comprehension bert-squad \n", 258 | "3 text machine_comprehension bert-squad \n", 259 | "4 text machine_comprehension bidirectional_attention_flow \n", 260 | ".. ... ... ... \n", 261 | "163 vision style_transfer fast_neural_style \n", 262 | "164 vision style_transfer fast_neural_style \n", 263 | "165 vision style_transfer fast_neural_style \n", 264 | "166 vision style_transfer fast_neural_style \n", 265 | "167 vision super_resolution sub_pixel_cnn_2016 \n", 266 | "\n", 267 | " version size \\\n", 268 | "0 bertsquad-10.onnx 435852734 \n", 269 | "1 bertsquad-12-int8.onnx 124565601 \n", 270 | "2 bertsquad-12.onnx 435852736 \n", 271 | "3 bertsquad-8.onnx 435882893 \n", 272 | "4 bidaf-9.onnx 43522228 \n", 273 | ".. ... ... \n", 274 | "163 rain-princess-8.onnx 6726529 \n", 275 | "164 rain-princess-9.onnx 6728029 \n", 276 | "165 udnie-8.onnx 6726529 \n", 277 | "166 udnie-9.onnx 6728029 \n", 278 | "167 super-resolution-10.onnx 240078 \n", 279 | "\n", 280 | " path \\\n", 281 | "0 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 282 | "1 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 283 | "2 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 284 | "3 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 285 | "4 C:\\ONNX\\models\\text\\machine_comprehension\\bidi... \n", 286 | ".. ... \n", 287 | "163 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 288 | "164 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 289 | "165 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 290 | "166 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 291 | "167 C:\\ONNX\\models\\vision\\super_resolution\\sub_pix... \n", 292 | "\n", 293 | " ops \n", 294 | "0 {ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas... \n", 295 | "1 {ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas... \n", 296 | "2 {ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas... \n", 297 | "3 {Tanh, Sub, Softmax, Tile, Mul, Cast, Gather, ... \n", 298 | "4 {ConstantOfShape, Relu, CategoryMapper, Sub, S... \n", 299 | ".. ... \n", 300 | "163 {Add, Relu, Upsample, Conv, Pad, InstanceNorma... \n", 301 | "164 {Shape, Add, Cast, Div, Relu, Constant, Gather... \n", 302 | "165 {Add, Relu, Upsample, Conv, Pad, InstanceNorma... \n", 303 | "166 {Shape, Add, Cast, Div, Relu, Constant, Gather... 
\n", 304 | "167 {Relu, Constant, Conv, Reshape, Transpose} \n", 305 | "\n", 306 | "[168 rows x 7 columns]" 307 | ] 308 | }, 309 | "execution_count": 5, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "models.sort_values(\"size\", ascending=False)\n", 316 | "models" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 6, 322 | "id": "a3a6c4ef", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "models.to_pickle(\"models-df.pkl\")" 327 | ] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "onnx2code-ufxMYK0j", 333 | "language": "python", 334 | "name": "python3" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 3 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | "name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython3", 346 | "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]" 347 | }, 348 | "vscode": { 349 | "interpreter": { 350 | "hash": "c1aa1ee17a9068633b0ad7418d6283b8ec82042b46ede1c27bec8ef59eb01779" 351 | } 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 5 356 | } 357 | -------------------------------------------------------------------------------- /onnx2code/generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import warnings 4 | from collections import defaultdict 5 | from itertools import chain 6 | from pathlib import Path 7 | from textwrap import dedent, indent 8 | 9 | import numpy as np 10 | import onnx 11 | import onnxsim.onnx_simplifier as onnx_simplifier 12 | 13 | from .memory import TensorUsageRecord, find_best_layout 14 | from .ops.operation import OpCall, Operation, OpImpl 15 | from .result import ModelResult 16 | from .tensor import TensorData, parse_tensors 17 | from .util import get_fixed_input_shapes 18 | 19 | REGISTER_ORDER = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"] 20 | INFERENCE_SIGNATURE = "void __attribute__ ((noinline)) inference(const float* weights, const float* inputs, float* outputs)" 21 | 22 | 23 | class Generator: 24 | """ 25 | Code generator 26 | 27 | Proto ref: https://github.com/onnx/onnx/blob/main/docs/IR.md 28 | """ 29 | 30 | def __init__(self, _model_proto: onnx.ModelProto, variations: list[str] = []): 31 | try: 32 | model_proto, check = onnx_simplifier.simplify( 33 | model=_model_proto, 34 | overwrite_input_shapes=get_fixed_input_shapes(_model_proto), 35 | ) 36 | assert check, "ONNX model could not be simplified" 37 | except Exception as e: 38 | model_proto = _model_proto 39 | warnings.warn("Model could not be simplified, using as is (" + str(e) + ")") 40 | 41 | # save model for later inspection 42 | if os.getenv("ONNX2CODE_DEBUG", "0") == "1": 43 | tmp = Path(__file__).parent.parent / "tmp" 44 | tmp.mkdir(exist_ok=True) 45 | onnx.save_model(model_proto, (tmp / "model.onnx").__str__()) 46 | 47 | self.model_proto = model_proto 48 | self.tensors = {tensor.name: tensor for tensor in parse_tensors(model_proto)} 49 | self.variations = variations + ["c", "asm"] 50 | 51 | self.impls: dict[OpImpl, OpCall] = {} 52 | self.calls: list[OpCall] = [] 53 | 54 | def weld_tensors(self, name_from: str, name_to: str) -> None: 55 | """ 56 | Weld tensors together 57 | This means they should point to the same variable in runtime 58 | 59 | :param name_from: Name of the origin tensor 60 | :param name_to: Name of the destination tensor 61 | 
:raises KeyError: If the tensor names are not found 62 | """ 63 | self.tensors[name_to].variable = self.tensors[name_from].variable 64 | 65 | if self.tensors[name_to].tag != "output": 66 | self.tensors[name_to].tag = "welded" 67 | 68 | def generate(self) -> ModelResult: 69 | """ 70 | Generate C and ASM code to run the model 71 | """ 72 | for node in self.model_proto.graph.node: 73 | if node.op_type in [ 74 | # Reshape/Squeeze/Unsqueeze operator ⚠️ SPECIAL CASE ⚠️ 75 | # 76 | # https://github.com/onnx/onnx/blob/main/docs/Operators.md#reshape 77 | # https://github.com/onnx/onnx/blob/main/docs/Operators.md#squeeze 78 | # https://github.com/onnx/onnx/blob/main/docs/Operators.md#unsqueeze 79 | "Reshape", 80 | "Squeeze", 81 | "Unsqueeze", 82 | # have no effect during inference 83 | "Dropout", 84 | "BatchNormalization", # are we sure about this one? 85 | # other kind of reshape 86 | "Flatten", 87 | ]: 88 | # Since it just reshapes the tensor, we don't need to do anything in runtime 89 | # But we must must be weld the input and output tensors (variables/data) 90 | self.weld_tensors(node.input[0], node.output[0]) 91 | 92 | continue 93 | 94 | variants = Operation.get(node.op_type, self.variations) 95 | 96 | impl: (OpImpl | None) = None 97 | call: (OpCall | None) = None 98 | ex: (Exception | None) = None 99 | 100 | # we try all the variants we have available, in the order specified 101 | # if one throws NotImplemented, we try the next one 102 | for var in variants: 103 | try: 104 | op = var( 105 | node, 106 | [self.tensors[name] for name in node.input], 107 | [self.tensors[name] for name in node.output], 108 | ) 109 | impl = op.impl() 110 | call = op.call() 111 | break 112 | except NotImplementedError as _ex: 113 | # keep first 114 | if ex is None: 115 | ex = _ex 116 | 117 | if impl is None or call is None: 118 | assert ex is not None 119 | raise ex 120 | 121 | if call is not None and impl is not None: 122 | if impl in self.impls: 123 | new_name = call.fn_name() 124 | prev_name = self.impls[impl].fn_name() 125 | assert ( 126 | new_name == prev_name 127 | ), "function name should coincide if the implementation is the same" 128 | 129 | self.impls[impl] = call 130 | self.calls.append(call) 131 | 132 | self._compute_memory_layout() 133 | 134 | inputs = [tensor for tensor in self.tensors.values() if tensor.tag == "input"] 135 | outputs = [tensor for tensor in self.tensors.values() if tensor.tag == "output"] 136 | 137 | return ModelResult( 138 | input_shapes={tensor.name: tensor.shape for tensor in inputs}, 139 | output_shapes={tensor.name: tensor.shape for tensor in outputs}, 140 | source_c=self._gen_c_source(), 141 | source_h=f"extern {INFERENCE_SIGNATURE};", 142 | source_asm=self._gen_asm_source(), 143 | weights=self._gen_weights(), 144 | ) 145 | 146 | def _compute_memory_layout(self) -> None: 147 | """ 148 | Finds a good memory layout for intermediate tensors 149 | """ 150 | MAX = 999999999 151 | MIN = -1 152 | 153 | inter_tensors: dict[str, TensorUsageRecord] = {} 154 | 155 | # add all intermediate tensors 156 | for t in self.tensors.values(): 157 | if t.tag == "intermediate": 158 | inter_tensors[t.variable] = TensorUsageRecord(MAX, MIN, t.size) 159 | 160 | # build usage records knowing the order of calls and data dependencies 161 | for index, call in enumerate(self.calls): 162 | # for inputs, make sure we reserve the tensor up to index 163 | for tensor in call.inputs: 164 | if tensor.tag == "intermediate": 165 | rec = inter_tensors[tensor.variable] 166 | rec.last_op = max(rec.last_op, index) 167 | 
168 | # for outputs, make sure we reserve the tensor from at least index 169 | for tensor in call.outputs: 170 | if tensor.tag == "intermediate": 171 | rec = inter_tensors[tensor.variable] 172 | rec.first_op = min(rec.first_op, index) 173 | 174 | # tensors that connect with the output don't have last_op set 175 | # set to first_op + 1 176 | for var, rec in inter_tensors.items(): 177 | assert rec.first_op != MAX, "tensor is never used" 178 | 179 | if rec.last_op == -1: 180 | rec.last_op = rec.first_op + 1 181 | 182 | self.inter_size, offsets = find_best_layout(list(inter_tensors.values())) 183 | self.inter_offsets = {} 184 | 185 | # map tensor names to variables 186 | for var, offset in zip(inter_tensors.keys(), offsets): 187 | self.inter_offsets[var] = offset 188 | 189 | def _gen_weights(self) -> TensorData: 190 | return np.concatenate( 191 | [ 192 | tensor.data.reshape(-1) 193 | for tensor in self.tensors.values() 194 | if tensor.tag == "weight" 195 | and tensor.data is not None 196 | and tensor.data.dtype == np.float32 197 | ] 198 | # concatenate needs at least one array 199 | + [np.array([], dtype=np.float32)], 200 | ) 201 | 202 | def _gen_c_source(self) -> str: 203 | source = "\n".join( 204 | [ 205 | "#include ", 206 | "#include ", 207 | "#include ", 208 | "#include ", 209 | "#define min(a,b) ((a)<(b)?(a):(b))", 210 | "#define max(a,b) ((a)>(b)?(a):(b))", 211 | "float im2col[50000000]; // TODO: do this correctly...", 212 | "", 213 | ] 214 | ) 215 | 216 | # asm auxiliary function declarations 217 | 218 | source += "// Auxiliary functions (ASM):\n\n" 219 | 220 | asm_aux_declarations = [ 221 | f"{asm_aux_function.signature};" 222 | for impl in self.impls.keys() 223 | for asm_aux_function in impl.asm_aux_functions 224 | ] 225 | 226 | source += 'extern "C" {\n' + "\n\n".join(asm_aux_declarations) + "\n}\n\n" 227 | 228 | # loading external files 229 | source += "// External files:\n\n" 230 | 231 | efp = [path for impl in self.impls.keys() for path in impl.external_paths] 232 | external_file_paths = sorted(set(efp), key=efp.index) 233 | 234 | for path in external_file_paths: 235 | source += f"// {path}\n\n" 236 | with open(path, "r") as f: 237 | source += f.read() + "\n" 238 | 239 | source += "\n" * 2 240 | 241 | # c++ auxiliary functions 242 | 243 | source += "// Auxiliary functions (C++):\n\n" 244 | 245 | cpp_aux_functions = list( 246 | dict.fromkeys( 247 | chain.from_iterable( 248 | impl.cpp_aux_functions for impl in self.impls.keys() 249 | ) 250 | ) 251 | ) 252 | 253 | source += "\n".join(cpp_aux_functions) + "\n" * 2 254 | 255 | # define ASM functions in C 256 | 257 | source += "// ASM functions:\n\n" 258 | 259 | for impl, call in self.impls.items(): 260 | if impl.lang == "asm": 261 | source += f"extern {call.signature()};" 262 | 263 | source += "\n" * 2 264 | 265 | # implementations 266 | 267 | source += "// Implementations:\n\n" 268 | 269 | for impl, call in self.impls.items(): 270 | if impl.lang == "c": 271 | source += call.signature() + " {\n" 272 | source += indent(impl.full_source().strip(), prefix=" " * 4) 273 | source += "\n}\n" 274 | 275 | # define intermediate tensor 276 | # it is a shared buffer 277 | source += "\n" * 2 278 | source += f"float intermediates[{self.inter_size}];" 279 | source += "\n" * 2 280 | 281 | inference_source = "" 282 | io_offsets: defaultdict[str, int] = defaultdict(int) 283 | # build tensor variables 284 | for tensor in self.tensors.values(): 285 | if tensor.tag != "welded": 286 | if ( 287 | tensor.tag == "weight" 288 | and tensor.data is not None 289 | 
and tensor.data.dtype != np.float32 290 | ): 291 | # weight with no data or invalid, skip 292 | continue 293 | 294 | if tensor.tag == "intermediate": 295 | # IF an intermediate tensor is welded with the output 296 | # we want to preserve the output tensor instead of the intermediate one 297 | # so we skip the definition of the intermediate in favor of the output 298 | skip = False 299 | for other in self.tensors.values(): 300 | if other.tag == "output" and other.variable == tensor.variable: 301 | # already defined as output 302 | skip = True 303 | break 304 | if skip: 305 | continue 306 | 307 | if tensor.tag == "intermediate": 308 | offset = self.inter_offsets[tensor.variable] 309 | assert offset is not None, "invliad offset" 310 | else: # input, output or weight 311 | offset = io_offsets[tensor.tag] 312 | io_offsets[tensor.tag] += tensor.size 313 | 314 | decl = "const " if tensor.tag in ["input", "weight"] else "" 315 | decl += f"float* {tensor.variable} = " 316 | decl += f"{tensor.tag}s + {offset};" 317 | 318 | else: 319 | # welded 320 | continue 321 | 322 | decl = f"\n{decl : <34} // ({tensor.shape_str()}) {tensor.name}" 323 | inference_source += decl 324 | 325 | # make op calls 326 | inference_source += "\n" 327 | for call in self.calls: 328 | inference_source += f"\n{call.invocation()};" 329 | 330 | source += INFERENCE_SIGNATURE + " {" 331 | source += indent(inference_source, prefix=" " * 4) 332 | source += "\n}" 333 | 334 | return source 335 | 336 | def _gen_asm_source(self) -> str: 337 | source = "" 338 | 339 | # asm auxiliary functions 340 | 341 | for impl in self.impls.keys(): 342 | for asm_aux_function in impl.asm_aux_functions: 343 | # extract name from signature 344 | regex = re.compile(r"(\w+)\s*\(") 345 | match = regex.search(asm_aux_function.signature) 346 | assert match is not None, "invalid signature" 347 | name = match.group(1) 348 | 349 | function_source = indent( 350 | dedent(asm_aux_function.source), prefix=" " * 4 351 | ) 352 | 353 | source += f"global {name}\n{name}:{function_source}\n\n" 354 | 355 | for impl, call in self.impls.items(): 356 | if impl.lang == "asm": 357 | comments = [call.signature()] + [ 358 | f"{p}: {REGISTER_ORDER[i]}" 359 | for i, p in enumerate( 360 | call.input_names[: len(call.inputs)] 361 | + call.output_names[: len(call.outputs)] 362 | ) 363 | ] 364 | source += "\n\n".join( 365 | [ 366 | *[f";; {c}" for c in comments], 367 | f"global {call.fn_name()}", 368 | f"{call.fn_name()}:", 369 | indent(impl.full_source(), prefix=" " * 4), 370 | ] 371 | ) 372 | 373 | return source.strip() + "\n" 374 | --------------------------------------------------------------------------------
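A note on the memory planner in _compute_memory_layout above: every intermediate tensor is reduced to a TensorUsageRecord(first_op, last_op, size) and the list is handed to find_best_layout from onnx2code/memory.py (not reproduced in this dump), which returns the total size of the shared intermediates buffer together with one offset per record. Below is a minimal sketch of that contract with made-up lifetimes and sizes; the actual packing strategy and the size units are defined in memory.py.

    from onnx2code.memory import TensorUsageRecord, find_best_layout

    # Hypothetical usage records, built the way _compute_memory_layout builds them:
    # (first_op, last_op, size) — the index range of the op calls that touch the tensor.
    records = [
        TensorUsageRecord(0, 1, 1024),  # live while calls 0..1 execute
        TensorUsageRecord(1, 2, 2048),  # overlaps the first record in time
        TensorUsageRecord(2, 3, 1024),  # disjoint from the first, so its region may be reused
    ]

    total_size, offsets = find_best_layout(records)
    # total_size dimensions the shared "intermediates" buffer emitted in _gen_c_source;
    # offsets[i] is where the tensor behind records[i] is placed inside that buffer.
    print(total_size, offsets)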
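And a rough end-to-end sketch of driving the Generator class defined in onnx2code/generator.py. This is not a script from the repository: "model.onnx" and the output file names are placeholders, and onnx must be installed.

    import onnx
    from onnx2code.generator import Generator

    model_proto = onnx.load("model.onnx")        # placeholder path to an ONNX model
    gen = Generator(model_proto, variations=[])  # "c" and "asm" are always appended as fallbacks
    result = gen.generate()                      # ModelResult with sources, shapes and weights

    print(result.input_shapes, result.output_shapes)

    with open("model.cpp", "w") as f:            # generated C/C++ source (uses extern "C" blocks)
        f.write(result.source_c)
    with open("model.hpp", "w") as f:            # header declaring the inference() entry point
        f.write(result.source_h)
    with open("model.asm", "w") as f:            # NASM-style assembly for the asm implementations
        f.write(result.source_asm)

    result.weights.tofile("weights.bin")         # flat float32 weight buffer (a NumPy array) fed to inference()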