├── tests ├── __init__.py ├── ops │ ├── __init__.py │ ├── test_identity.py │ ├── test_softmax.py │ ├── test_elementwise.py │ ├── test_concat.py │ ├── test_transpose.py │ ├── test_conv.py │ ├── test_pooling.py │ ├── test_broadcastable.py │ └── test_gemm.py ├── util.py ├── conftest.py ├── zoo.py └── test_zoo.py ├── evaluation ├── __init__.py ├── .gitignore ├── results │ ├── 10th-ice-lake.png │ ├── 6th-skylake.png │ ├── 10th-comet-lake.png │ ├── 11th-tiger-lake.jpg │ └── analysis.Rmd ├── random_sample.py ├── setup.py ├── evaluate.py ├── results_conv │ ├── analysis.Rmd │ ├── 6th.csv │ └── 10th.csv ├── results_gemm │ ├── analysis.Rmd │ ├── 10th.csv │ └── 6th.csv ├── measure_gemm.py ├── measure_conv.py ├── measure_models.py ├── eval_tilings.py ├── find_best_tiling_params.py ├── measure.py └── results_gemm.csv ├── onnx2code ├── __init__.py ├── ops │ ├── __init__.py │ ├── gemm_tiling │ │ ├── microkernel_ref.cpp │ │ ├── GEMM.py │ │ ├── microkernel_test.cpp │ │ ├── gpackB.cpp │ │ ├── gpackA.cpp │ │ └── gemm.cpp │ ├── identity.py │ ├── transpose.py │ ├── concat.py │ ├── softmax.py │ ├── elementwise.py │ ├── broadcastable.py │ ├── pooling.py │ ├── operation.py │ ├── gemm.py │ └── conv.py ├── result.py ├── debugger.c ├── service.c ├── checker.py ├── __main__.py ├── tensor.py ├── memory.py ├── util.py ├── service.py └── generator.py ├── pyproject.toml ├── pytest.ini ├── preliminar ├── models-df.pkl └── build_dataset.ipynb ├── docs └── TP Final onnx2code.pdf ├── .flake8 ├── .gitattributes ├── .gitignore ├── Pipfile ├── Dockerfile ├── Makefile ├── .vscode └── settings.json ├── .github └── workflows │ └── ci.yml └── README.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /onnx2code/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/ops/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.isort] 2 | profile = "black" 3 | 4 | -------------------------------------------------------------------------------- /evaluation/.gitignore: -------------------------------------------------------------------------------- 1 | *.pdf 2 | *.html 3 | results_*.csv 4 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | addopts = --no-header -v -W ignore::DeprecationWarning --ignore=data 3 | -------------------------------------------------------------------------------- /preliminar/models-df.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/preliminar/models-df.pkl -------------------------------------------------------------------------------- /docs/TP Final onnx2code.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/docs/TP Final onnx2code.pdf -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = E203, E266, E402, E501, W503, F841 4 | exclude = 5 | data 6 | -------------------------------------------------------------------------------- /evaluation/results/10th-ice-lake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/10th-ice-lake.png -------------------------------------------------------------------------------- /evaluation/results/6th-skylake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/6th-skylake.png -------------------------------------------------------------------------------- /evaluation/results/10th-comet-lake.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/10th-comet-lake.png -------------------------------------------------------------------------------- /evaluation/results/11th-tiger-lake.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlomb/onnx2code/HEAD/evaluation/results/11th-tiger-lake.jpg -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # ignore notebooks 5 | *.ipynb linguist-vendored 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Cache files 2 | .mypy_cache/ 3 | .ipynb_checkpoints/ 4 | .pytest_cache/ 5 | __pycache__/ 6 | 7 | # Data files 8 | **/tmp 9 | data 10 | output/ 11 | results.csv 12 | -------------------------------------------------------------------------------- /onnx2code/ops/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from . 
import ( 4 | broadcastable, 5 | concat, 6 | conv, 7 | elementwise, 8 | gemm, 9 | identity, 10 | operation, 11 | pooling, 12 | softmax, 13 | transpose, 14 | ) 15 | -------------------------------------------------------------------------------- /tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import tf2onnx 3 | 4 | from onnx2code.checker import check_model 5 | 6 | 7 | def check_keras(model: tf.keras.Model, variations: list[str] = []) -> None: 8 | model_proto, _ = tf2onnx.convert.from_keras(model) 9 | check_model(model_proto, variations) 10 | -------------------------------------------------------------------------------- /onnx2code/result.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | from .tensor import TensorData 4 | from .util import ShapesMap 5 | 6 | 7 | @dataclass 8 | class ModelResult: 9 | input_shapes: ShapesMap 10 | output_shapes: ShapesMap 11 | source_c: str 12 | source_h: str 13 | source_asm: str 14 | weights: TensorData 15 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | 4 | def shapes_id(shape: list[int]) -> str: 5 | return f"""({",".join(map(str, shape))})""" 6 | 7 | 8 | def pytest_make_parametrize_id(config: Any, val: Any, argname: str) -> str | None: 9 | if argname.startswith("shape"): 10 | return shapes_id(val) 11 | return None 12 | -------------------------------------------------------------------------------- /evaluation/random_sample.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import numpy as np 4 | 5 | # Read a single int from args 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument("N", type=int) 8 | 9 | args = parser.parse_args() 10 | 11 | # Generate a random float array of size N 12 | arr = np.random.uniform(size=args.N).astype(np.float32) 13 | 14 | # Write the array to a file 15 | with open("random.bin", "wb") as f: 16 | f.write(arr.tobytes()) 17 | -------------------------------------------------------------------------------- /tests/ops/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [[1], [2, 3], [4, 5, 6]]) 8 | @pytest.mark.parametrize("variation", ["c"]) # , "asm" 9 | def test_identity(variation: str, shape: list[int]) -> None: 10 | input = tf.keras.Input(shape) 11 | out = tf.keras.layers.Lambda(lambda x: x)(input) 12 | model = tf.keras.Model(inputs=[input], outputs=[out]) 13 | check_keras(model, [variation]) 14 | -------------------------------------------------------------------------------- /Pipfile: -------------------------------------------------------------------------------- 1 | [[source]] 2 | url = "https://pypi.org/simple" 3 | verify_ssl = true 4 | name = "pypi" 5 | 6 | [packages] 7 | numpy = "*" 8 | onnx = "*" 9 | onnx-simplifier = "*" 10 | onnxruntime = "*" 11 | pytest = "*" 12 | tensorflow = "*" 13 | tf2onnx = ">=1.12.1" 14 | matplotlib = "*" 15 | pandas = "*" 16 | black = "==22.10.0" 17 | tqdm = "*" 18 | 19 | [dev-packages] 20 | mypy = "*" 21 | flake8 = "*" 22 | black = "*" 23 | ipykernel = "*" 24 | isort = "*" 25 | 26 | [requires] 27 | python_version = "3.10" 28 
| 29 | [pipenv] 30 | allow_prereleases = true 31 | -------------------------------------------------------------------------------- /tests/ops/test_softmax.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "shape", 9 | [[1], [2, 3], [4, 5, 6]], 10 | ) 11 | @pytest.mark.parametrize("axis", [-1, 1, 2]) 12 | def test_softmax(shape: list[int], axis: int) -> None: 13 | input = tf.keras.Input(shape) 14 | try: 15 | output = tf.keras.layers.Softmax(axis=axis)(input) 16 | model = tf.keras.Model(inputs=[input], outputs=[output]) 17 | except Exception: 18 | pytest.skip("incompatible configuration") 19 | 20 | check_keras(model) 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.10 2 | 3 | # install deps 4 | RUN apt-get update && apt-get install -y --no-install-recommends gcc nasm 5 | RUN pip install pipenv 6 | 7 | # install libxsmm @ 4e1aa533 8 | RUN git clone https://github.com/libxsmm/libxsmm 9 | WORKDIR /libxsmm 10 | RUN git checkout 4e1aa5332123088916989651ae9b187ecba377dc 11 | RUN make generator 12 | ENV PATH="/libxsmm/bin:${PATH}" 13 | 14 | # install onnx2code 15 | WORKDIR /app 16 | COPY Pipfile . 17 | COPY Pipfile.lock . 18 | RUN pipenv install --deploy 19 | 20 | COPY onnx2code onnx2code 21 | 22 | ENTRYPOINT ["pipenv", "run", "python", "-m", "onnx2code", "input.onnx", "output"] 23 | -------------------------------------------------------------------------------- /tests/ops/test_elementwise.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "activation", 9 | [ 10 | tf.keras.layers.Activation("relu"), 11 | tf.keras.layers.Activation("tanh"), 12 | tf.keras.layers.Activation("sigmoid"), 13 | ], 14 | ids=lambda x: str(x.activation.__name__), 15 | ) 16 | def test_activations(activation: tf.keras.layers.Activation) -> None: 17 | input = tf.keras.Input(shape=(4, 5, 6)) 18 | output = activation(input) 19 | model = tf.keras.Model(inputs=[input], outputs=[output]) 20 | check_keras(model) 21 | -------------------------------------------------------------------------------- /tests/ops/test_concat.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "shapes", 9 | [ 10 | [[1, 2, 3, 4], [1, 2, 3, 4]], 11 | [[2, 2, 5, 1], [2, 1, 5, 1]], 12 | [[2, 2, 5, 1], [2, 1, 5, 1], [2, 3, 5, 1]], 13 | ], 14 | ) 15 | @pytest.mark.parametrize("axis", [0, 1, 2, 3]) 16 | @pytest.mark.parametrize("variation", ["c"]) 17 | def test_concat(shapes: list[list[int]], axis: int, variation: str) -> None: 18 | inputs = [tf.keras.Input(shape) for shape in shapes] 19 | 20 | try: 21 | out = tf.keras.layers.Concatenate(axis=1 + axis)(inputs) # +1 for batch dim 22 | model = tf.keras.Model(inputs=inputs, outputs=[out]) 23 | except Exception: 24 | pytest.skip("incompatible configuration") 25 | 26 | check_keras(model, [variation]) 27 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/microkernel_ref.cpp: 
-------------------------------------------------------------------------------- 1 | template< 2 | int mr, 3 | int nr, 4 | int kc, 5 | 6 | int CStrideRow 7 | > 8 | inline void ref_microkernel( 9 | const float* __restrict__ A_kernel, // (mr x kc) column major 10 | const float* __restrict__ B_kernel, // (kc x nr) row major 11 | float* __restrict__ C 12 | ) { 13 | float AB[mr * nr]; 14 | memset(AB, 0, mr * nr * sizeof(float)); 15 | 16 | for (int k = 0; k < kc; k++) { 17 | for (int n = 0; n < nr; n++) { 18 | for (int m = 0; m < mr; m++) { 19 | AB[n * mr + m] += 20 | A_kernel[k * mr + m] * 21 | B_kernel[k * nr + n]; 22 | } 23 | } 24 | } 25 | 26 | for (int j = 0; j < nr; j++) { 27 | for (int i = 0; i < mr; i++) { 28 | C[i * CStrideRow + j] += AB[mr * j + i]; 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /evaluation/setup.py: -------------------------------------------------------------------------------- 1 | # Makes sure the libraries are using only 1 CPU thread 2 | # and are optimized for inference. 3 | 4 | import os 5 | import sys 6 | 7 | # Silence TF 8 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 9 | # Do not use GPU 10 | os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 11 | 12 | # Make onnxruntime only use 1 CPU thread 13 | os.environ["OMP_NUM_THREADS"] = "1" 14 | os.environ["MKL_NUM_THREADS"] = "1" 15 | os.environ["OPENBLAS_NUM_THREADS"] = "1" 16 | 17 | import tensorflow as tf 18 | 19 | # Make tensorflow only use 1 CPU thread 20 | tf.config.threading.set_inter_op_parallelism_threads(1) 21 | tf.config.threading.set_intra_op_parallelism_threads(1) 22 | 23 | # We don't need to disable eager execution, because we are using tf.function (I hope) 24 | # tf.compat.v1.disable_v2_behavior() 25 | # tf.compat.v1.disable_eager_execution() 26 | # tf.config.run_functions_eagerly(False) # this line does not work 🤡 27 | 28 | sys.path.append("../") 29 | -------------------------------------------------------------------------------- /tests/zoo.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import urllib.request 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | 8 | def download_from_zoo(path: str, expected_size: int | None = None) -> Path: 9 | """ 10 | Download files from the repo https://github.com/onnx/models 11 | """ 12 | target = Path(os.path.dirname(__file__)) / "../data" / path 13 | 14 | if target.is_file(): 15 | # file already downloaded 16 | 17 | # check if the size is the expected one 18 | if expected_size is None or target.stat().st_size == expected_size: 19 | return target 20 | 21 | target.parent.mkdir(parents=True, exist_ok=True) 22 | 23 | urllib.request.urlretrieve( 24 | f"https://github.com/onnx/models/raw/main/{path}", target 25 | ) 26 | 27 | return target 28 | 29 | 30 | def zoo_manifest() -> Any: 31 | return json.loads(download_from_zoo("ONNX_HUB_MANIFEST.json").read_text()) 32 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | env TF_CPP_MIN_LOG_LEVEL=3 CUDA_VISIBLE_DEVICES=-1 pytest --durations=10 3 | 4 | mnist: 5 | env TF_CPP_MIN_LOG_LEVEL=3 CUDA_VISIBLE_DEVICES=-1 pytest -k mnist 6 | 7 | lint: 8 | flake8 . --count --statistics 9 | 10 | format: 11 | isort . --skip=data && \ 12 | black --verbose . 
--exclude=data 13 | 14 | precommit: lint format test 15 | 16 | debug: 17 | python -m onnx2code model.onnx output --variations=loop-tiling --checks=1 ; \ 18 | nasm -f elf64 output/model.asm -o output/model-asm.o -g && \ 19 | g++ output/model.cpp output/debugger.cpp output/model-asm.o -o output/main -g && \ 20 | gdb output/main output/model-asm.o -ex "b unit_update" -ex "r" 21 | 22 | profile: 23 | python -m onnx2code data/model.onnx output --variations=loop-tiling --checks=1; \ 24 | nasm -f elf64 output/model.asm -o output/model-asm.o -g && \ 25 | g++ -Ioutput/ output/model.cpp onnx2code/debugger.c output/model-asm.o -o output/main \ 26 | -g -O3 -march=native -mtune=native 27 | -------------------------------------------------------------------------------- /tests/ops/test_transpose.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [[2, 3], [3, 4, 5]]) 8 | def test_transpose_default(shape: list[int]) -> None: 9 | input = tf.keras.Input(shape=shape) 10 | out = tf.keras.backend.transpose(input) 11 | model = tf.keras.Model(inputs=[input], outputs=[out]) 12 | check_keras(model) 13 | 14 | 15 | @pytest.mark.parametrize( 16 | "perm", 17 | [ 18 | # ([1,2,3]), # this gets optimized with Identity 19 | ([2, 3, 1]), 20 | ([3, 2, 1]), 21 | ([3, 1, 2]), 22 | ([1, 3, 2]), 23 | ([2, 1, 3]), 24 | ], 25 | ids=lambda x: ",".join(map(str, x)), 26 | ) 27 | def test_transpose_perm(perm: list[int]) -> None: 28 | input = tf.keras.Input(shape=[3, 4, 5]) 29 | out = tf.keras.layers.Permute(perm)(input) 30 | model = tf.keras.Model(inputs=[input], outputs=[out]) 31 | check_keras(model) 32 | -------------------------------------------------------------------------------- /evaluation/evaluate.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import keras 4 | import matplotlib.pyplot as plt 5 | import tensorflow as tf 6 | from keras import layers 7 | from measure import measure_all 8 | 9 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 10 | 11 | # Custom MNIST-like model 12 | input = tf.keras.Input([4096 * 64]) 13 | out = tf.keras.layers.Lambda(lambda x: x)(input) 14 | 15 | input_shape = (512, 512) 16 | 17 | M = 4 18 | K = 16 19 | N = 64009 20 | 21 | 22 | model = keras.Sequential( 23 | [ 24 | keras.Input(shape=(M, K)), 25 | layers.Dense(N, activation="relu"), 26 | ] 27 | ) 28 | 29 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=128, mr=4, nr=8, mv=4, nu=4)) 30 | 31 | # Measure models 32 | data = measure_all(model, variations=["loop-tiling", "gemm-naive"]) 33 | 34 | # Plot results 35 | plt.boxplot(data.values(), labels=data.keys()) 36 | plt.ylabel("Time (ms)") 37 | plt.title("Inference time of Identity model") 38 | 39 | plt.show() 40 | -------------------------------------------------------------------------------- /evaluation/results_conv/analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(ggplot2) 8 | library(extrafont) 9 | 10 | theme_set(theme(text=element_text(family="LM Roman 10"))) 11 | ``` 12 | 13 | ```{r} 14 | datos <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results_conv/6th.csv") 15 | #datos$runtime <- factor(datos$runtime, labels=c("naïve", "libxsmm", "onnx2code", "onnxruntime", 
"tensorflow")) 16 | 17 | datosCON <- datos[datos$runtime == "onnx2code",] 18 | datosSIN <- datos[datos$runtime != "onnx2code",] 19 | ``` 20 | 21 | 22 | ```{r} 23 | ggplot(NULL, aes(x=MNK,y=time_mean, colour=runtime)) + 24 | geom_line(data=datosSIN, size=0.4) + 25 | geom_line(data=datosCON, size=1) + 26 | geom_point(data=datosSIN, size=1) + 27 | geom_point(data=datosCON, size=1.3) + 28 | #geom_errorbar(data=datosSIN, aes(ymin=time_mean-time_std, ymax=time_mean+time_std)) + 29 | xlab("M=K=N") + 30 | ylab("Tiempo (ms, log scale)") + 31 | scale_y_log10() + labs(color='Runtime') + 32 | scale_x_continuous(breaks = pretty(datos$MNK, n = 15)) 33 | ggsave("conv.pdf", width = 8, height = 4, device=cairo_pdf) 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /evaluation/results_gemm/analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | library(ggplot2) 8 | library(extrafont) 9 | 10 | theme_set(theme(text=element_text(family="LM Roman 10"))) 11 | ``` 12 | 13 | ```{r} 14 | datos <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results_gemm/10th.csv") 15 | datos$runtime <- factor(datos$runtime, labels=c("naïve", "libxsmm", "onnx2code", "onnxruntime", "tensorflow")) 16 | 17 | datosCON <- datos[datos$runtime == "onnx2code",] 18 | datosSIN <- datos[datos$runtime != "onnx2code",] 19 | ``` 20 | 21 | 22 | ```{r} 23 | ggplot(NULL, aes(x=MNK,y=time_mean, colour=runtime)) + 24 | geom_line(data=datosSIN, size=0.4) + 25 | geom_line(data=datosCON, size=1) + 26 | geom_point(data=datosSIN, size=1) + 27 | geom_point(data=datosCON, size=1.3) + 28 | #geom_errorbar(data=datosSIN, aes(ymin=time_mean-time_std, ymax=time_mean+time_std)) + 29 | xlab("M=K=N") + 30 | ylab("Tiempo (ms, log scale)") + 31 | scale_y_log10() + labs(color='Runtime') + 32 | scale_x_continuous(breaks = pretty(datos$MNK, n = 15)) 33 | ggsave("gemm.pdf", width = 8, height = 4, device=cairo_pdf) 34 | ``` 35 | 36 | 37 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "files.exclude": { 3 | ".git/": true, 4 | "**/__pycache__/": true, 5 | "**/.mypy_cache/": true, 6 | "**/.pytest_cache/": true, 7 | "**/.ipynb_checkpoints/": true 8 | }, 9 | "editor.tabSize": 4, 10 | "editor.rulers": [{ "column": 88, "color": "#575a57" }], 11 | "python.formatting.provider": "black", 12 | "python.linting.flake8Enabled": true, 13 | "python.linting.mypyEnabled": true, 14 | "python.linting.mypyArgs": [ 15 | "--follow-imports=silent", 16 | "--ignore-missing-imports", 17 | "--show-column-numbers", 18 | "--no-pretty", 19 | "--strict" 20 | ], 21 | "python.linting.enabled": true, 22 | "python.testing.pytestArgs": ["tests"], 23 | "python.testing.unittestEnabled": false, 24 | "python.testing.pytestEnabled": true, 25 | "python.linting.flake8Args": [ 26 | "--config=.flake8" 27 | ], 28 | "files.associations": { 29 | "*.desktop": "ini", 30 | "*.dbus": "ini", 31 | "*.systemd": "ini", 32 | ".env": "properties", 33 | "*.tcc": "c", 34 | "chrono": "cpp", 35 | "random": "cpp", 36 | "limits": "cpp", 37 | "valarray": "cpp", 38 | "algorithm": "cpp" 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /onnx2code/debugger.c: -------------------------------------------------------------------------------- 1 | #include 2 | 
#include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "model.h" 9 | 10 | void* read_file(const char* filename, long* _size) { 11 | FILE* fp = fopen(filename, "rb"); 12 | assert(fp != NULL); 13 | 14 | fseek(fp, 0, SEEK_END); 15 | long size = ftell(fp); 16 | fseek(fp, 0, SEEK_SET); 17 | 18 | void* buffer = malloc(size); 19 | 20 | fread(buffer, sizeof(char), size, fp); 21 | fclose(fp); 22 | 23 | if (_size) 24 | *_size = size; 25 | 26 | return buffer; 27 | } 28 | 29 | int main(int argc, char** argv) { 30 | long outputs_size; 31 | 32 | const float* inputs = (const float*)read_file("./sample_inputs.bin", NULL); 33 | const float* weights = (const float*)read_file("./weights.bin", NULL); 34 | const float* truth_outputs = (const float*)read_file("./sample_outputs.bin", &outputs_size); 35 | float* outputs = (float*)malloc(outputs_size); 36 | 37 | float total = 0; 38 | for (int i = 0; i < 10000; i++) { 39 | inference(weights, inputs, outputs); 40 | 41 | total += outputs[0]; 42 | } 43 | 44 | printf("total: %f\n", total); 45 | 46 | return 0; 47 | } 48 | -------------------------------------------------------------------------------- /evaluation/measure_gemm.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import keras 4 | import numpy as np 5 | import pandas as pd 6 | from keras import layers 7 | from measure import measure_all 8 | from tqdm import tqdm 9 | 10 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 11 | 12 | # should be set to the best 13 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=64, mr=4, nr=32, mv=2, nu=4)) 14 | 15 | SIZES = 2 ** np.arange(8, 10) 16 | VARIATIONS = ["gemm-naive", "loop-tiling", "libxsmm"] 17 | 18 | 19 | results = pd.DataFrame(columns=["MNK", "runtime", "time_mean", "time_std"]) 20 | 21 | for x in tqdm(range(256, 1280, 32)): 22 | model = keras.Sequential( 23 | [ 24 | keras.Input(shape=(x, x)), 25 | layers.Dense(x, activation=None, use_bias=False), 26 | ] 27 | ) 28 | 29 | result = measure_all(model, variations=VARIATIONS, runs=300, tqdm_leave=False) 30 | 31 | for var, times in result.items(): 32 | entry = { 33 | "MNK": x, 34 | "runtime": var, 35 | "time_mean": np.mean(times), 36 | "time_std": np.std(times), 37 | } 38 | results = pd.concat( 39 | [ 40 | results, 41 | pd.DataFrame.from_records([entry]), 42 | ] 43 | ) 44 | results.to_csv("results_gemm.csv", index=False) 45 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/GEMM.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import math 3 | from pathlib import Path 4 | 5 | 6 | @dataclass 7 | class LoopTilingParams: 8 | nc: int # Columnas de panel de B 9 | kc: int # Filas de panel de B 10 | mc: int # Filas de bloque de A 11 | mr: int # Filas de microkernel 12 | nr: int # Columnas de microkernel 13 | mv: int # Filas de unit-update 14 | nu: int # Columnas de unit-update 15 | 16 | 17 | tiling_params = LoopTilingParams( 18 | nc=4096, 19 | kc=256, 20 | mc=256, 21 | mr=4, 22 | nr=8, 23 | mv=4, 24 | nu=4, 25 | ) 26 | 27 | 28 | def set_tiling_params(params: LoopTilingParams) -> None: 29 | global tiling_params 30 | tiling_params = params 31 | 32 | 33 | external_paths_GEMM = ( 34 | Path(__file__).parent / "gpackA.cpp", 35 | Path(__file__).parent / "gpackB.cpp", 36 | Path(__file__).parent / "microkernel_ref.cpp", 37 | Path(__file__).parent / 
"microkernel_test.cpp", 38 | Path(__file__).parent / "gemm.cpp", 39 | ) 40 | 41 | 42 | def call_GEMM(M: int, K: int, N: int, params: str) -> str: 43 | nc = min(2 ** math.ceil(math.log2(N)), tiling_params.nc) 44 | kc = tiling_params.kc 45 | mc = tiling_params.mc 46 | mr = tiling_params.mr 47 | nr = tiling_params.nr 48 | 49 | mv = tiling_params.mv 50 | nu = tiling_params.nu 51 | 52 | return f"gemm<{M},{K},{N},{nc},{kc},{mc},{mr},{nr},{mv},{nu}>({params});" 53 | -------------------------------------------------------------------------------- /tests/ops/test_conv.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [(4, 3, 1), (3, 4, 3), (10, 10, 5), (32, 32, 3)]) 8 | @pytest.mark.parametrize("kernel_size", [1, 2, 3]) 9 | @pytest.mark.parametrize("filters", [1, 2, 3, 10]) 10 | @pytest.mark.parametrize("padding", ["valid", "same"]) 11 | @pytest.mark.parametrize( 12 | "stride_and_dilation", 13 | [ 14 | (1, 1), 15 | (2, 1), 16 | # TODO: dilation? 17 | ], 18 | ids=lambda x: f"s{x[0]}d{x[1]}", 19 | ) 20 | @pytest.mark.parametrize("use_bias", [False, True], ids=["no_bias", "bias"]) 21 | def test_conv( 22 | shape: list[int], 23 | kernel_size: int, 24 | filters: int, 25 | padding: str, 26 | stride_and_dilation: tuple[int, int], 27 | use_bias: bool, 28 | ) -> None: 29 | try: 30 | input = tf.keras.Input(shape=shape) 31 | output = tf.keras.layers.Conv2D( 32 | filters=filters, 33 | padding=padding, 34 | kernel_size=kernel_size, 35 | strides=stride_and_dilation[0], 36 | dilation_rate=stride_and_dilation[1], 37 | use_bias=use_bias, 38 | bias_initializer="random_normal", 39 | )(input) 40 | model = tf.keras.Model(inputs=[input], outputs=[output]) 41 | except Exception: 42 | pytest.skip("incompatible configuration") 43 | 44 | check_keras(model) 45 | -------------------------------------------------------------------------------- /tests/ops/test_pooling.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize( 8 | "shape", 9 | [ 10 | # 1D 11 | *[shape for shape in [[1, 1], [5, 1], [10, 2], [16, 3]]], 12 | # 2D 13 | *[shape for shape in [[1, 1, 1], [5, 5, 1], [10, 8, 3], [16, 8, 8]]], 14 | ], 15 | ) 16 | @pytest.mark.parametrize("pool_size", [1, 2, 3]) 17 | @pytest.mark.parametrize("strides", [1, 2, 3]) 18 | @pytest.mark.parametrize("padding", ["valid", "same"]) 19 | @pytest.mark.parametrize("op", ["max", "average"]) 20 | def test_maxpool( 21 | shape: list[int], 22 | pool_size: int, 23 | strides: int, 24 | padding: str, 25 | op: str, 26 | ) -> None: 27 | impl = { 28 | "max": { 29 | 2: tf.keras.layers.MaxPooling1D, 30 | 3: tf.keras.layers.MaxPooling2D, 31 | }, 32 | "average": { 33 | 2: tf.keras.layers.AveragePooling1D, 34 | 3: tf.keras.layers.AveragePooling2D, 35 | }, 36 | }[op][len(shape)] 37 | input = tf.keras.Input(shape) 38 | try: 39 | pool = impl( 40 | pool_size=pool_size, 41 | strides=strides, 42 | padding=padding, 43 | )(input) 44 | except ValueError as e: 45 | pytest.skip("incompatible configuration: " + str(e)) 46 | 47 | model = tf.keras.Model(inputs=[input], outputs=[pool]) 48 | check_keras(model) 49 | -------------------------------------------------------------------------------- /evaluation/measure_conv.py: -------------------------------------------------------------------------------- 1 | 
import setup # noqa # isort:skip 2 | 3 | import keras 4 | import numpy as np 5 | import pandas as pd 6 | from keras import layers 7 | from measure import measure_all 8 | from tqdm import tqdm 9 | 10 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 11 | 12 | # should be set to the best 13 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=64, mr=4, nr=32, mv=2, nu=4)) 14 | 15 | SIZES = 2 ** np.arange(8, 10) 16 | VARIATIONS = ["conv-naive", "im2col"] 17 | 18 | 19 | results = pd.DataFrame(columns=["MNK", "runtime", "time_mean", "time_std"]) 20 | 21 | for x in tqdm(range(256, 1280, 32)): 22 | model = keras.Sequential( 23 | [ 24 | keras.Input(shape=(x, x, 1)), 25 | layers.Conv2D( 26 | filters=4, 27 | padding="valid", 28 | kernel_size=4, 29 | use_bias=False, 30 | ), 31 | ] 32 | ) 33 | 34 | result = measure_all(model, variations=VARIATIONS, runs=100, tqdm_leave=False) 35 | 36 | for var, times in result.items(): 37 | entry = { 38 | "MNK": x, 39 | "runtime": var, 40 | "time_mean": np.mean(times), 41 | "time_std": np.std(times), 42 | } 43 | results = pd.concat( 44 | [ 45 | results, 46 | pd.DataFrame.from_records([entry]), 47 | ] 48 | ) 49 | results.to_csv("results_conv.csv", index=False) 50 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/microkernel_test.cpp: -------------------------------------------------------------------------------- 1 | template < 2 | int mv, 3 | int nu, 4 | 5 | int CStrideRow, 6 | int CStrideCol> 7 | inline void unit_update( 8 | const float* __restrict__ a, // mv 9 | const float* __restrict__ b, // nu 10 | float* __restrict__ C // mv x nu 11 | ) { 12 | for (int i = 0; i < mv; i++) { 13 | for (int j = 0; j < nu; j++) { 14 | C[i * CStrideRow + j * CStrideCol] += a[i] * b[j]; 15 | } 16 | } 17 | } 18 | 19 | template < 20 | int mr, 21 | int nr, 22 | int mv, 23 | int nu> 24 | inline void test_microkernel( 25 | int kc, 26 | const float* __restrict__ A_kernel, // (mr x kc) column major 27 | const float* __restrict__ B_kernel, // (kc x nr) row major 28 | float* __restrict__ AB // (mr x nr) 29 | ) { 30 | static_assert(mr % mv == 0, "must be conforming"); 31 | static_assert(nr % nu == 0, "must be conforming"); 32 | 33 | for (int k = 0; k < kc; k++) { 34 | // single outer product 35 | // en una columna de A y una fila de B (del zigzag) 36 | 37 | // loop tiling 38 | for (int j = 0; j < nr; j += nu) { 39 | for (int i = 0; i < mr; i += mv) { 40 | // unit update (small outer product) 41 | unit_update( 42 | A_kernel + i, 43 | B_kernel + j, 44 | AB + i * nr + j); 45 | } 46 | } 47 | 48 | // advance one column of A and one row of B 49 | A_kernel += mr; 50 | B_kernel += nr; 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /tests/ops/test_broadcastable.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | shapes = [ 7 | # same shape 8 | *[(s, s) for s in [[1], [2, 3], [4, 5, 6]]], # scalar, 2d, 3d 9 | # broadcasting 10 | # https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md 11 | ([2], [1]), 12 | ([2, 3, 4, 5], [1]), 13 | ([2, 3, 4, 5], [5]), 14 | ([4, 5], [2, 3, 4, 5]), 15 | ([1, 4, 5], [2, 3, 1, 1]), 16 | ([3, 4, 5], [2, 1, 1, 1]), 17 | ([3, 4, 5], [5]), 18 | ([3, 4, 5], [4, 5]), 19 | ([3, 4, 5, 6], [5, 6]), 20 | ([3, 4, 5, 6], [4, 5, 6]), 21 | ([1, 4, 1, 6], [3, 1, 5, 6]), 22 | ([3, 1, 1], [1, 3, 416, 416]), 23 | ] 
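# pairs are ordered (A, B); mixing which operand has fewer dimensions exercises both directions of multidirectional broadcasting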
24 | 25 | 26 | @pytest.mark.parametrize("shapeA,shapeB", shapes) 27 | @pytest.mark.parametrize( 28 | "operation", 29 | [ 30 | tf.keras.layers.Add, 31 | tf.keras.layers.Subtract, 32 | tf.keras.layers.Multiply, 33 | ], 34 | ids=lambda x: str(x.__name__), 35 | ) 36 | def test_basic_ops( 37 | operation: tf.keras.layers.Layer, shapeA: list[int], shapeB: list[int] 38 | ) -> None: 39 | inputA = tf.keras.Input(shapeA) 40 | inputB = tf.keras.Input(shapeB) 41 | result = operation()([inputA, inputB]) 42 | model = tf.keras.Model(inputs=[inputA, inputB], outputs=[result]) 43 | check_keras(model) 44 | 45 | 46 | @pytest.mark.parametrize("shapeA,shapeB", shapes) 47 | def test_div(shapeA: list[int], shapeB: list[int]) -> None: 48 | inputA = tf.keras.Input(shapeA) 49 | inputB = tf.keras.Input(shapeB) 50 | model = tf.keras.Model(inputs=[inputA, inputB], outputs=[inputA / inputB]) 51 | check_keras(model) 52 | -------------------------------------------------------------------------------- /onnx2code/service.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "model.h" 10 | 11 | void* map_shared_memory(const char* name) { 12 | int fd = shm_open(name, O_RDWR | O_CREAT, 0666); 13 | assert(fd != -1); 14 | 15 | // we query the size so we don't have to know it beforehand 16 | struct stat finfo; 17 | fstat(fd, &finfo); 18 | size_t size = finfo.st_size; 19 | assert(size > 0); 20 | 21 | void* shared = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0); 22 | assert(shared != MAP_FAILED); 23 | 24 | return shared; 25 | } 26 | 27 | void* read_file(const char* filename) { 28 | FILE* fp = fopen(filename, "rb"); 29 | assert(fp != NULL); 30 | 31 | fseek(fp, 0, SEEK_END); 32 | long size = ftell(fp); 33 | fseek(fp, 0, SEEK_SET); 34 | 35 | void* buffer = malloc(size); 36 | 37 | fread(buffer, sizeof(char), size, fp); 38 | fclose(fp); 39 | 40 | return buffer; 41 | } 42 | 43 | int main(int argc, char** argv) { 44 | const float* weights = (const float*)read_file(argv[1]); 45 | float* inputs = (float*)map_shared_memory("/o2c-inputs"); 46 | float* outputs = (float*)map_shared_memory("/o2c-outputs"); 47 | 48 | while (1) { 49 | // wait for data 50 | char signal; 51 | read(STDIN_FILENO, &signal, 1); 52 | 53 | // run inference 54 | inference(weights, inputs, outputs); 55 | 56 | // mark as ready 57 | write(STDOUT_FILENO, &signal, 1); 58 | fsync(STDOUT_FILENO); 59 | } 60 | 61 | return 0; 62 | } 63 | -------------------------------------------------------------------------------- /onnx2code/ops/identity.py: -------------------------------------------------------------------------------- 1 | from .operation import OpCall, Operation, OpImpl 2 | 3 | 4 | class Identity(Operation): 5 | """ 6 | Identity operator 7 | 8 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#identity 9 | """ 10 | 11 | node_types = {"Identity"} 12 | 13 | def parse(self) -> None: 14 | assert len(self.inputs) == 1, "expected one input" 15 | assert len(self.outputs) == 1, "expected one output" 16 | assert ( 17 | self.inputs[0].size == self.outputs[0].size 18 | ), "input and output tensors should have the same size" 19 | 20 | self.size = self.inputs[0].size 21 | 22 | def call(self) -> OpCall: 23 | return OpCall( 24 | sig_name="Identity", 25 | sig_params=[self.size], 26 | inputs=self.inputs, 27 | outputs=self.outputs, 28 | ) 29 | 30 | 31 | @Identity.variant("c", priority=1) 32 | class IdentityC(Identity): 33 | def 
impl(self) -> OpImpl: 34 | source = f""" 35 | for (int i = 0; i < {self.size}; i++) {{ 36 | OUT[i] = A[i]; 37 | }} 38 | """ 39 | 40 | return OpImpl(lang="c", source=source) 41 | 42 | 43 | @Identity.variant("asm", priority=0) 44 | class IdentityASM(Identity): 45 | def impl(self) -> OpImpl: 46 | source = ( 47 | f"mov rax, {self.size}", 48 | ".loop:", 49 | "movss xmm0, [rdi]", 50 | "add rdi, 4", 51 | "movss [rsi], xmm0", 52 | "add rsi, 4", 53 | "dec rax", 54 | "jnz .loop", 55 | "ret", 56 | ) 57 | 58 | return OpImpl(lang="asm", source=source) 59 | -------------------------------------------------------------------------------- /evaluation/measure_models.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import onnx 6 | from measure import measure_all 7 | from tqdm import tqdm 8 | 9 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 10 | 11 | # should be set to the best 12 | set_tiling_params(LoopTilingParams(nc=4096, kc=256, mc=64, mr=4, nr=32, mv=2, nu=4)) 13 | 14 | VARIATIONS = ["conv-naive", "im2col"] 15 | 16 | results = pd.DataFrame(columns=["model", "runtime", "time_mean", "time_std"]) 17 | 18 | models = [ 19 | "../data/vision/classification/mnist/model/mnist-12.onnx", 20 | "../data/vision/super_resolution/sub_pixel_cnn_2016/model/super-resolution-10.onnx", 21 | "../data/vision/classification/squeezenet/model/squeezenet1.1-7.onnx", 22 | "../data/vision/body_analysis/emotion_ferplus/model/emotion-ferplus-8.onnx", 23 | "../data/vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx", 24 | "../data/vision/classification/resnet/model/resnet50-caffe2-v1-9.onnx", 25 | ] 26 | 27 | for model in tqdm(models): 28 | result = measure_all( 29 | None, 30 | variations=VARIATIONS, 31 | runs=15, 32 | tqdm_leave=False, 33 | onnx_model=onnx.load(model), 34 | ) 35 | 36 | for var, times in result.items(): 37 | entry = { 38 | "model": model, 39 | "runtime": var, 40 | "time_mean": np.mean(times), 41 | "time_std": np.std(times), 42 | } 43 | results = pd.concat( 44 | [ 45 | results, 46 | pd.DataFrame.from_records([entry]), 47 | ] 48 | ) 49 | results.to_csv("results_models.csv", index=False) 50 | -------------------------------------------------------------------------------- /tests/ops/test_gemm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import tensorflow as tf 3 | 4 | from ..util import check_keras 5 | 6 | 7 | @pytest.mark.parametrize("shape", [[1], [2, 2], [2, 3], [4, 5, 6]]) 8 | @pytest.mark.parametrize("units", [1, 2, 10, 100], ids=lambda x: f"{x}_units") 9 | @pytest.mark.parametrize("use_bias", [False, True], ids=["no_bias", "bias"]) 10 | def test_naive(shape: list[int], units: int, use_bias: bool) -> None: 11 | input = tf.keras.Input(shape) 12 | dense = tf.keras.layers.Dense(units, use_bias=use_bias, bias_initializer="uniform")( 13 | input 14 | ) 15 | model = tf.keras.Model(inputs=[input], outputs=[dense]) 16 | check_keras(model, variations=["gemm-naive"]) 17 | 18 | 19 | # @pytest.mark.parametrize("shape", [[1], [2, 2], [2, 3], [4, 5, 6]]) 20 | # @pytest.mark.parametrize("units", [1, 2, 10, 100], ids=lambda x: f"{x}_units") 21 | # @pytest.mark.parametrize("use_bias", [False, True], ids=["no_bias", "bias"]) 22 | # def test_libxsmm(shape: list[int], units: int, use_bias: bool) -> None: 23 | # input = tf.keras.Input(shape) 24 | # dense = 
tf.keras.layers.Dense(units, use_bias=use_bias, bias_initializer="uniform")( 25 | # input 26 | # ) 27 | # model = tf.keras.Model(inputs=[input], outputs=[dense]) 28 | # check_keras(model, variations=["libxsmm"]) 29 | 30 | 31 | @pytest.mark.parametrize("shape", [[64, 64]]) # , [19, 37] 32 | @pytest.mark.parametrize("units", [64], ids=lambda x: f"{x}_units") 33 | def test_tiling(shape: list[int], units: int) -> None: 34 | input = tf.keras.Input(shape) 35 | dense = tf.keras.layers.Dense(units, use_bias=False, bias_initializer="uniform")( 36 | input 37 | ) 38 | model = tf.keras.Model(inputs=[input], outputs=[dense]) 39 | check_keras(model, variations=["loop-tiling"]) 40 | -------------------------------------------------------------------------------- /onnx2code/ops/transpose.py: -------------------------------------------------------------------------------- 1 | from ..util import compute_strides, get_attribute 2 | from .operation import OpCall, Operation, OpImpl 3 | 4 | 5 | class Transpose(Operation): 6 | """ 7 | Transpose operator 8 | 9 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#transpose 10 | """ 11 | 12 | node_types = {"Transpose"} 13 | 14 | def parse(self) -> None: 15 | assert len(self.inputs) == 1, "expected one input" 16 | assert len(self.outputs) == 1, "expected one output" 17 | assert ( 18 | self.inputs[0].size == self.outputs[0].size 19 | ), "input and output tensors should have the same size" 20 | 21 | self.input_strides = compute_strides(self.inputs[0].shape) 22 | self.output_strides = compute_strides(self.outputs[0].shape) 23 | self.perm = get_attribute(self.node, "perm", []) 24 | 25 | def call(self) -> OpCall: 26 | return OpCall( 27 | sig_name="Transpose", 28 | sig_params=[self.inputs[0].shape, self.outputs[0].shape, self.perm], 29 | inputs=self.inputs, 30 | outputs=self.outputs, 31 | ) 32 | 33 | 34 | @Transpose.variant("c") 35 | class TransposeC(Transpose): 36 | def impl(self) -> OpImpl: 37 | output_shape = self.outputs[0].shape 38 | 39 | for_loops = [] 40 | out_index = [] 41 | in_index = [] 42 | 43 | for i in range(len(output_shape)): 44 | for_loops.append( 45 | f"""for (int d{i} = 0; d{i} < {output_shape[i]}; ++d{i})""" 46 | ) 47 | out_index.append(f"d{i}*{self.output_strides[i]}") 48 | in_index.append(f"d{i}*{self.input_strides[self.perm[i]]}") 49 | 50 | source = "\n".join([loop + "{" for loop in for_loops]) 51 | source += ( 52 | "\n\tOUT[" + "+".join(out_index) + "] = A[" + "+".join(in_index) + "];\n" 53 | ) 54 | source += "}" * len(for_loops) 55 | source += "\n" 56 | 57 | return OpImpl(lang="c", source=source) 58 | -------------------------------------------------------------------------------- /onnx2code/ops/concat.py: -------------------------------------------------------------------------------- 1 | from ..util import compute_strides, get_attribute 2 | from .operation import LETTERS, OpCall, Operation, OpImpl 3 | 4 | 5 | class Concat(Operation): 6 | """ 7 | Concat operator 8 | 9 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#concat 10 | """ 11 | 12 | node_types = {"Concat"} 13 | 14 | def parse(self) -> None: 15 | assert len(self.outputs) == 1, "expected one output" 16 | 17 | self.axis = get_attribute(self.node, "axis", None) 18 | 19 | assert self.axis is not None, "axis is not set" 20 | 21 | def call(self) -> OpCall: 22 | return OpCall( 23 | sig_name="Concat", 24 | sig_params=[inp.shape for inp in self.inputs], 25 | inputs=self.inputs, 26 | outputs=self.outputs, 27 | ) 28 | 29 | 30 | @Concat.variant("c") 31 | class ConcatC(Concat): 32 | def 
impl(self) -> OpImpl: 33 | source = "" 34 | 35 | output_strides = compute_strides(self.outputs[0].shape) 36 | 37 | def output_index(axis_offset: int) -> str: 38 | output_index = "" 39 | for i, stride in enumerate(output_strides): 40 | output_index += "+" 41 | if i == self.axis: 42 | output_index += f"({axis_offset}+d{i})" 43 | else: 44 | output_index += f"d{i}" 45 | output_index += f"*{stride}" 46 | return output_index 47 | 48 | axis_offset = 0 49 | for k, input in enumerate(self.inputs): 50 | index = "" 51 | input_strides = compute_strides(input.shape) 52 | 53 | for i, elems in enumerate(input.shape): 54 | source += f"for (int d{i} = 0; d{i} < {elems}; d{i}++) {{\n" 55 | index += f"+ d{i} * {input_strides[i]}" 56 | 57 | source += f"OUT[{output_index(axis_offset)}] = {LETTERS[k]}[{index}];\n" 58 | source += "}\n" * len(input.shape) 59 | 60 | axis_offset += input.shape[self.axis] 61 | 62 | return OpImpl(lang="c", source=source) 63 | -------------------------------------------------------------------------------- /evaluation/eval_tilings.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | from itertools import product 4 | 5 | import numpy as np 6 | import tensorflow as tf 7 | from measure import measure_all 8 | 9 | from onnx2code.ops.gemm import LoopTilingParams, set_tiling_params 10 | 11 | FLOAT_SIZE = 4 12 | KB = 1024 13 | L1_SIZE = 32 * KB 14 | L2_SIZE = 256 * KB 15 | 16 | # Custom MNIST-like model 17 | input = tf.keras.Input([4096 * 64]) 18 | out = tf.keras.layers.Lambda(lambda x: x)(input) 19 | 20 | input_shape = (512, 512) 21 | 22 | model = tf.keras.Sequential( 23 | [ 24 | tf.keras.Input(shape=input_shape), 25 | tf.keras.layers.Dense(512, activation="relu"), 26 | ] 27 | ) 28 | 29 | # nc, kc, mc, mr, nr 30 | nc_options = [4096] 31 | kc_options = [256] 32 | mc_options = [256] 33 | mr_options = [4] 34 | nr_options = [8] 35 | mv_options = [4] 36 | nu_options = [4] 37 | 38 | for nc, kc, mc, mr, nr, mv, nu in product( 39 | nc_options, kc_options, mc_options, mr_options, nr_options, mv_options, nu_options 40 | ): 41 | set_tiling_params(LoopTilingParams(nc=nc, kc=kc, mc=mc, mr=mr, nr=nr, mv=mv, nu=nu)) 42 | print(f"\n## nc={nc}, kc={kc}, mc={mc}, mr={mr}, nr={nr}\n") 43 | 44 | B_sliver = nr * kc * FLOAT_SIZE 45 | A_sliver = mr * kc * FLOAT_SIZE 46 | AB = mr * nr * FLOAT_SIZE 47 | L1_total = A_sliver + B_sliver + AB 48 | L1_remaining = L1_SIZE - L1_total 49 | print("L1:") 50 | print(f"\t{A_sliver=}") 51 | print(f"\t{B_sliver=}") 52 | print(f"\t{AB=}") 53 | print(f"\t{L1_total=}") 54 | print(f"\t{L1_remaining=}") 55 | 56 | A_panel = mc * kc * FLOAT_SIZE 57 | C_writeback = AB 58 | L2_total = A_panel + B_sliver + C_writeback 59 | L2_remaining = L2_SIZE - L2_total 60 | print("\nL2:") 61 | print(f"\t{A_panel=}") 62 | print(f"\t{B_sliver=}") 63 | print(f"\t{C_writeback=}") 64 | print(f"\t{L2_total=}") 65 | print(f"\t{L2_remaining=}") 66 | 67 | data = measure_all(model, variations=["loop-tiling"], measure_base=False) 68 | 69 | assert len(data) == 1 70 | result = data[next(iter(data.keys()))] 71 | 72 | print(f"result: {np.mean(result):.2f}ms") 73 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/gpackB.cpp: -------------------------------------------------------------------------------- 1 | template 2 | inline void gpackB_panel( 3 | const float* __restrict__ B, 4 | float* __restrict__ B_panel // kc x nr 5 | ) { 6 | for (int r = 0; r < kc; r++) { 7 | // copy row of nr 
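// (B is read with stride StrideCol, so the packed panel is contiguous and the microkernel can stream it linearly)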
8 | for (int c = 0; c < nr; c++) { 9 | B_panel[c] = B[c * StrideCol]; 10 | } 11 | 12 | // advance row 13 | B_panel += nr; 14 | B += StrideRow; 15 | } 16 | } 17 | 18 | template 19 | inline void gpackB( 20 | const float* __restrict__ B, 21 | float* __restrict__ B_panel // kc x nc 22 | ) { 23 | for (int p = 0; p < nc; p += nr) { 24 | gpackB_panel(B, B_panel); 25 | 26 | // advance panel 27 | B_panel += kc * nr; 28 | B += nr * StrideCol; 29 | } 30 | } 31 | 32 | // Edge case 33 | 34 | template 35 | inline void gpackB_panel_edge( 36 | int _kc, 37 | int _nr, 38 | const float* __restrict__ B, 39 | float* __restrict__ B_panel // kc x nr 40 | ) { 41 | for (int r = 0; r < _kc; r++) { 42 | // copy row of _nr 43 | for (int c = 0; c < _nr; c++) { 44 | B_panel[c] = B[c * StrideCol]; 45 | } 46 | 47 | // advance row 48 | B_panel += nr; 49 | B += StrideRow; 50 | } 51 | } 52 | 53 | template 54 | inline void gpackB_edge( 55 | int _kc, 56 | int _nc, 57 | const float* __restrict__ B, 58 | float* __restrict__ B_panel // kc x nc 59 | ) { 60 | const int NP = _nc / nr; 61 | const int NPl = _nc % nr; 62 | 63 | memset(B_panel, 0, kc * nc * sizeof(float)); 64 | 65 | for (int p = 0; p < NP; p++) { 66 | gpackB_panel_edge(_kc, nr, B, B_panel); 67 | 68 | // advance panel 69 | B_panel += kc * nr; 70 | B += nr * StrideCol; 71 | } 72 | 73 | if (NPl != 0) { 74 | gpackB_panel_edge(_kc, NPl, B, B_panel); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /onnx2code/checker.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | import onnx 7 | import onnxruntime 8 | 9 | from .generator import Generator 10 | from .result import ModelResult 11 | from .service import ModelService 12 | 13 | 14 | def check_model_result( 15 | model_proto: onnx.ModelProto, result: ModelResult, n_inputs: int = 1 16 | ) -> None: 17 | """ 18 | Checks if the generated output matches the reference runtime (ONNX Runtime) 19 | 20 | :param n_inputs: random inputs will be generated 21 | """ 22 | ort_sess = onnxruntime.InferenceSession(model_proto.SerializeToString()) 23 | 24 | with ModelService(result) as service: 25 | for _ in range(n_inputs): 26 | 27 | inputs = { 28 | name: np.random.uniform(-1.0, 1.0, shape).astype(np.float32) 29 | for name, shape in result.input_shapes.items() 30 | } 31 | 32 | out1 = service.inference(inputs) 33 | out2 = ort_sess.run(None, inputs) 34 | 35 | assert len(out1) == len(out2) 36 | 37 | output_matches = True 38 | 39 | for o1, o2 in zip(out1, out2): 40 | output_matches = output_matches and np.allclose(o1, o2, atol=1e-5) 41 | 42 | if not output_matches and os.getenv("ONNX2CODE_DEBUG", "0") == "1": 43 | temp_dir = Path(__file__).parent.parent / "tmp/" 44 | inputs_np = np.concatenate([inp.reshape(-1) for inp in inputs.values()]) 45 | outputs_np = np.concatenate([o.reshape(-1) for o in out2]) 46 | inputs_np.tofile(temp_dir / "sample_inputs.bin") 47 | outputs_np.tofile(temp_dir / "sample_outputs.bin") 48 | shutil.copyfile( 49 | Path(__file__).parent / "debugger.c", 50 | temp_dir / "debugger.c", 51 | ) 52 | 53 | if not output_matches: 54 | raise RuntimeError("output mismatch") 55 | 56 | 57 | def check_model( 58 | model_proto: onnx.ModelProto, variations: list[str] = [], n_inputs: int = 1 59 | ) -> None: 60 | """ 61 | Generates code for the given model and checks if the generated output matches the reference runtime (ONNX Runtime) 62 | 63 | :param n_inputs: random inputs 
will be generated 64 | """ 65 | result = Generator(model_proto, variations).generate() 66 | 67 | check_model_result(model_proto, result, n_inputs) 68 | -------------------------------------------------------------------------------- /onnx2code/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | from pathlib import Path 4 | 5 | import onnx 6 | from rich import print 7 | 8 | from .checker import check_model_result 9 | from .generator import Generator 10 | 11 | 12 | def main() -> None: 13 | parser = argparse.ArgumentParser(prog="onnx2code") 14 | parser.add_argument("input_model", help="input .onnx file") 15 | parser.add_argument("output_folder", help="output folder to write files") 16 | parser.add_argument( 17 | "--variations", 18 | "--vars", 19 | type=str, 20 | help="variation priority", 21 | default="asm, c", 22 | action="store", 23 | ) 24 | parser.add_argument( 25 | "--checks", 26 | type=int, 27 | help="compile and test the model with the provided amount of inputs", 28 | default=0, 29 | action="store", 30 | ) 31 | 32 | args = parser.parse_args() 33 | 34 | try: 35 | model_proto = onnx.load(args.input_model) 36 | except Exception as e: 37 | print("Error loading ONNX model: ", e) 38 | sys.exit(1) 39 | 40 | variations = [v.strip() for v in args.variations.split(",")] 41 | 42 | try: 43 | result = Generator(model_proto, variations).generate() 44 | except Exception as e: 45 | print("Error generating code: ", e) 46 | sys.exit(2) 47 | 48 | print("Input shapes:", result.input_shapes) 49 | print("Output shapes:", result.output_shapes) 50 | print("Weights size (floats):", result.weights.size) 51 | 52 | path = Path(args.output_folder) 53 | print("Writing files to", path.resolve()) 54 | 55 | path.mkdir(parents=True, exist_ok=True) 56 | c_file = path / "model.cpp" 57 | h_file = path / "model.h" 58 | asm_file = path / "model.asm" 59 | weights_file = path / "weights.bin" 60 | result.weights.tofile(weights_file) 61 | 62 | for file, content in [ 63 | (c_file, result.source_c), 64 | (h_file, result.source_h), 65 | (asm_file, result.source_asm), 66 | ]: 67 | with open(file, "w") as f: 68 | f.write(content) 69 | 70 | if args.checks > 0: 71 | print("Checking model with", args.checks, "random inputs") 72 | 73 | try: 74 | check_model_result(model_proto, result, args.checks) 75 | except Exception as e: 76 | print("Error checking model: ", e) 77 | sys.exit(3) 78 | 79 | print("Done") 80 | 81 | 82 | if __name__ == "__main__": 83 | main() 84 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: [push, pull_request] 4 | 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }} 7 | cancel-in-progress: true 8 | 9 | jobs: 10 | style: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | 15 | - name: Set up Python 16 | uses: actions/setup-python@v4 17 | with: 18 | python-version: "3.10" 19 | cache: "pipenv" 20 | 21 | - name: Install pipenv 22 | run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python 23 | 24 | - name: Install dependencies 25 | run: pipenv install --dev 26 | 27 | - name: Lint with flake8 28 | run: pipenv run flake8 . --count --statistics 29 | 30 | - name: Check formatting with black 31 | run: pipenv run black --check --verbose . 
32 | 33 | test: 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: Install libxsmm 37 | run: | 38 | git clone https://github.com/libxsmm/libxsmm 39 | cd libxsmm 40 | git checkout 4e1aa5332123088916989651ae9b187ecba377dc 41 | make generator 42 | echo "$(pwd)/bin/libxsmm_gemm_generator" 43 | cd .. 44 | 45 | - uses: actions/checkout@v3 46 | 47 | - name: Install gcc and nasm 48 | run: sudo apt-get install -y gcc nasm 49 | 50 | - name: Set up Python 51 | uses: actions/setup-python@v4 52 | with: 53 | python-version: "3.10" 54 | cache: "pipenv" 55 | 56 | - name: Install pipenv 57 | run: curl https://raw.githubusercontent.com/pypa/pipenv/master/get-pipenv.py | python 58 | 59 | - name: Install dependencies 60 | run: pipenv install --dev 61 | 62 | - name: Run tests 63 | run: | 64 | export PATH=$PATH:$(pwd)/libxsmm/bin 65 | pipenv run make test 66 | 67 | deploy_docker_image: 68 | needs: [style, test] 69 | if: github.ref == 'refs/heads/main' 70 | name: Push Docker image to Docker Hub 71 | runs-on: ubuntu-latest 72 | steps: 73 | - uses: actions/checkout@v3 74 | 75 | - name: Log in to Docker Hub 76 | uses: docker/login-action@v2 77 | with: 78 | username: ${{ secrets.DOCKERHUB_USERNAME }} 79 | password: ${{ secrets.DOCKERHUB_TOKEN }} 80 | 81 | - name: Build and push Docker image 82 | uses: docker/build-push-action@v3 83 | with: 84 | push: true 85 | tags: mlomb/onnx2code:latest 86 | -------------------------------------------------------------------------------- /evaluation/find_best_tiling_params.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | import numpy as np 4 | import pandas as pd 5 | from tqdm import tqdm 6 | from itertools import product 7 | 8 | import tensorflow as tf 9 | from measure import measure_all 10 | 11 | from onnx2code.ops.gemm_tiling.GEMM import LoopTilingParams, set_tiling_params 12 | 13 | N = 512 14 | input_shape = (N, N) 15 | 16 | model = tf.keras.Sequential( 17 | [ 18 | tf.keras.Input(shape=input_shape), 19 | tf.keras.layers.Dense(N, activation=None, use_bias=False), 20 | ] 21 | ) 22 | 23 | # nc, kc, mc, mr, nr 24 | nc_options = [N] 25 | kc_options = [64, 128, 256, 512] 26 | mc_options = [64, 128, 256, 512] 27 | mr_options = [2, 4, 8, 16, 32] 28 | nr_options = [2, 4, 8, 16, 32] 29 | mv_options = [2, 4, 8, 16] 30 | nu_options = [2, 4, 8, 16] 31 | 32 | 33 | params = [ 34 | LoopTilingParams(nc=nc, kc=kc, mc=mc, mr=mr, nr=nr, mv=mv, nu=nu) 35 | for nc, kc, mc, mr, nr, mv, nu in product( 36 | nc_options, 37 | kc_options, 38 | mc_options, 39 | mr_options, 40 | nr_options, 41 | mv_options, 42 | nu_options, 43 | ) 44 | ] 45 | 46 | 47 | def is_valid_configuration(params: LoopTilingParams) -> bool: 48 | # dont blame me 49 | try: 50 | assert params.nr % params.nu == 0 51 | assert params.mr % params.mv == 0 52 | 53 | assert params.nc % params.nr == 0 54 | assert params.mc % params.mr == 0 55 | 56 | assert params.kc <= N 57 | 58 | return True 59 | except AssertionError: 60 | return False 61 | 62 | 63 | params = [p for p in params if is_valid_configuration(p)] 64 | results = pd.DataFrame(columns=["nc", "kc", "mc", "mr", "nr", "mv", "nu", "time"]) 65 | 66 | for p in tqdm(params, desc="Tiling params"): 67 | set_tiling_params(p) 68 | 69 | data = measure_all( 70 | model, 71 | variations=["loop-tiling"], 72 | measure_base=False, 73 | runs=300, 74 | tqdm_leave=False, 75 | ) 76 | 77 | assert len(data) == 1 78 | result = data[next(iter(data.keys()))] 79 | # print(f"result: {np.mean(result):.2f}ms") 80 | 81 | entry = { 82 
| "nc": int(p.nc), 83 | "kc": int(p.kc), 84 | "mc": int(p.mc), 85 | "mr": int(p.mr), 86 | "nr": int(p.nr), 87 | "mv": int(p.mv), 88 | "nu": int(p.nu), 89 | "time": np.mean(result), 90 | } 91 | results = pd.concat( 92 | [ 93 | results, 94 | pd.DataFrame.from_records([entry]), 95 | ] 96 | ) 97 | results.to_csv("results.csv", index=False) 98 | -------------------------------------------------------------------------------- /onnx2code/ops/softmax.py: -------------------------------------------------------------------------------- 1 | from typing import Callable 2 | 3 | from onnx2code.util import compute_strides, get_attribute 4 | 5 | from .operation import OpCall, Operation, OpImpl 6 | 7 | 8 | class Softmax(Operation): 9 | """ 10 | Softmax operator 11 | 12 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#softmax 13 | """ 14 | 15 | node_types = {"Softmax"} 16 | 17 | def parse(self) -> None: 18 | assert len(self.inputs) == 1, "expected one input" 19 | assert len(self.outputs) == 1, "expected one output" 20 | 21 | self.X = self.inputs[0] 22 | self.Y = self.outputs[0] 23 | 24 | self.strides = compute_strides(self.X.shape) 25 | self.sizes = self.X.shape.copy() 26 | self.axis = get_attribute(self.node, "axis", -1) 27 | if self.axis < 0: 28 | self.axis += len(self.X.shape) 29 | 30 | def call(self) -> OpCall: 31 | return OpCall( 32 | sig_name="Softmax", 33 | sig_params=[], 34 | inputs=self.inputs, 35 | outputs=self.outputs, 36 | ) 37 | 38 | 39 | @Softmax.variant("c") 40 | class SoftmaxC(Softmax): 41 | def impl(self) -> OpImpl: 42 | strides, sizes, axis = self.strides, self.sizes, self.axis 43 | 44 | labels_size = sizes[axis] 45 | labels_stride = strides[axis] 46 | 47 | del sizes[axis] 48 | del strides[axis] 49 | 50 | NL = "\n" 51 | 52 | def iterate(predicate: Callable[[str], str]) -> str: 53 | iterators = [] 54 | offset = f"i * {labels_stride}" 55 | 56 | for i, size in enumerate(sizes): 57 | iterators.append(f"for (int d{i} = 0; d{i} < {size}; ++d{i}) {{") 58 | offset += f" + d{i} * {strides[i]}" 59 | 60 | return f""" 61 | {NL.join(iterators)} 62 | {predicate(offset)} 63 | {NL.join("}" for _ in iterators)} 64 | """ 65 | 66 | source = iterate( 67 | lambda offset: f""" 68 | float max = -INFINITY; 69 | float sum = 0.0f; 70 | 71 | for (int i = 0; i < {labels_size}; ++i) {{ 72 | max = fmax(max, A[{offset}]); 73 | }} 74 | for (int i = 0; i < {labels_size}; ++i) {{ 75 | OUT[{offset}] = exp(A[{offset}] - max); 76 | sum += OUT[{offset}]; 77 | }} 78 | for (int i = 0; i < {labels_size}; ++i) {{ 79 | OUT[{offset}] /= sum; 80 | }} 81 | """ 82 | ) 83 | 84 | return OpImpl(lang="c", source=source) 85 | -------------------------------------------------------------------------------- /onnx2code/ops/elementwise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from ..util import get_attribute 4 | from .operation import LETTERS, OpCall, Operation, OpImpl 5 | 6 | 7 | class Elementwise(Operation): 8 | """ 9 | Elementwise operators 10 | 11 | For example: ReLU, Tanh, Sigmoid, etc. 
12 | """ 13 | 14 | node_types = {"Relu", "Tanh", "Sigmoid", "Clip", "Sum"} 15 | 16 | def parse(self) -> None: 17 | assert len(self.outputs) == 1, "expected one output" 18 | 19 | for input in self.inputs: 20 | assert ( 21 | input.size == self.outputs[0].size 22 | ), "input and output tensors should have the same size" 23 | 24 | if self.node.op_type == "Clip": 25 | # Clip may have min and max as inputs 26 | # or as attributes (depending on ONNX opset) 27 | break 28 | 29 | self.op: str = self.node.op_type 30 | self.size = self.inputs[0].size 31 | 32 | def call(self) -> OpCall: 33 | return OpCall( 34 | sig_name=self.op, 35 | sig_params=[self.size], 36 | inputs=self.inputs, 37 | outputs=self.outputs, 38 | ) 39 | 40 | 41 | @Elementwise.variant("c") 42 | class ElementwiseC(Elementwise): 43 | def impl(self) -> OpImpl: 44 | impl: str 45 | match self.op: 46 | case "Sum": 47 | impl = "+".join([f"{LETTERS[i]}[i]" for i in range(len(self.inputs))]) 48 | case "Relu": 49 | impl = "A[i] > 0 ? A[i] : 0" 50 | case "Tanh": 51 | impl = "tanh(A[i])" 52 | case "Sigmoid": 53 | impl = "1.0f / (1.0f + exp(-A[i]))" 54 | case "Clip": 55 | if len(self.inputs) == 3: 56 | min_data = self.inputs[1].data 57 | max_data = self.inputs[2].data 58 | 59 | if min_data is None or max_data is None: 60 | raise ValueError("Clip: min and max should be constants") 61 | 62 | # "cast" the 0-dimensional arrays to numbers 63 | min = min_data + 0 64 | max = max_data + 0 65 | else: 66 | finfo = np.finfo(dtype=np.float32) 67 | min = get_attribute(self.node, "min", finfo.min) 68 | max = get_attribute(self.node, "max", finfo.max) 69 | 70 | impl = "A[i] < {} ? {} : (A[i] > {} ? {} : A[i])".format( 71 | min, min, max, max 72 | ) 73 | case _: 74 | raise NotImplementedError(f"ElementwiseC: {self.op}") 75 | 76 | source = f""" 77 | for(int i = 0; i < {self.size}; i++) {{ 78 | OUT[i] = {impl}; 79 | }} 80 | """ 81 | 82 | return OpImpl(lang="c", source=source) 83 | -------------------------------------------------------------------------------- /evaluation/results/analysis.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "R Notebook" 3 | output: html_notebook 4 | --- 5 | 6 | ```{r} 7 | options(warn=-1) 8 | library(ggplot2) 9 | library(extrafont) 10 | 11 | theme_set(theme(text=element_text(family="LM Roman 10"))) 12 | ``` 13 | 14 | ```{r} 15 | datos_6th_skylake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/6th-skylake.csv") 16 | datos_6th_skylake$gen <- rep("Skylake (6th)", nrow(datos_6th_skylake)) 17 | 18 | datos_10th_comet_lake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/10th-comet-lake.csv") 19 | datos_10th_comet_lake$gen <- rep("Comet Lake (10th)", nrow(datos_10th_comet_lake)) 20 | 21 | datos_10th_ice_lake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/10th-ice-lake.csv") 22 | datos_10th_ice_lake$gen <- rep("Ice Lake (10th)", nrow(datos_10th_ice_lake)) 23 | 24 | datos_11th_tiger_lake <- read.csv("C:/Users/mlomb/Desktop/onnx2code/evaluation/results/11th-tiger-lake.csv") 25 | datos_11th_tiger_lake$gen <- rep("Tiger Lake (11th)", nrow(datos_11th_tiger_lake)) 26 | 27 | topN = 500 # nrow(datos_6th_skylake) 28 | 29 | datos <- rbind( 30 | datos_6th_skylake[order(datos_6th_skylake$time, decreasing=F),][1:topN,], 31 | datos_10th_comet_lake[order(datos_10th_comet_lake$time, decreasing=F),][1:topN,], 32 | datos_10th_ice_lake[order(datos_10th_ice_lake$time, decreasing=F),][1:topN,], 33 | datos_11th_tiger_lake[order(datos_11th_tiger_lake$time, 
decreasing=F),][1:topN,] 34 | ) 35 | datos$gen <- factor(datos$gen, levels=c("Skylake (6th)", "Comet Lake (10th)", "Ice Lake (10th)", "Tiger Lake (11th)")) 36 | datos$mrnr <- datos$mr * datos$nr 37 | datos$l1 <- datos$kc * datos$nr * 4 38 | datos$l2 <- datos$kc * datos$mc * 4 39 | datos$l3 <- datos$nc * datos$kc * 4 40 | 41 | filtrados_mrnr <- datos[ 42 | ((datos$gen == "Skylake (6th)" | datos$gen == "Comet Lake (10th)") & datos$mrnr == 64) | 43 | ((datos$gen == "Ice Lake (10th)" | datos$gen == "Tiger Lake (11th)") & datos$mrnr == 128) 44 | ,] 45 | ``` 46 | 47 | 48 | ```{r} 49 | ggplot(datos, aes(x=as.factor((mr*nr)),y=time))+ 50 | geom_boxplot() + 51 | ylab("Tiempo (ms)") + 52 | xlab(expression("m"[r] * " × n"[r])) + 53 | facet_grid(~gen) 54 | ggsave("mr_x_nr.pdf", width = 8, height = 4, device=cairo_pdf) 55 | ``` 56 | 57 | 58 | ```{r} 59 | ggplot(filtrados_mrnr, aes(x=factor(l1, labels=c("2KB","4KB","8KB","16KB","32KB","64KB")),y=time))+ 60 | geom_boxplot() + 61 | ylab("Tiempo (ms)") + 62 | xlab(expression("k"[c] * " × n"[r] * " × 4")) + 63 | facet_grid(~gen) 64 | ggsave("l1.pdf", width = 9, height = 4, device=cairo_pdf) 65 | ``` 66 | 67 | ```{r} 68 | ggplot(filtrados_mrnr, aes(x=factor(l2, labels=c("16KB","32KB","64KB","128KB","256KB","512KB","1MB")),y=time))+ 69 | geom_boxplot() + 70 | ylab("Tiempo (ms)") + 71 | xlab(expression("k"[c] * " × m"[c] * " × 4")) + 72 | facet_grid(~gen) 73 | ggsave("l2.pdf", width = 12, height = 4, device=cairo_pdf) 74 | ``` 75 | 76 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/gpackA.cpp: -------------------------------------------------------------------------------- 1 | // Referencias: 2 | // - [Automating the Last-Mile for High Performance Dense Linear Algebra] Figure 1 ~Ai (verde) 3 | // - https://github.com/michael-lehn/ulmBLAS/blob/191efa54ddb595760353a1ca557a886fa74a864a/ulmblas/level3/pack/gepack.tcc 4 | 5 | // for (1..mp) bloques de mr filas (dentro de las mc filas del panel) 6 | // for (1..kc) columnas del bloque de A (kc) 7 | // for (1..mr) filas de mr (dentro de las mc filas) 8 | 9 | // La matriz A se lee en forma ROW MAJOR (onnx) 10 | // El panel de A se puede pensar como un tensor de 3 dimensiones: (mp, kc, mr) 11 | 12 | // Target (los numeros son los indices originales de A que queremos en el packeado) 13 | // ----------------- -| -| 14 | // | 0 | 2 | 4 | 6 | | | 15 | // | 1 | 3 | 5 | 7 | | mr | 16 | // ----------------- -| | mc 17 | // | 8 | 10| 12| 14| | mr | 18 | // | 9 | 11| 13| 15| | | 19 | // ----------------- -| -| 20 | // |---------------| 21 | // kc 22 | // mp = 2 23 | 24 | template 25 | inline void gpackA_panel( 26 | const float* __restrict__ A, 27 | float* __restrict__ A_panel // mr x kc 28 | ) { 29 | for (int c = 0; c < kc; c++) { 30 | // copy column of mr 31 | for (int r = 0; r < mr; r++) { 32 | A_panel[r] = A[r * StrideRow]; 33 | } 34 | 35 | // advance column 36 | A_panel += mr; 37 | A += StrideCol; 38 | } 39 | } 40 | 41 | template 42 | inline void gpackA( 43 | const float* __restrict__ A, 44 | float* __restrict__ A_panel // mc x kc 45 | ) { 46 | for (int p = 0; p < mc; p += mr) { 47 | gpackA_panel(A, A_panel); 48 | 49 | // advance panel 50 | A_panel += mr * kc; 51 | A += mr * StrideRow; 52 | } 53 | } 54 | 55 | // Edge case 56 | 57 | template 58 | inline void gpackA_panel_edge( 59 | int _kc, 60 | int _mr, 61 | const float* __restrict__ A, 62 | float* __restrict__ A_panel // mr x kc 63 | ) { 64 | for (int c = 0; c < _kc; c++) { 65 | // copy column of mr 66 | for (int 
r = 0; r < _mr; r++) { 67 | A_panel[r] = A[r * StrideRow]; 68 | } 69 | 70 | // advance column 71 | A_panel += mr; 72 | A += StrideCol; 73 | } 74 | } 75 | template 76 | inline void gpackA_edge( 77 | int _kc, 78 | int _mc, 79 | const float* __restrict__ A, 80 | float* __restrict__ A_panel // mc x kc 81 | ) { 82 | const int MP = _mc / mr; 83 | const int MPl = _mc % mr; 84 | 85 | memset(A_panel, 0, mc * kc * sizeof(float)); 86 | 87 | for (int p = 0; p < MP; p++) { 88 | gpackA_panel_edge(_kc, mr, A, A_panel); 89 | 90 | // advance panel 91 | A_panel += mr * kc; 92 | A += mr * StrideRow; 93 | } 94 | 95 | if (MPl != 0) { 96 | gpackA_panel_edge(_kc, MPl, A, A_panel); 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /onnx2code/ops/broadcastable.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import numpy as np 4 | 5 | from .operation import OpCall, Operation, OpImpl 6 | 7 | 8 | class Broadcastable(Operation): 9 | """ 10 | Broadcastable operators like Add, Sub, etc. 11 | 12 | https://github.com/onnx/onnx/blob/master/docs/Broadcasting.md 13 | """ 14 | 15 | node_types = {"Add", "Div", "Mul", "Sub"} 16 | 17 | def parse(self) -> None: 18 | assert len(self.inputs) == 2, "expected two inputs" 19 | assert len(self.outputs) == 1, "expected one output" 20 | 21 | self.op: str = self.node.op_type 22 | self.b_is_scalar = self.inputs[1].size == 1 23 | self.input_A = self.inputs[0] 24 | self.input_B = self.inputs[1] 25 | 26 | def call(self) -> OpCall: 27 | return OpCall( 28 | sig_name=self.op, 29 | sig_params=[self.input_A.shape, self.input_B.shape], 30 | inputs=self.inputs, 31 | outputs=self.outputs, 32 | ) 33 | 34 | 35 | @Broadcastable.variant("c") 36 | class BroadcastableC(Broadcastable): 37 | def impl(self) -> OpImpl: 38 | source = "" 39 | 40 | symbol = { 41 | "Add": "+", 42 | "Div": "/", 43 | "Mul": "*", 44 | "Sub": "-", 45 | }[self.op] 46 | 47 | if self.b_is_scalar: 48 | source += f""" 49 | const float D = B[0]; 50 | for (int i = 0; i < {self.inputs[0].size}; i++) {{ 51 | OUT[i] = A[i] {symbol} D; 52 | }} 53 | """ 54 | else: 55 | # since we are using the trick below, we can't tell beforehand if 56 | # implementations will differ for every pair of input shapes 57 | # so we add salt so implementations dont collide 58 | source += f"// broadcasting {self.input_A.shape_str()} with {self.input_B.shape_str()}\n" 59 | 60 | # we use nditer to generate the for loops for the broadcastable ops 61 | # it is a bit of a hack, but it works and it hides the complexity of 62 | # broadcasting :) 63 | 64 | a = np.arange(start=0, stop=self.input_A.size).reshape(self.input_A.shape) 65 | b = np.arange(start=0, stop=self.input_B.size).reshape(self.input_B.shape) 66 | offset = 0 67 | 68 | for x, y in np.nditer([a, b], flags=["external_loop"], order="C"): 69 | assert x.size == y.size, "nditer size expected to match" 70 | size = x.size 71 | 72 | # ARBITRARY ASSUMPTIONS I AM MAKING: 73 | def is_consecutive(z: Any) -> Any: 74 | return z[z.size - 1] - z[0] == z.size - 1 75 | 76 | x_is_consecutive = is_consecutive(x) 77 | x_is_all_equal = x[0] == x[x.size - 1] 78 | y_is_consecutive = is_consecutive(y) 79 | y_is_all_equal = y[0] == y[y.size - 1] 80 | assert ( 81 | x_is_all_equal or x_is_consecutive 82 | ), "nditer x expected to be all equal or consecutive" 83 | assert ( 84 | y_is_all_equal or y_is_consecutive 85 | ), "nditer y expected to be all equal or consecutive" 86 | 87 | A_index = f"{x[0]}" + (" + i" if 
x_is_consecutive else "") 88 | B_index = f"{y[0]}" + (" + i" if y_is_consecutive else "") 89 | 90 | source += f"for(int i = 0; i < {size}; i++) OUT[{offset} + i] = A[{A_index}] {symbol} B[{B_index}];\n" 91 | offset += size 92 | 93 | return OpImpl(lang="c", source=source) 94 | -------------------------------------------------------------------------------- /onnx2code/tensor.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from dataclasses import dataclass 3 | from functools import reduce 4 | from typing import Literal 5 | 6 | import numpy as np 7 | import onnx 8 | from numpy.typing import NDArray 9 | 10 | from .util import get_model_inputs, get_shape_from_value_info_proto 11 | 12 | TensorData = NDArray[np.float32] 13 | TensorTag = Literal["input", "output", "weight", "intermediate", "welded"] 14 | 15 | 16 | @dataclass 17 | class TensorInfo: 18 | name: str 19 | tag: TensorTag 20 | shape: list[int] 21 | size: int 22 | data: TensorData | None 23 | variable: str 24 | 25 | def shape_str(self, sep: str = "x") -> str: 26 | return sep.join(map(str, self.shape)) 27 | 28 | @staticmethod 29 | def from_value( 30 | value_info: onnx.ValueInfoProto, 31 | tag: TensorTag, 32 | var_index: int, 33 | model_proto: onnx.ModelProto, 34 | ) -> "TensorInfo": 35 | """ 36 | Parses a ValueInfo and returns the tensor 37 | """ 38 | name = value_info.name 39 | shape = get_shape_from_value_info_proto(value_info) 40 | data: TensorData | None = None 41 | 42 | for node in model_proto.graph.node: 43 | if node.op_type == "Constant" and node.output[0] == name: 44 | data = np.array(node.attribute[0].t.float_data) 45 | tag = "weight" 46 | 47 | return TensorInfo( 48 | name=name, 49 | tag=tag, 50 | shape=shape, 51 | size=reduce(operator.mul, shape, 1), 52 | data=data, 53 | variable=f"T{var_index}", 54 | ) 55 | 56 | @staticmethod 57 | def from_initializer(initializer: onnx.TensorProto, var_index: int) -> "TensorInfo": 58 | """ 59 | Parses a TensorProto and returns the tensor 60 | """ 61 | shape = [dim for dim in initializer.dims] 62 | data = onnx.numpy_helper.to_array(initializer) # type: ignore 63 | assert data is not None, "data should not be None" 64 | assert list(data.shape) == shape, "Tensor shape and data shape should match" 65 | return TensorInfo( 66 | name=initializer.name, 67 | tag="weight", 68 | shape=shape, 69 | size=reduce(operator.mul, shape, 1), 70 | data=data, 71 | variable=f"T{var_index}", 72 | ) 73 | 74 | 75 | def parse_tensors(model_proto: onnx.ModelProto) -> list[TensorInfo]: 76 | """ 77 | Reads ALL tensors and store them in a manegeable format 78 | input, output, intermediate and constant tensors 79 | """ 80 | tensors: list[TensorInfo] = [] 81 | 82 | # input 83 | tensors.extend( 84 | TensorInfo.from_value(vi, "input", i, model_proto) 85 | for i, vi in enumerate(get_model_inputs(model_proto), start=0) 86 | ) 87 | 88 | # output 89 | tensors.extend( 90 | TensorInfo.from_value(vi, "output", i, model_proto) 91 | for i, vi in enumerate(model_proto.graph.output, start=len(tensors)) 92 | ) 93 | 94 | # intermediate 95 | tensors.extend( 96 | TensorInfo.from_value(vi, "intermediate", i, model_proto) 97 | for i, vi in enumerate(model_proto.graph.value_info, start=len(tensors)) 98 | ) 99 | 100 | # constant 101 | tensors.extend( 102 | TensorInfo.from_initializer(initializer, i) 103 | for i, initializer in enumerate( 104 | model_proto.graph.initializer, start=len(tensors) 105 | ) 106 | ) 107 | 108 | return tensors 109 | 
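# Minimal usage sketch (illustrative; "model.onnx" is a hypothetical path):
# parse_tensors lists every input, output, intermediate and weight tensor of a
# loaded model together with its tag, shape and assigned variable name.
if __name__ == "__main__":
    model = onnx.load("model.onnx")
    for t in parse_tensors(model):
        print(f"{t.variable}: {t.name} ({t.tag}) shape={t.shape_str()} size={t.size}")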
-------------------------------------------------------------------------------- /onnx2code/ops/pooling.py: -------------------------------------------------------------------------------- 1 | from onnx2code.util import compute_strides, get_attribute 2 | 3 | from .operation import OpCall, Operation, OpImpl 4 | 5 | 6 | class Pooling(Operation): 7 | """ 8 | MaxPool, AveragePool operators 9 | 10 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#maxpool 11 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#averagepool 12 | """ 13 | 14 | node_types = {"MaxPool", "AveragePool"} 15 | 16 | def parse(self) -> None: 17 | assert len(self.inputs) == 1, "expected one input" 18 | assert len(self.outputs) == 1, "expected one output" 19 | 20 | count_include_pad = get_attribute(self.node, "count_include_pad", 0) 21 | if count_include_pad != 0: 22 | raise NotImplementedError("only support count_include_pad=0") 23 | 24 | self.op: str = self.node.op_type 25 | self.X = self.inputs[0] 26 | self.Y = self.outputs[0] 27 | 28 | self.pads = get_attribute(self.node, "pads", [0] * len(self.X.shape) * 2) 29 | self.strides = get_attribute(self.node, "strides", [1] * len(self.X.shape)) 30 | 31 | kernel_shape = get_attribute(self.node, "kernel_shape", [1] * len(self.X.shape)) 32 | 33 | self.KH = kernel_shape[0] 34 | self.KW = kernel_shape[1] 35 | 36 | def call(self) -> OpCall: 37 | return OpCall( 38 | sig_name=self.op, 39 | sig_params=[self.X.shape, [self.KW, self.KH], self.strides, self.pads], 40 | inputs=self.inputs, 41 | outputs=self.outputs, 42 | ) 43 | 44 | 45 | @Pooling.variant("c") 46 | class PoolingC(Pooling): 47 | def impl(self) -> OpImpl: 48 | KH, KW = self.KH, self.KW 49 | 50 | H = self.X.shape[2] 51 | W = self.X.shape[3] 52 | 53 | pads_start = [self.pads[0], self.pads[1]] 54 | # pads_end = [self.pads[2], self.pads[3]] 55 | 56 | input_strides = compute_strides(self.X.shape) 57 | output_strides = compute_strides(self.Y.shape) 58 | 59 | source = f""" 60 | // start position of kernel 61 | for(int c = 0; c < {self.Y.shape[1]}; c++) {{ 62 | for(int h = 0; h < {self.Y.shape[2]}; h++) {{ 63 | for(int w = 0; w < {self.Y.shape[3]}; w++) {{ 64 | float acc = {'-INFINITY' if self.op == "MaxPool" else "0.0f"}; 65 | int count = 0; 66 | 67 | // position in kernel 68 | for(int hh = 0; hh < {KH}; hh++) {{ 69 | for(int ww = 0; ww < {KW}; ww++) {{ 70 | const int ih = {-pads_start[0]} + (h * {self.strides[0]}) + hh; 71 | const int iw = {-pads_start[1]} + (w * {self.strides[1]}) + ww; 72 | if(ih >= 0 && ih < {H} && iw >= 0 && iw < {W}) {{ 73 | const float val = A[ 74 | c * {input_strides[1]} + 75 | ih * {input_strides[2]} + 76 | iw * {input_strides[3]} 77 | ]; 78 | acc = {'acc > val ? 
acc : val' if self.op == "MaxPool" else 'acc + val'}; 79 | count++; 80 | }} 81 | }} 82 | }} 83 | OUT[ 84 | c * {output_strides[1]} + 85 | h * {output_strides[2]} + 86 | w * {output_strides[3]} 87 | ] = acc{"" if self.op == "MaxPool" else "/(float)count"}; 88 | }} 89 | }} 90 | }} 91 | """ 92 | 93 | return OpImpl(lang="c", source=source) 94 | -------------------------------------------------------------------------------- /evaluation/results_conv/6th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-conv-naive,0.5335040000000001,0.13929143255778512 3 | 256,onnx2code-im2col,0.9039860100000002,0.3674661125795002 4 | 256,tensorflow,4.80589202,0.683442539188438 5 | 256,onnxruntime,0.507659,0.11556475508994946 6 | 288,onnx2code-conv-naive,0.5146580000000001,0.07770419059484503 7 | 288,onnx2code-im2col,1.7917450100000003,1.8639875966218524 8 | 288,tensorflow,6.35994402,2.949189984354053 9 | 288,onnxruntime,0.61280601,0.1191525637822783 10 | 320,onnx2code-conv-naive,0.634851,0.09292494659132175 11 | 320,onnx2code-im2col,1.9863670100000002,0.44807282247829977 12 | 320,tensorflow,7.29074003,1.0714042269534076 13 | 320,onnxruntime,0.7684680000000002,0.13833235693791962 14 | 352,onnx2code-conv-naive,0.807433,0.13366325452793673 15 | 352,onnx2code-im2col,2.51028901,0.38476206666098195 16 | 352,tensorflow,8.64401896,1.1966611925806563 17 | 352,onnxruntime,0.9582860000000001,0.18548641676413935 18 | 384,onnx2code-conv-naive,0.9626979899999999,0.17574935067296807 19 | 384,onnx2code-im2col,3.4803209899999996,0.4532894176918648 20 | 384,tensorflow,11.220825960000004,1.3980038878057952 21 | 384,onnxruntime,1.4973109899999997,0.4829659531221946 22 | 416,onnx2code-conv-naive,1.38213199,0.21410341375291028 23 | 416,onnx2code-im2col,3.6922599899999993,0.4935615735054846 24 | 416,tensorflow,11.50879395,1.3834223381706137 25 | 416,onnxruntime,1.41198999,0.26554929743422384 26 | 448,onnx2code-conv-naive,1.4111139900000003,0.2204240278968922 27 | 448,onnx2code-im2col,4.43189997,0.6188949241091165 28 | 448,tensorflow,13.2603729,1.8157711212087528 29 | 448,onnxruntime,1.73833399,0.3230000907905289 30 | 480,onnx2code-conv-naive,1.5975139900000002,0.24468945613223694 31 | 480,onnx2code-im2col,5.15567996,0.6334665079184837 32 | 480,tensorflow,15.11004689,1.854034459133766 33 | 480,onnxruntime,1.9667179799999996,0.3832841641820591 34 | 512,onnx2code-conv-naive,1.8798239899999998,0.23651916821367755 35 | 512,onnx2code-im2col,6.38169198,0.6658296137854636 36 | 512,tensorflow,17.81744493,4.1299519575648045 37 | 512,onnxruntime,2.5316899900000003,1.1681892520836725 38 | 544,onnx2code-conv-naive,2.0533539899999997,0.2816379399709668 39 | 544,onnx2code-im2col,6.51396297,0.5585122655670234 40 | 544,tensorflow,19.55571191,2.396831020647522 41 | 544,onnxruntime,2.6647249700000004,0.5108897333422635 42 | 576,onnx2code-conv-naive,2.3157639800000003,0.2005958408356953 43 | 576,onnx2code-im2col,7.40451691,0.6117355805461555 44 | 576,tensorflow,23.915504730000002,4.28320132070323 45 | 576,onnxruntime,5.273924940000001,2.711525277159713 46 | 608,onnx2code-conv-naive,2.9795849700000003,0.44997838777004523 47 | 608,onnx2code-im2col,9.95935502,1.315584263856489 48 | 608,tensorflow,26.85310105,3.405717858857798 49 | 608,onnxruntime,3.9157660100000005,0.8325832152179685 50 | 640,onnx2code-conv-naive,3.27329201,0.6007385518494464 51 | 640,onnx2code-im2col,10.205583019999999,0.776620449798806 52 | 640,tensorflow,28.46991395,2.150791806086179 53 | 
640,onnxruntime,3.9682809899999993,0.7603661038780134 54 | 672,onnx2code-conv-naive,3.2548249899999995,0.3024018106088486 55 | 672,onnx2code-im2col,10.301164980000001,0.667454295158567 56 | 672,tensorflow,29.82736495,2.841980785533137 57 | 672,onnxruntime,4.14581799,0.5737991852890434 58 | 704,onnx2code-conv-naive,3.8885129900000006,0.7703979854719701 59 | 704,onnx2code-im2col,12.37283298,0.7178972353382757 60 | 704,tensorflow,37.103216950000004,9.318760829180064 61 | 704,onnxruntime,5.105472,1.2238957019354224 62 | 736,onnx2code-conv-naive,3.9390469900000005,0.5133510699725383 63 | 736,onnx2code-im2col,12.53659398,1.2395910685729303 64 | 736,tensorflow,34.75010769000001,3.339327976403812 65 | 736,onnxruntime,4.972059880000001,0.629258674413397 66 | 768,onnx2code-conv-naive,4.302181900000001,0.46374701565885035 67 | 768,onnx2code-im2col,13.454682659999996,0.8720942521698353 68 | 768,tensorflow,38.50100595,4.069915104293922 69 | 768,onnxruntime,5.67756882,0.8204209666372427 70 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm_tiling/gemm.cpp: -------------------------------------------------------------------------------- 1 | template < 2 | // matrix sizes 3 | int M, 4 | int K, 5 | int N, 6 | 7 | int nc, // Columnas de panel de B 8 | int kc, // Filas de panel de B 9 | int mc, // Filas de bloque de A 10 | 11 | int mr, // Filas de microkernel 12 | int nr, // Columnas de microkernel 13 | 14 | int mv, // Filas de unit update 15 | int nu // Columnas de unit update 16 | > 17 | void gemm( 18 | const float* __restrict__ A, // MxK 19 | const float* __restrict__ B, // KxN 20 | float* __restrict__ OUT // MxN 21 | ) { 22 | memset(OUT, 0, M * N * sizeof(float)); 23 | 24 | float A_block[(mc + mr) * kc]; 25 | float B_panel[(nc + nr) * kc]; 26 | 27 | float AB_microkernel[mr * nr]; 28 | 29 | for (int jc = 0; jc < N; jc += nc) { 30 | int _nc = min(N - jc, nc); // evitar que se pase "matrices grandes?" 31 | 32 | for (int pc = 0; pc < K; pc += kc) { 33 | int _kc = min(K - pc, kc); // evitar que se pase el panel 34 | 35 | if (_kc < kc || _nc < nc || true) { 36 | gpackB_edge(_kc, _nc, (float*)B + pc * N + jc, B_panel); 37 | } else { 38 | gpackB((float*)B + pc * N + jc, B_panel); 39 | } 40 | 41 | for (int ic = 0; ic < M; ic += mc) { 42 | int _mc = min(M - ic, mc); // evitar que se pase el panel 43 | 44 | if (_kc < kc || _mc < mc) { 45 | gpackA_edge(_kc, _mc, (float*)A + ic * K + pc, A_block); 46 | } else { 47 | gpackA((float*)A + ic * K + pc, A_block); 48 | } 49 | 50 | // fprintf(stderr, "jc=%d pc=%d ic=%d _kc=%d _nc=%d, _mc=%d\n", jc, pc, ic, _kc, _nc, _mc); 51 | 52 | for (int jr = 0; jr < _nc; jr += nr) { // jr es el offset del sliver de ancho nr (violeta) 53 | for (int ir = 0; ir < _mc; ir += mr) { // ir es el offset del sliver de ancho mr (verde) 54 | // (_mr x kc) * (kc x _nr) 55 | 56 | const float* A_kernel = A_block + ir * kc; // (mr x kc) column major 57 | const float* B_kernel = B_panel + jr * kc; // (kc x nr) row major 58 | 59 | // ref_microkernel(A_kernel, B_kernel, AB_microkernel); 60 | 61 | memset(AB_microkernel, 0, mr * nr * sizeof(float)); 62 | test_microkernel(_kc, A_kernel, B_kernel, AB_microkernel); 63 | 64 | // TODO: pasar _mr y _nr para evitar escribir fuera en C 65 | // quizas un branch entre optimized y ref? 
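// _mr/_nr below clamp the microkernel tile at the right/bottom edges of the
// current block, so the write-back loops never touch elements outside OUT.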
66 | int _nr = min(_nc - jr, nr); // evitar que se pase el bloque 67 | int _mr = min(_mc - ir, mr); // evitar que se pase el bloque 68 | 69 | // assert(_mr == mr); 70 | // assert(_nr == nr); 71 | 72 | float* C_writeback = (float*)OUT + (ic + ir) * N + (jc + jr); 73 | 74 | if (_mr == mr && _nr == nr) { 75 | // Versión optimizada 76 | for (int i = 0; i < mr; i++) { 77 | for (int j = 0; j < nr; j++) { 78 | C_writeback[i * N + j] += AB_microkernel[i * nr + j]; 79 | } 80 | } 81 | } else { 82 | // Edge case 83 | for (int i = 0; i < _mr; i++) { 84 | for (int j = 0; j < _nr; j++) { 85 | C_writeback[i * N + j] += AB_microkernel[i * nr + j]; 86 | } 87 | } 88 | } 89 | } 90 | } 91 | } 92 | } 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /tests/test_zoo.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Any 3 | 4 | import onnx 5 | import pytest 6 | 7 | from onnx2code.checker import check_model 8 | from tests.zoo import download_from_zoo, zoo_manifest 9 | 10 | # To avoid downloading big models we know are going to fail 11 | EXCLUDED_MODELS = { 12 | "Works but is too slow": ["ResNet101-DUC-7.onnx", "ResNet101-DUC-12.onnx"], 13 | "Operation GlobalAveragePool not implemented": [ 14 | "resnet101-v1-7.onnx", 15 | "resnet101-v2-7.onnx", 16 | "resnet152-v1-7.onnx", 17 | "resnet152-v2-7.onnx", 18 | "resnet50-v1-12.onnx", 19 | "resnet18-v1-7.onnx", 20 | "resnet18-v2-7.onnx", 21 | "resnet34-v1-7.onnx", 22 | "resnet34-v2-7.onnx", 23 | "resnet50-v1-7.onnx", 24 | "resnet50-v2-7.onnx", 25 | "densenet-12.onnx", 26 | "densenet-7.onnx", 27 | "densenet-8.onnx", 28 | "densenet-9.onnx", 29 | ], 30 | "Operation LRN not implemented": [ 31 | "rcnn-ilsvrc13-7.onnx", 32 | "rcnn-ilsvrc13-8.onnx", 33 | "rcnn-ilsvrc13-9.onnx", 34 | "bvlcalexnet-12.onnx", 35 | "bvlcalexnet-7.onnx", 36 | "bvlcalexnet-8.onnx", 37 | "bvlcalexnet-9.onnx", 38 | "caffenet-12.onnx", 39 | "caffenet-7.onnx", 40 | "caffenet-8.onnx", 41 | "caffenet-9.onnx", 42 | "googlenet-12.onnx", 43 | "googlenet-7.onnx", 44 | "googlenet-8.onnx", 45 | "googlenet-9.onnx", 46 | "inception-v1-12.onnx", 47 | "inception-v1-7.onnx", 48 | "inception-v1-8.onnx", 49 | "inception-v1-9.onnx", 50 | "zfnet512-12.onnx", 51 | "zfnet512-7.onnx", 52 | "zfnet512-8.onnx", 53 | "zfnet512-9.onnx", 54 | ], 55 | "Operation Pad not implemented": [ 56 | "candy-8.onnx", 57 | "candy-9.onnx", 58 | "mosaic-8.onnx", 59 | "mosaic-9.onnx", 60 | "pointilism-8.onnx", 61 | "pointilism-9.onnx", 62 | "rain-princess-8.onnx", 63 | "rain-princess-9.onnx", 64 | "udnie-8.onnx", 65 | "udnie-9.onnx", 66 | ], 67 | "Operation Resize not implemented": [ 68 | "FasterRCNN-10.onnx", 69 | "fcn-resnet101-11.onnx", 70 | "fcn-resnet50-11.onnx", 71 | "fcn-resnet50-12.onnx", 72 | "MaskRCNN-10.onnx", 73 | ], 74 | "Broken": ["FasterRCNN-12.onnx", "MaskRCNN-12.onnx"], 75 | } 76 | 77 | 78 | def idfn(model: Any) -> str: 79 | return Path(model["model_path"]).name 80 | 81 | 82 | def check_io_is_float(model: Any) -> None: 83 | io = model["metadata"]["io_ports"] 84 | 85 | for input in io["inputs"]: 86 | if input["type"] != "tensor(float)": 87 | raise NotImplementedError(f"No support for IO port type {input['type']}") 88 | 89 | 90 | @pytest.mark.parametrize("variation", ["c"]) 91 | @pytest.mark.parametrize("model", zoo_manifest(), ids=idfn) 92 | def test_zoo(model: Any, variation: str) -> None: 93 | # avoid downloading big models! 
94 | 95 | # manual exclusion 96 | for reason, models in EXCLUDED_MODELS.items(): 97 | if idfn(model) in models: 98 | pytest.skip(reason) 99 | 100 | # opset unsupported 101 | if model["opset_version"] < 7: 102 | pytest.skip("Opset version < 7") 103 | 104 | # incompatible I/O 105 | try: 106 | check_io_is_float(model) 107 | except NotImplementedError as e: 108 | pytest.skip(e.__str__()) 109 | 110 | # model is quantized 111 | if "int8" in model["model"] or "qdq" in model["model"]: 112 | pytest.skip("Quantized models are not supported") 113 | 114 | model_path = download_from_zoo( 115 | model["model_path"], model["metadata"]["model_bytes"] 116 | ) 117 | model_proto = onnx.load(model_path.__str__()) 118 | 119 | try: 120 | check_model(model_proto, [variation]) 121 | except NotImplementedError as e: 122 | pytest.skip(e.__str__()) 123 | -------------------------------------------------------------------------------- /evaluation/measure.py: -------------------------------------------------------------------------------- 1 | import setup # noqa # isort:skip 2 | 3 | from time import perf_counter_ns 4 | 5 | import numpy as np 6 | import onnx 7 | import onnxruntime 8 | import tensorflow as tf 9 | import tf2onnx 10 | from tqdm import tqdm 11 | 12 | from onnx2code.generator import Generator 13 | from onnx2code.result import ModelResult 14 | from onnx2code.service import ModelService, TensorsMap 15 | 16 | 17 | def measure_tf( 18 | tf_model: tf.keras.Model, 19 | inputs: TensorsMap, 20 | runs: int, 21 | tqdm_leave: bool = True, 22 | ) -> list[int]: 23 | times = [] 24 | 25 | # ⚠️ Make sure to use graph execution and NOT eager execution 26 | graph_model = tf.function(tf_model) 27 | 28 | for _ in tqdm(range(runs), desc="tensorflow", leave=tqdm_leave): 29 | start = perf_counter_ns() 30 | graph_model(inputs) 31 | end = perf_counter_ns() 32 | times.append(end - start) 33 | 34 | return times 35 | 36 | 37 | def measure_onnxruntime( 38 | model_proto: onnx.ModelProto, 39 | inputs: TensorsMap, 40 | runs: int, 41 | tqdm_leave: bool = True, 42 | ) -> list[int]: 43 | times = [] 44 | ort_sess = onnxruntime.InferenceSession(model_proto.SerializeToString()) 45 | 46 | for _ in tqdm(range(runs), desc="onnxruntime", leave=tqdm_leave): 47 | start = perf_counter_ns() 48 | ort_sess.run(None, inputs) 49 | end = perf_counter_ns() 50 | times.append(end - start) 51 | 52 | return times 53 | 54 | 55 | def measure_onnx2code( 56 | model_result: ModelResult, 57 | inputs: TensorsMap, 58 | runs: int, 59 | variation_name: str = "", 60 | tqdm_leave: bool = True, 61 | ) -> list[int]: 62 | times = [] 63 | 64 | with ModelService(model_result) as service: 65 | for _ in tqdm( 66 | range(runs), 67 | desc="onnx2code" if not variation_name else f"onnx2code-{variation_name}", 68 | leave=tqdm_leave, 69 | ): 70 | start = perf_counter_ns() 71 | service.inference(inputs) 72 | end = perf_counter_ns() 73 | times.append(end - start) 74 | 75 | return times 76 | 77 | 78 | def measure_all( 79 | tf_model: tf.keras.Model, 80 | runs: int = 300, 81 | variations: list[str] = [], 82 | *, 83 | measure_base: bool = True, 84 | tqdm_leave: bool = True, 85 | onnx_model: onnx.ModelProto | None = None, 86 | ) -> dict[str, list[float]]: 87 | """ 88 | Measure the inference time of the given model in tf, onnxruntime and onnx2code. 89 | 90 | Time in milliseconds. 
91 | """ 92 | if tf_model is not None: 93 | model_proto, _ = tf2onnx.convert.from_keras(tf_model) 94 | # onnx.save(model_proto, "debug.onnx") 95 | else: 96 | model_proto = onnx_model 97 | 98 | warmup_runs = int(min(100, max(5, runs * 0.1))) 99 | total = runs + warmup_runs 100 | 101 | def postprocess(times_in_ns: list[int]) -> list[float]: 102 | return [t / 1_000_000 for t in times_in_ns[warmup_runs:]] 103 | 104 | results: dict[str, list[float]] = {} 105 | 106 | for variation in variations: 107 | model_variation = Generator(model_proto, variations=[variation]).generate() 108 | # print(model_variation.source_c) 109 | 110 | inputs = { 111 | name: np.random.random_sample(shape).astype(np.float32) * 2 - 1 112 | for name, shape in model_variation.input_shapes.items() 113 | } 114 | 115 | results[f"onnx2code-{variation}"] = postprocess( 116 | measure_onnx2code( 117 | model_variation, inputs, total, variation, tqdm_leave=tqdm_leave 118 | ) 119 | ) 120 | 121 | return results | ( 122 | { 123 | "tensorflow": postprocess( 124 | measure_tf(tf_model, inputs, total, tqdm_leave=tqdm_leave) 125 | ), 126 | "onnxruntime": postprocess( 127 | measure_onnxruntime(model_proto, inputs, total, tqdm_leave=tqdm_leave) 128 | ), 129 | } 130 | if measure_base 131 | else {} 132 | ) 133 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # onnx2code 2 | 3 | Generate plain C++ code for inference of ONNX models without dependencies 4 | 5 | This project was made as an alternative to a final exam for the assignment "Computer Organization II". You can read the writeup in [docs/TP Final onnx2code.pdf](docs/TP%20Final%20onnx2code.pdf) (in Spanish). 6 | 7 | ## Model support 8 | 9 | The following models have been tested and work as expected. 10 | 11 | | Model | Size | 12 | |---|---| 13 | | [mnist](https://github.com/onnx/models/tree/main/vision/classification/mnist) | 26 KB | 14 | | [Super_Resolution](https://github.com/onnx/models/tree/main/vision/super_resolution/sub_pixel_cnn_2016) | 240 KB | 15 | | [squeezenet1.1](https://github.com/onnx/models/tree/main/vision/classification/squeezenet) | 9 MB | 16 | | [emotion_ferplus](https://github.com/onnx/models/tree/main/vision/body_analysis/emotion_ferplus) | 34 MB | 17 | | [inception-v2](https://github.com/onnx/models/tree/main/vision/classification/inception_and_googlenet/inception_v2) | 44 MB | 18 | | [resnet50-caffe2-v1](https://github.com/onnx/models/tree/main/vision/classification/resnet) | 98 MB | 19 | | [VGG 16 and VGG 16-bn](https://github.com/onnx/models/tree/main/vision/classification/vgg) | 527 MB | 20 | | [VGG 19 and VGG 19-bn](https://github.com/onnx/models/tree/main/vision/classification/vgg) | 548 MB | 21 | | [VGG 19-caffe2](https://github.com/onnx/models/tree/main/vision/classification/vgg) | 561 MB | 22 | 23 | * Minimum ONNX opset version: **7** 24 | * Quantized models are not supported 25 | 26 | ## Operator support 27 | 28 | Only `float` data type is supported. 29 | 30 | | Operator | Attribute support | 31 | |---|---| 32 | | [Add](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Add), [Div](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Div), [Mul](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Mul), [Sub](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sub) | ✅ with broadcasting | 33 | | [Concat](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Concat) | ✅ with multiple inputs
✅ axis | 34 | | [Conv](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Conv) | ✅ bias
✅ stride
✅ padding (and `auto_pad`)
❌ dilations
❌ depthwise (group != 1) | 35 | | [Sum](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sum) | ✅ with multiple inputs
❌ with broadcasting | 36 | | [Relu](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Relu), [Tanh](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Tanh), [Sigmoid](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Sigmoid), [Clip](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Clip) | ✅ | 37 | | [Gemm](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Gemm) | ✅ with bias
❌ transpose A
✅ transpose B
❌ alpha != 1
❌ beta != 1 | 38 | | [Identity](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Identity) | ✅ | 39 | | [MaxPool](https://github.com/onnx/onnx/blob/main/docs/Operators.md#MaxPool), [AveragePool](https://github.com/onnx/onnx/blob/main/docs/Operators.md#AveragePool) | ✅ stride
✅ padding (and `auto_pad`)
❌ dilations
❌ storage_order != 0
❌ count_include_pad != 0 | 40 | [Softmax](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Softmax) |
✅ axis | 41 | | [Transpose](https://github.com/onnx/onnx/blob/main/docs/Operators.md#Transpose) | ✅ perm | 42 | 43 | 44 | ## Setting up with Docker 45 | 46 | We provide a ready to use [Docker image](https://hub.docker.com/r/mlomb/onnx2code): 47 | 48 | ```sh 49 | docker run --rm -it -v $pwd/mnist.onnx:/app/input.onnx:ro -v $pwd/output:/app/output:rw mlomb/onnx2code:latest --variations=im2col,loop-tiling --checks=3 50 | ``` 51 | 52 | The command above will generate C++ code for the `mnist.onnx` model in the `output` folder. 53 | 54 | ## Setting up locally 55 | 56 | ### Prerequisites 57 | 58 | * gcc (required if checking models) 59 | * Python 3.10 60 | * [pipenv](https://pypi.org/project/pipenv/) 61 | 62 | Clone and install dependencies with `pipenv install`. 63 | 64 | ### Run 65 | 66 | To generate code from an ONNX model, run the following command inside a pipenv shell: 67 | 68 | ```sh 69 | python -m onnx2code --variation=im2col,loop-tiling mnist.onnx output_folder --checks=3 70 | ``` 71 | -------------------------------------------------------------------------------- /onnx2code/memory.py: -------------------------------------------------------------------------------- 1 | import math 2 | from dataclasses import dataclass 3 | 4 | # We implement different memory strategies used in TFLite 5 | # Since we are using CPU we aim to the Memory Offset Calculation approach 6 | # 7 | # See: 8 | # * https://arxiv.org/pdf/2001.03288.pdf (main paper) 9 | # * https://blog.tensorflow.org/2020/10/optimizing-tensorflow-lite-runtime.html (blog post) 10 | # * https://github.com/tensorflow/tensorflow/blob/1b36c9fb27ce899e19ddf65da3c0920861210472/tensorflow/lite/delegates/gpu/common/memory_management (ref code) 11 | 12 | 13 | @dataclass 14 | class TensorUsageRecord: 15 | first_op: int 16 | last_op: int 17 | size: int 18 | index: int = -1 # used to store the original index after sorting 19 | 20 | 21 | Records = list[TensorUsageRecord] 22 | Offsets = list[int | None] 23 | Result = tuple[int, Offsets] 24 | 25 | 26 | ########################## 27 | # Naive 28 | ########################## 29 | def naive(records: Records) -> Result: 30 | total_consumption = 0 31 | offsets: Offsets = [None] * len(records) 32 | 33 | for i, r in enumerate(records): 34 | offsets[i] = total_consumption 35 | total_consumption += r.size 36 | 37 | return total_consumption, offsets 38 | 39 | 40 | ########################## 41 | # Greed by Size 42 | # 43 | # TFLite C impl: https://github.com/tensorflow/tensorflow/blob/1b36c9fb27ce899e19ddf65da3c0920861210472/tensorflow/lite/delegates/gpu/common/memory_management/greedy_by_size_assignment.cc#L69 44 | ########################## 45 | def greedy_by_size(records: Records) -> Result: 46 | # save original indexes 47 | for i, r in enumerate(records): 48 | r.index = i 49 | 50 | # sort records in decreasing order of size 51 | records.sort(key=lambda r: r.size, reverse=True) 52 | 53 | # result 54 | total_consumption = 0 55 | offsets: Offsets = [None] * len(records) 56 | 57 | # indexes already allocated, ordered by offset 58 | ordered_allocs: list[int] = [] 59 | 60 | for t_i, t in enumerate(records): 61 | prev_offset = 0 62 | best_offset = None 63 | smallest_gap = math.inf 64 | 65 | for allocated_id in ordered_allocs: 66 | rec = records[allocated_id] 67 | 68 | if rec.last_op < t.first_op or rec.first_op > t.last_op: 69 | # no overlap, skip 70 | continue 71 | 72 | cur_offset = offsets[rec.index] 73 | assert cur_offset is not None 74 | 75 | if cur_offset >= prev_offset: 76 | gap = cur_offset - prev_offset 77 
| 78 | if gap >= t.size and gap < smallest_gap: 79 | smallest_gap = gap 80 | best_offset = prev_offset 81 | 82 | prev_offset = max(prev_offset, cur_offset + rec.size) 83 | 84 | # if no suitable gap found, allocate at the end 85 | if best_offset is None: 86 | best_offset = prev_offset 87 | 88 | offsets[t.index] = best_offset 89 | total_consumption = max(total_consumption, best_offset + t.size) 90 | 91 | ordered_allocs.append(t_i) 92 | 93 | # sort by offset 94 | ordered_allocs.sort(key=lambda i: offsets[records[i].index]) # type: ignore 95 | 96 | return total_consumption, offsets 97 | 98 | 99 | ########################## 100 | # Greed by Breadth 101 | ########################## 102 | def greedy_by_breadth(records: Records) -> Result: 103 | raise NotImplementedError() 104 | 105 | 106 | def find_best_layout(records: Records) -> Result: 107 | """ 108 | Find the best memory layout using different strategies. 109 | """ 110 | alternatives = [ 111 | naive(records), 112 | greedy_by_size(records), 113 | # greedy_by_breadth(records), 114 | ] 115 | 116 | return min(alternatives, key=lambda r: r[0]) 117 | 118 | 119 | if __name__ == "__main__": 120 | test = [ 121 | TensorUsageRecord(0, 1, 32), 122 | TensorUsageRecord(1, 4, 28), 123 | TensorUsageRecord(2, 5, 36), 124 | TensorUsageRecord(3, 5, 16), 125 | TensorUsageRecord(4, 5, 8), 126 | TensorUsageRecord(5, 7, 64), 127 | TensorUsageRecord(6, 8, 10), 128 | TensorUsageRecord(7, 8, 40), 129 | ] 130 | 131 | print(find_best_layout(test)) 132 | -------------------------------------------------------------------------------- /onnx2code/ops/operation.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from collections import defaultdict 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | from textwrap import dedent 6 | from typing import Any, Callable, Literal 7 | 8 | import onnx 9 | 10 | from ..tensor import TensorInfo 11 | 12 | # used as tensor names 13 | LETTERS = ( 14 | "A", 15 | "B", 16 | "C", 17 | "D", 18 | "E", 19 | "F", 20 | "G", 21 | "H", 22 | "I", 23 | "J", 24 | "K", 25 | "L", 26 | "M", 27 | "N", 28 | "O", 29 | "P", 30 | "Q", 31 | "R", 32 | "S", 33 | "T", 34 | "U", 35 | "V", 36 | "W", 37 | "X", 38 | "Y", 39 | "Z", 40 | ) 41 | 42 | 43 | @dataclass 44 | class OpCall: 45 | sig_name: str 46 | sig_params: list[int | str | list[int] | list[str]] 47 | inputs: list[TensorInfo] 48 | outputs: list[TensorInfo] 49 | input_names: tuple[str, ...] = LETTERS 50 | output_names: tuple[str, ...] 
= ("OUT",) 51 | 52 | def fn_name(self) -> str: 53 | str_sig_params = [] 54 | for sig_param in self.sig_params: 55 | if isinstance(sig_param, list): 56 | str_sig_params.append("x".join(map(str, sig_param))) 57 | else: 58 | str_sig_params.append(str(sig_param)) 59 | 60 | return f"{self.sig_name}{'_' if len(str_sig_params) > 0 else ''}" + "_".join( 61 | str_sig_params 62 | ) 63 | 64 | def signature(self) -> str: 65 | params = [] 66 | for i in range(len(self.inputs)): 67 | params.append(f"const float* __restrict__ {self.input_names[i]}") 68 | for i in range(len(self.outputs)): 69 | params.append(f"float* __restrict__ {self.output_names[i]}") 70 | 71 | return f"void {self.fn_name()}({', '.join(params)})" 72 | 73 | def invocation(self) -> str: 74 | return ( 75 | self.fn_name() 76 | + f"({', '.join(t.variable for t in self.inputs + self.outputs)})" 77 | ) 78 | 79 | 80 | @dataclass(frozen=True) 81 | class ASMAuxFunction: 82 | signature: str 83 | source: str 84 | 85 | 86 | @dataclass(frozen=True) 87 | class OpImpl: 88 | lang: Literal["c", "asm"] 89 | source: str | tuple[str, ...] 90 | cpp_aux_functions: tuple[str, ...] = () 91 | asm_aux_functions: tuple[ASMAuxFunction, ...] = () 92 | external_paths: tuple[Path, ...] = () 93 | 94 | def full_source(self) -> str: 95 | code = self.source if isinstance(self.source, str) else "\n".join(self.source) 96 | return dedent(code).strip().strip("\n") 97 | 98 | 99 | @dataclass(frozen=True) 100 | class RegistryEntry: 101 | variant_tags: list[str] 102 | priority: int 103 | klass: type["Operation"] 104 | 105 | def __lt__(self, other: Any) -> bool: 106 | return self.priority < other.priority # type: ignore 107 | 108 | 109 | class Operation(ABC): 110 | node_types: set[str] 111 | _registry: defaultdict[str, list[RegistryEntry]] = defaultdict(list) 112 | 113 | def __init__( 114 | self, 115 | node: onnx.NodeProto, 116 | inputs: list[TensorInfo], 117 | outputs: list[TensorInfo], 118 | ): 119 | self.node = node 120 | self.inputs = inputs 121 | self.outputs = outputs 122 | self.parse() 123 | 124 | @abstractmethod 125 | def parse(self) -> None: 126 | pass 127 | 128 | @abstractmethod 129 | def call(self) -> OpCall | None: 130 | return None 131 | 132 | @abstractmethod 133 | def impl(self) -> OpImpl | None: 134 | pass 135 | 136 | @classmethod 137 | def variant( 138 | cls, var: str | list[str], priority: int = 0 139 | ) -> Callable[[type["Operation"]], type["Operation"]]: 140 | vars = [var] if isinstance(var, str) else var 141 | 142 | def decorator(newcls: type[Operation]) -> type[Operation]: 143 | for node_type in newcls.node_types: 144 | cls._registry[node_type].append( 145 | RegistryEntry(variant_tags=vars, priority=priority, klass=newcls) 146 | ) 147 | # always keep sorted 148 | cls._registry[node_type].sort() 149 | 150 | return newcls 151 | 152 | return decorator 153 | 154 | @staticmethod 155 | def get(node_type: str, variant_order: list[str]) -> list[type["Operation"]]: 156 | if node_type not in Operation._registry: 157 | raise NotImplementedError(f"Operation {node_type} not implemented") 158 | 159 | variants = [] 160 | 161 | for variant_tag in variant_order: 162 | for entry in Operation._registry[node_type]: 163 | if variant_tag in entry.variant_tags: 164 | variants.append(entry.klass) 165 | 166 | if len(variants) == 0: 167 | raise ValueError(f"No valid variant found for {node_type}") 168 | else: 169 | return list(dict.fromkeys(variants)) 170 | -------------------------------------------------------------------------------- /onnx2code/util.py: 
-------------------------------------------------------------------------------- 1 | from typing import Any, Literal, Optional 2 | 3 | import numpy as np 4 | import onnx 5 | 6 | TensorShape = list[int] 7 | ShapesMap = dict[str, TensorShape] 8 | 9 | 10 | # taken from onnx_simplifier.get_inputs 11 | def get_model_inputs(model: onnx.ModelProto) -> list[onnx.ValueInfoProto]: 12 | initializer_names = [x.name for x in model.graph.initializer] 13 | return [ipt for ipt in model.graph.input if ipt.name not in initializer_names] 14 | 15 | 16 | # taken from onnx_simplifier.get_shape_from_value_info_proto 17 | def get_shape_from_value_info_proto(v: onnx.ValueInfoProto) -> TensorShape: 18 | return [dim.dim_value for dim in v.type.tensor_type.shape.dim] 19 | 20 | 21 | # taken from onnx_simplifier.get_value_info_all 22 | def get_value_info_all(m: onnx.ModelProto, name: str) -> Optional[onnx.ValueInfoProto]: 23 | for v in m.graph.value_info: 24 | if v.name == name: 25 | return v # type: ignore 26 | 27 | for v in m.graph.input: 28 | if v.name == name: 29 | return v # type: ignore 30 | 31 | for v in m.graph.output: 32 | if v.name == name: 33 | return v # type: ignore 34 | 35 | return None 36 | 37 | 38 | # taken from onnx_simplifier.get_shape 39 | def get_shape(m: onnx.ModelProto, name: str) -> TensorShape: 40 | v = get_value_info_all(m, name) 41 | if v is not None: 42 | return get_shape_from_value_info_proto(v) 43 | raise RuntimeError('Cannot get shape of "{}"'.format(name)) 44 | 45 | 46 | def get_fixed_input_shapes(onnx_model: onnx.ModelProto) -> ShapesMap: 47 | """ 48 | Returns a map with the input name as key and the shape of the input 49 | fixed to one batch. 50 | 51 | For example, if one of the inputs of the model is [None, 32, 32, 3], 52 | the resulting shape for that input will be [1, 32, 32, 3]. 53 | """ 54 | 55 | def fix_shape(shape: list[int]) -> list[int]: 56 | return [1 if (d == 0 or d is None) else d for d in shape] 57 | 58 | return { 59 | tensor.name: fix_shape(get_shape(onnx_model, tensor.name)) 60 | for tensor in get_model_inputs(onnx_model) 61 | } 62 | 63 | 64 | def get_attribute(node: onnx.NodeProto, name: str, default: Any = None) -> Any: 65 | """ 66 | Returns the value of the attribute with the given name. 67 | If the attribute is not found, returns the default value. 68 | """ 69 | for attr in node.attribute: 70 | if attr.name == name: 71 | return onnx.helper.get_attribute_value(attr) 72 | return default 73 | 74 | 75 | def compute_strides(shape: list[int]) -> list[int]: 76 | """ 77 | Returns the strides of the given shape. 78 | 79 | For example, compute_strides([1, 2, 3]) returns [6, 3, 1]. 
80 | """ 81 | strides = [] 82 | for i in range(len(shape)): 83 | after = shape[i + 1 :] 84 | if len(after) == 0: 85 | strides.append(1) 86 | else: 87 | strides.append(int(np.prod(after))) 88 | return strides 89 | 90 | 91 | def resolve_stride_attribute(node: onnx.NodeProto) -> list[int]: 92 | """ 93 | Retrieves the strides attribute from a node or returns the default value 94 | """ 95 | strides: list[int] = get_attribute(node, "strides", [1] * 2) 96 | return strides 97 | 98 | 99 | def compute_pad_in_dimension( 100 | in_dim: int, 101 | stride: int, 102 | kernel: int, 103 | pad_type: Literal[b"SAME_UPPER", b"SAME_LOWER", b"VALID", b"NOTSET"], 104 | ) -> tuple[int, int]: 105 | """ 106 | https://github.com/microsoft/onnxruntime/blob/9ec1ed42a809170b87474f5822c4557101812399/onnxruntime/core/providers/common.h#L73 107 | """ 108 | pad_head = 0 109 | pad_tail = 0 110 | 111 | if pad_type == b"VALID" or pad_type == b"NOTSET": 112 | pass 113 | elif pad_type == b"SAME_UPPER" or pad_type == b"SAME_LOWER": 114 | legacy_target_size = (in_dim + stride - 1) // stride 115 | pad_needed = (legacy_target_size - 1) * stride + kernel - in_dim 116 | 117 | if pad_type == b"SAME_LOWER": 118 | pad_head = (pad_needed + 1) // 2 119 | else: 120 | pad_head = pad_needed // 2 121 | 122 | pad_tail = pad_needed - pad_head 123 | else: 124 | raise NotImplementedError(f"Pad type {pad_type} not implemented") 125 | 126 | return pad_head, pad_tail 127 | 128 | 129 | def resolve_padding_attribute( 130 | node: onnx.NodeProto, X: TensorShape, W: TensorShape 131 | ) -> list[int]: 132 | """ 133 | Retrieves the padding attribute from a node or returns the default value 134 | """ 135 | ndims = len(X) - 2 # number of spatial dimensions (excluding batch and channel) 136 | pads: list[int] = get_attribute(node, "pads", None) 137 | auto_pad = get_attribute(node, "auto_pad", b"NOTSET") 138 | stride = resolve_stride_attribute(node) 139 | 140 | if pads is not None: 141 | assert auto_pad == b"NOTSET", "Cannot specify both pads and auto_pad" 142 | return pads 143 | 144 | pads = [0] * ndims * 2 145 | for i in range(ndims): 146 | pad_head, pad_tail = compute_pad_in_dimension( 147 | X[i + 2], stride[i], W[i + 2], auto_pad 148 | ) 149 | pads[i] = pad_head 150 | pads[i + ndims] = pad_tail 151 | 152 | return pads 153 | -------------------------------------------------------------------------------- /onnx2code/service.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import tempfile 4 | from multiprocessing import shared_memory 5 | from pathlib import Path 6 | from subprocess import PIPE, run 7 | from typing import Any 8 | 9 | import numpy as np 10 | 11 | from .result import ModelResult 12 | from .tensor import TensorData 13 | from .util import ShapesMap 14 | 15 | TensorsMap = dict[str, TensorData] 16 | TensorsList = list[TensorData] 17 | 18 | 19 | def _run_compilation_command(cmd: list[str]) -> None: 20 | """ 21 | Runs a given compilation command as a subprocess 22 | 23 | :param cmd: A list containing the command and its CLI args 24 | :raises SyntaxError: If the process return code is non-zero 25 | """ 26 | compilation_process = run(cmd, stderr=PIPE) 27 | if compilation_process.returncode != 0: 28 | raise SyntaxError(compilation_process.stderr.decode("utf8")) 29 | 30 | 31 | class ModelService: 32 | """ 33 | Allows using a model generated by onnx2code in a convenient way 34 | 35 | Used for testing and evaluation 36 | """ 37 | 38 | def __init__(self, result: ModelResult): 39 | 
self.result = result 40 | 41 | def __enter__(self) -> "ModelService": 42 | """ 43 | Compiles the model and starts a subprocess 44 | """ 45 | self.temp_dir = tempfile.TemporaryDirectory() 46 | 47 | self._compile() 48 | self._boot() 49 | 50 | return self 51 | 52 | def _compile(self) -> None: 53 | debug = os.getenv("ONNX2CODE_DEBUG", "0") == "1" 54 | 55 | if debug: 56 | # save for later inspection 57 | temp_dir = Path(__file__).parent.parent / "tmp/" 58 | else: 59 | temp_dir = Path(self.temp_dir.name) 60 | 61 | temp_dir.mkdir(exist_ok=True) 62 | 63 | c_file = temp_dir / "model.cpp" 64 | h_file = temp_dir / "model.h" 65 | asm_file = temp_dir / "model.asm" 66 | asm_object = temp_dir / "model-asm.o" 67 | svc_file = Path(__file__).parent / "service.c" 68 | self.weights_file = temp_dir / "weights.bin" 69 | self.service_executable = temp_dir / "service" 70 | 71 | self.result.weights.tofile(self.weights_file) 72 | 73 | for file, content in [ 74 | (c_file, self.result.source_c), 75 | (h_file, self.result.source_h), 76 | (asm_file, self.result.source_asm), 77 | ]: 78 | with open(file, "w") as f: 79 | f.write(content) 80 | 81 | _run_compilation_command( 82 | [ 83 | "nasm", 84 | "-f", 85 | "elf64", 86 | str(asm_file), 87 | "-o", 88 | str(asm_object), 89 | ] 90 | + (["-g", "-w+all", "-w+error"] if debug else []) 91 | ) 92 | 93 | _run_compilation_command( 94 | [ 95 | "g++", 96 | "-m64", # 64 bit env 97 | str(asm_object), 98 | str(h_file), 99 | str(c_file), 100 | str(svc_file), 101 | "-o", 102 | str(self.service_executable), 103 | "-I", 104 | temp_dir.__str__(), 105 | "-lrt", # for shm 106 | "-lm", # for math 107 | "-march=native", 108 | "-mtune=native", 109 | "-O3", 110 | ] 111 | + ( 112 | [ 113 | "-g", 114 | "-fsanitize=address", 115 | "-Wall", 116 | "-Werror", 117 | "-Wno-unused-result", 118 | "-Wno-unused-but-set-variable", 119 | "-Wno-unused-variable", 120 | ] 121 | if debug 122 | else [] 123 | ) 124 | ) 125 | 126 | def _boot(self) -> None: 127 | """ 128 | Creates the shared memory buffers and starts the service subprocess 129 | """ 130 | self.inputs_buffer = SharedNDArrays("/o2c-inputs", self.result.input_shapes) 131 | self.outputs_buffer = SharedNDArrays("/o2c-outputs", self.result.output_shapes) 132 | 133 | self.process = subprocess.Popen( 134 | [self.service_executable, self.weights_file], 135 | stdin=subprocess.PIPE, 136 | stdout=subprocess.PIPE, 137 | ) 138 | 139 | def inference(self, inputs: TensorsMap) -> TensorsList: 140 | """ 141 | Runs the model with the given inputs 142 | """ 143 | assert len(inputs) == len(self.result.input_shapes) 144 | 145 | # load inputs into shared memory 146 | self.inputs_buffer.set(inputs) 147 | 148 | # signal service that inputs are ready 149 | assert self.process.stdin and self.process.stdout 150 | self.process.stdin.write("1".encode()) 151 | self.process.stdin.flush() 152 | # wait for service to finish inference 153 | self.process.stdout.read(1) 154 | 155 | # read outputs from shared memory 156 | return self.outputs_buffer.get() 157 | 158 | def __exit__(self, _1: Any, _2: Any, _3: Any) -> None: 159 | # exit service 160 | self.process.terminate() 161 | 162 | # release shared memory 163 | self.inputs_buffer.cleanup() 164 | self.outputs_buffer.cleanup() 165 | 166 | # remove compilation files 167 | self.temp_dir.cleanup() 168 | 169 | 170 | class SharedNDArrays: 171 | """ 172 | List of NDArray[float32]'s backed by shared memory 173 | """ 174 | 175 | def __init__(self, name: str, shapes: ShapesMap): 176 | self.shapes = shapes 177 | self.offsets = np.cumsum([0, 
*[np.prod(s) for s in shapes.values()]]) 178 | self.elems = self.offsets[-1] 179 | self.size = self.elems * 4 180 | 181 | try: 182 | shm = shared_memory.SharedMemory(name, create=False) 183 | shm.unlink() 184 | except FileNotFoundError: 185 | pass 186 | 187 | self.shm = shared_memory.SharedMemory(name, create=True, size=self.size) 188 | self.buffer: TensorData = np.ndarray( 189 | self.elems, dtype=np.float32, buffer=self.shm.buf 190 | ) 191 | 192 | def set(self, inputs: TensorsMap) -> None: 193 | self.buffer[:] = np.concatenate([inp.reshape(-1) for inp in inputs.values()]) 194 | 195 | def get(self) -> TensorsList: 196 | return [ 197 | self.buffer[self.offsets[i] : self.offsets[i + 1]].reshape(self.shapes[n]) 198 | for i, n in enumerate(self.shapes) 199 | ] 200 | 201 | def cleanup(self) -> None: 202 | del self.buffer 203 | self.shm.close() 204 | self.shm.unlink() 205 | -------------------------------------------------------------------------------- /onnx2code/ops/gemm.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | from typing import Iterable 3 | 4 | from onnx2code.util import get_attribute 5 | 6 | from .gemm_tiling.GEMM import call_GEMM, external_paths_GEMM 7 | from .operation import OpCall, Operation, OpImpl 8 | 9 | 10 | class GEMM(Operation): 11 | """ 12 | GEneral Matrix Multiplication operator 13 | 14 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#gemm 15 | """ 16 | 17 | node_types = {"Gemm", "MatMul"} 18 | 19 | def parse(self) -> None: 20 | assert ( 21 | len(self.inputs) == 2 or len(self.inputs) == 3 22 | ), "expected two or three inputs" 23 | assert len(self.outputs) == 1, "expected one output" 24 | 25 | self.hasC = len(self.inputs) == 3 26 | self.transA = get_attribute(self.node, "transA", 0) > 0.5 27 | self.transB = get_attribute(self.node, "transB", 0) > 0.5 28 | self.alpha = get_attribute(self.node, "alpha", 1.0) 29 | self.beta = get_attribute(self.node, "beta", 1.0) 30 | 31 | # normalize 32 | self.alpha = None if self.alpha == 1.0 else self.alpha 33 | self.beta = None if self.beta == 1.0 else self.beta 34 | 35 | if self.transA: 36 | raise NotImplementedError("transA not supported") 37 | if self.alpha is not None: 38 | raise NotImplementedError("alpha not supported") 39 | if self.beta is not None: 40 | raise NotImplementedError("beta not supported") 41 | 42 | A = self.inputs[0] 43 | B = self.inputs[1] 44 | Y = self.outputs[0] 45 | 46 | self.N = A.shape[0] 47 | self.M = B.shape[1] if self.transB else B.shape[0] 48 | self.K = B.shape[0] if self.transB else B.shape[1] 49 | 50 | assert Y.shape[0] == self.N 51 | assert Y.shape[1] == self.K 52 | 53 | def call(self) -> OpCall: 54 | return OpCall( 55 | sig_name="GEMM", 56 | sig_params=[ 57 | self.hasC, 58 | self.N, 59 | self.M, 60 | self.K, 61 | self.transB, 62 | ], 63 | inputs=self.inputs, 64 | outputs=self.outputs, 65 | ) 66 | 67 | 68 | @GEMM.variant(["c", "gemm-naive"], priority=2) 69 | class GEMMC(GEMM): 70 | def impl(self) -> OpImpl: 71 | N, M, K = self.N, self.M, self.K 72 | 73 | index_B = f"i * {K} + col" if not self.transB else f"col * {M} + i" 74 | 75 | source = f""" 76 | for(int row = 0; row < {N}; row++) {{ 77 | for(int col = 0; col < {K}; col++) {{ 78 | float sum = 0; 79 | for(int i = 0; i < {M}; i++) {{ 80 | sum += A[row * {M} + i] * B[{index_B}]; 81 | }} 82 | OUT[row * {K} + col] = sum{f' + C[row * {K} + col]' if self.hasC else ''}; 83 | }} 84 | }} 85 | """ 86 | 87 | return OpImpl(lang="c", source=source) 88 | 89 | 90 | # Make sure this executable is 
in your PATH 91 | LIBXSMM_PATH = "libxsmm_gemm_generator" 92 | 93 | 94 | @GEMM.variant(["asm", "libxsmm"], priority=0) 95 | class GEMMAsm(GEMM): 96 | def impl(self) -> OpImpl: 97 | N, M, K = self.N, self.M, self.K 98 | 99 | aux_fn_name = f"libxsmm_GEMM_{N}_{M}_{K}" 100 | 101 | # Reference: https://scalable.uni-jena.de/opt/hpc/chapters/assignment_small_gemms.html 102 | generator_args = [ 103 | LIBXSMM_PATH, 104 | # matrix type 105 | "dense", 106 | # output file name 107 | "/dev/stdout", 108 | # function name 109 | aux_fn_name, 110 | # matrix size 111 | str(K), 112 | str(N), 113 | str(M), 114 | # lda, ldb, ldc 115 | str(K), 116 | str(M), 117 | str(K), 118 | # alpha beta 119 | # C := alpha*A*B + beta*C 120 | "1", 121 | "0", 122 | # 0: unaligned A, C 123 | "0", 124 | "0", 125 | # arch 126 | "hsw", # haswell, targets AVX2 127 | # prefetch 128 | "nopf", # no prefetch 129 | # precision 130 | "SP", # single precision (f32) 131 | ] 132 | 133 | try: 134 | libxsmm_generator_process = subprocess.run( 135 | " ".join(generator_args), 136 | capture_output=True, 137 | encoding="utf-8", 138 | shell=True, 139 | ) 140 | except PermissionError: 141 | raise RuntimeError(f"libxsmm not found at '{LIBXSMM_PATH}'") 142 | 143 | if ( 144 | libxsmm_generator_process.returncode != 0 145 | or libxsmm_generator_process.stderr != "" 146 | ): 147 | raise RuntimeError(f"libxsmm: {libxsmm_generator_process.stderr}") 148 | 149 | lines: Iterable[str] = libxsmm_generator_process.stdout.splitlines() 150 | 151 | aux_fn = "\n".join( 152 | filter( 153 | # Filter out the flops line 154 | lambda line: not ( 155 | line.startswith("libxsmm_num_total_flops") or line == "" 156 | ), 157 | lines, 158 | ) 159 | ) 160 | 161 | if aux_fn == "": 162 | raise RuntimeError("libxsmm: no output") 163 | 164 | # tensors MUST be reversed since libxsmm uses BLAS' column-major order 165 | # and we use onnx's row-major order 166 | source = f""" 167 | {aux_fn_name}(B, A, OUT); 168 | """ + ( 169 | f""" 170 | for(int i = 0; i < {N * K}; i++) {{ 171 | OUT[i] += C[i]; 172 | }} 173 | """ 174 | if self.hasC 175 | else "" 176 | ) 177 | 178 | return OpImpl(lang="c", source=source, cpp_aux_functions=(aux_fn,)) 179 | 180 | 181 | @GEMM.variant(["c", "loop-tiling"], priority=1) 182 | class GEMMLoopTiling(GEMM): 183 | def impl(self) -> OpImpl: 184 | M, K, N = self.N, self.M, self.K 185 | 186 | if self.hasC: 187 | raise NotImplementedError("hasC not supported") 188 | 189 | # unit_update_asm = ASMAuxFunction( 190 | # signature="void unit_update(const float*, const float*, float*)", 191 | # source=""" 192 | # vbroadcastss ymm0, [rsi] 193 | # vmovups ymm1, [rdi] 194 | # vfmadd213ps ymm0, ymm1, [rdx] 195 | # vmovups [rdx], ymm0 196 | # vzeroupper 197 | # ret 198 | # """, 199 | # ) 200 | 201 | return OpImpl( 202 | lang="c", 203 | source=call_GEMM(M, K, N, "A, B, OUT"), 204 | external_paths=external_paths_GEMM, 205 | # asm_aux_functions=(unit_update_asm,), 206 | ) 207 | -------------------------------------------------------------------------------- /evaluation/results_conv/10th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-conv-naive,0.5491880600000001,0.13237336197376118 3 | 256,onnx2code-im2col,0.9188441900000001,0.1298197832873476 4 | 256,tensorflow,1.3908916,0.2866774618994664 5 | 256,onnxruntime,0.4345319399999999,0.1870622537994141 6 | 288,onnx2code-conv-naive,0.48715349999999996,0.08764745483840361 7 | 288,onnx2code-im2col,1.34965215,0.26676209362540154 8 | 
288,tensorflow,1.4971754700000002,0.2615000408917924 9 | 288,onnxruntime,0.66567734,0.13843684378489854 10 | 320,onnx2code-conv-naive,0.69172044,0.13837388211142448 11 | 320,onnx2code-im2col,1.5007587900000001,0.27390226756988684 12 | 320,tensorflow,1.6004319200000001,0.24421334188048285 13 | 320,onnxruntime,0.83764132,0.18599264028895768 14 | 352,onnx2code-conv-naive,0.7370404599999999,0.13744646944053673 15 | 352,onnx2code-im2col,1.9929681699999995,0.3946627409351447 16 | 352,tensorflow,1.7411530500000003,0.2345129273474439 17 | 352,onnxruntime,1.0316308899999997,0.16029263369212543 18 | 384,onnx2code-conv-naive,0.85087959,0.14158904427695632 19 | 384,onnx2code-im2col,2.24507743,0.38668566384204767 20 | 384,tensorflow,1.8663835800000002,0.3699921724628017 21 | 384,onnxruntime,1.34216344,0.2132365995996616 22 | 416,onnx2code-conv-naive,1.07306232,0.14262480010565343 23 | 416,onnx2code-im2col,3.00455171,0.7940435426173719 24 | 416,tensorflow,1.9588059400000004,0.2563426713220731 25 | 416,onnxruntime,1.5810998100000004,0.24184807849080361 26 | 448,onnx2code-conv-naive,1.2683012500000002,0.18853181540193026 27 | 448,onnx2code-im2col,3.25650464,0.6449111779435757 28 | 448,tensorflow,2.41006099,0.3340333789623874 29 | 448,onnxruntime,1.85145789,0.2536602645151934 30 | 480,onnx2code-conv-naive,1.4698558999999998,0.17367458305287506 31 | 480,onnx2code-im2col,3.86520041,0.9520036125981465 32 | 480,tensorflow,2.3632088700000002,0.28686510795984427 33 | 480,onnxruntime,2.07489152,0.28566567101177837 34 | 512,onnx2code-conv-naive,1.59486367,0.198567874527178 35 | 512,onnx2code-im2col,4.39524337,0.7469946433111774 36 | 512,tensorflow,2.51543401,0.26654362903001433 37 | 512,onnxruntime,2.4066684400000002,0.29880642444644057 38 | 544,onnx2code-conv-naive,1.59447175,0.20297294414607947 39 | 544,onnx2code-im2col,4.877493419999999,0.946721760463772 40 | 544,tensorflow,2.78587158,0.4026182149866093 41 | 544,onnxruntime,2.64123508,0.30412804938264676 42 | 576,onnx2code-conv-naive,1.8584777100000003,0.2965443570897715 43 | 576,onnx2code-im2col,5.182136000000001,0.613396774608165 44 | 576,tensorflow,2.94818244,0.34298876794677463 45 | 576,onnxruntime,3.0138453599999995,0.32912679608830153 46 | 608,onnx2code-conv-naive,2.1042586300000004,0.33962333469941824 47 | 608,onnx2code-im2col,5.90957882,0.7201904177597808 48 | 608,tensorflow,2.9772878200000004,0.38372111270959225 49 | 608,onnxruntime,3.3932347099999993,0.4371578633363993 50 | 640,onnx2code-conv-naive,2.2751119600000003,0.2228727057489508 51 | 640,onnx2code-im2col,6.65117641,0.8332841357658395 52 | 640,tensorflow,3.4733946799999997,0.4854518077448446 53 | 640,onnxruntime,3.6088097100000005,0.37096904414304693 54 | 672,onnx2code-conv-naive,2.38564787,0.30105978589790616 55 | 672,onnx2code-im2col,7.230346799999999,0.779337072554604 56 | 672,tensorflow,3.72211326,0.3315343707700189 57 | 672,onnxruntime,4.015681740000001,0.36337057032155534 58 | 704,onnx2code-conv-naive,2.73222686,0.3305703471703722 59 | 704,onnx2code-im2col,7.684625929999999,0.7507740755702911 60 | 704,tensorflow,3.9292453199999993,0.36010634476584497 61 | 704,onnxruntime,4.47966224,0.4367974261954189 62 | 736,onnx2code-conv-naive,3.0278000099999995,0.4229891265275384 63 | 736,onnx2code-im2col,8.640412600000001,0.9262444881498297 64 | 736,tensorflow,4.10335983,0.42943891032949155 65 | 736,onnxruntime,4.78642897,0.4368799570415529 66 | 768,onnx2code-conv-naive,3.1074968299999997,0.35935848197272474 67 | 768,onnx2code-im2col,9.42943564,1.0549493285844636 68 | 
768,tensorflow,4.35549105,0.4705102456384 69 | 768,onnxruntime,5.20144776,0.48165631481254595 70 | 800,onnx2code-conv-naive,3.40349017,0.4410177443325167 71 | 800,onnx2code-im2col,10.087175249999998,1.0337052905505357 72 | 800,tensorflow,4.58121696,0.515248672096919 73 | 800,onnxruntime,5.688211319999999,0.5391705471565167 74 | 832,onnx2code-conv-naive,3.74884378,0.4504297513641739 75 | 832,onnx2code-im2col,11.362020870000004,1.3332305096315464 76 | 832,tensorflow,4.743467050000001,0.627452077184997 77 | 832,onnxruntime,6.235891240000001,0.5067022519198059 78 | 864,onnx2code-conv-naive,4.14272667,0.5105069073467872 79 | 864,onnx2code-im2col,12.0510654,1.139568936617281 80 | 864,tensorflow,5.09580038,0.509759947245226 81 | 864,onnxruntime,6.721285499999999,0.5030100573284891 82 | 896,onnx2code-conv-naive,4.175582160000001,0.4371373430177001 83 | 896,onnx2code-im2col,12.419930509999999,0.899210697129538 84 | 896,tensorflow,5.326666319999998,0.4760109094783623 85 | 896,onnxruntime,7.1844892499999995,0.6275378377775538 86 | 928,onnx2code-conv-naive,4.54842397,0.46409270056854923 87 | 928,onnx2code-im2col,13.424788929999998,1.0794657054235974 88 | 928,tensorflow,5.974092810000001,0.569689363140259 89 | 928,onnxruntime,7.64295418,0.611710935315078 90 | 960,onnx2code-conv-naive,4.86105023,0.5581146653470925 91 | 960,onnx2code-im2col,14.558563390000003,1.1082483519696646 92 | 960,tensorflow,6.5112397500000005,1.460229401409055 93 | 960,onnxruntime,8.19947209,0.6584829936922305 94 | 992,onnx2code-conv-naive,5.120063300000001,0.544389909767209 95 | 992,onnx2code-im2col,15.694976389999997,1.2412264189912563 96 | 992,tensorflow,6.476902399999999,0.6904489808260419 97 | 992,onnxruntime,8.69421666,0.6245109155897632 98 | 1024,onnx2code-conv-naive,5.590155300000001,0.6496297329966895 99 | 1024,onnx2code-im2col,17.33253985,1.077605242483038 100 | 1024,tensorflow,6.79198762,0.5675002324853758 101 | 1024,onnxruntime,9.496853179999999,1.333272052480006 102 | 1056,onnx2code-conv-naive,5.772926100000001,0.6658795291482311 103 | 1056,onnx2code-im2col,17.946812150000003,1.2365768010534273 104 | 1056,tensorflow,7.02846319,0.6944497910994529 105 | 1056,onnxruntime,9.9349942,0.6898595990707819 106 | 1088,onnx2code-conv-naive,6.290629489999999,0.8784981131715252 107 | 1088,onnx2code-im2col,19.00194285,1.3439142948857516 108 | 1088,tensorflow,7.45540793,0.4714124189516067 109 | 1088,onnxruntime,10.528197050000001,0.7615252199567178 110 | 1120,onnx2code-conv-naive,6.617438369999999,0.9501307714331607 111 | 1120,onnx2code-im2col,20.147991229999995,1.2449445515221622 112 | 1120,tensorflow,7.864878350000001,0.665257063734995 113 | 1120,onnxruntime,11.101594520000003,0.8159476296610155 114 | 1152,onnx2code-conv-naive,6.831439400000001,0.6716892841138081 115 | 1152,onnx2code-im2col,21.205939569999995,1.212284878937721 116 | 1152,tensorflow,7.92708685,0.46568353623677466 117 | 1152,onnxruntime,11.954561950000002,0.6959551411048039 118 | 1184,onnx2code-conv-naive,7.328971719999998,0.8108132758963938 119 | 1184,onnx2code-im2col,22.756164560000002,1.4384047279126229 120 | 1184,tensorflow,8.509471740000002,0.5682869313954813 121 | 1184,onnxruntime,12.447344249999999,0.8514161123153985 122 | 1216,onnx2code-conv-naive,7.66072625,0.8296190286507341 123 | 1216,onnx2code-im2col,23.729592410000006,1.2839267518061153 124 | 1216,tensorflow,9.0151338,0.7025805359415104 125 | 1216,onnxruntime,13.358221110000004,0.9357839590492228 126 | 1248,onnx2code-conv-naive,7.97544042,1.0038998031313302 127 | 
1248,onnx2code-im2col,25.231953579999995,1.592579618776789 128 | 1248,tensorflow,9.393306319999999,0.6601032833059064 129 | 1248,onnxruntime,14.070093029999999,0.8027447679136805 130 | -------------------------------------------------------------------------------- /onnx2code/ops/conv.py: -------------------------------------------------------------------------------- 1 | from math import ceil 2 | 3 | import numpy as np 4 | 5 | from onnx2code.ops.gemm_tiling.GEMM import call_GEMM, external_paths_GEMM 6 | from onnx2code.util import ( 7 | compute_strides, 8 | get_attribute, 9 | resolve_padding_attribute, 10 | resolve_stride_attribute, 11 | ) 12 | 13 | from .operation import OpCall, Operation, OpImpl 14 | 15 | 16 | class Conv(Operation): 17 | """ 18 | Conv operator 19 | 20 | Only 2D convolutions are supported 21 | 22 | https://github.com/onnx/onnx/blob/main/docs/Operators.md#conv 23 | """ 24 | 25 | node_types = {"Conv"} 26 | 27 | def parse(self) -> None: 28 | assert ( 29 | len(self.inputs) == 2 or len(self.inputs) == 3 30 | ), "expected two or three inputs" 31 | assert len(self.outputs) == 1, "expected one output" 32 | 33 | group = get_attribute(self.node, "group", 1) 34 | if group != 1: 35 | raise NotImplementedError("depthwise is not supported (group != 1)") 36 | 37 | self.X = self.inputs[0] 38 | self.W = self.inputs[1] 39 | self.B = self.inputs[2] if len(self.inputs) == 3 else None 40 | self.Y = self.outputs[0] 41 | 42 | self.dilations = get_attribute(self.node, "dilations", [1] * 2) 43 | self.strides = resolve_stride_attribute(self.node) 44 | self.pads = resolve_padding_attribute(self.node, self.X.shape, self.W.shape) 45 | 46 | def call(self) -> OpCall: 47 | return OpCall( 48 | sig_name="Conv", 49 | sig_params=[self.X.shape, self.W.shape, self.strides, self.pads], 50 | inputs=self.inputs, 51 | outputs=self.outputs, 52 | input_names=("X", "W", "B"), 53 | ) 54 | 55 | 56 | @Conv.variant(["c", "conv-naive"], priority=1) 57 | class ConvC(Conv): 58 | def impl(self) -> OpImpl: 59 | # onnx is NCHW 60 | # N = self.X.shape[0] 61 | # C = self.X.shape[1] 62 | H = self.X.shape[2] 63 | W = self.X.shape[3] 64 | F = self.W.shape[0] # filters 65 | KC = self.W.shape[1] 66 | KH = self.W.shape[2] 67 | KW = self.W.shape[3] 68 | 69 | pads_start = [self.pads[0], self.pads[1]] 70 | # pads_end = [self.pads[2], self.pads[3]] 71 | 72 | input_strides = compute_strides(self.X.shape) 73 | output_strides = compute_strides(self.Y.shape) 74 | kernel_strides = compute_strides(self.W.shape) 75 | 76 | source = "" 77 | 78 | source += f""" 79 | for(int f = 0; f < {F}; f++) {{ 80 | // start position of kernel 81 | for(int h = 0; h < {self.Y.shape[2]}; h++) {{ 82 | for(int w = 0; w < {self.Y.shape[3]}; w++) {{ 83 | float accum = {"0.0f" if self.B is None else "B[f]" }; 84 | 85 | // position in kernel 86 | for(int cc = 0; cc < {KC}; cc++) {{ 87 | for(int hh = 0; hh < {KH}; hh++) {{ 88 | for(int ww = 0; ww < {KW}; ww++) {{ 89 | const int ih = {-pads_start[0]} + (h * {self.strides[0]}) + hh; 90 | const int iw = {-pads_start[1]} + (w * {self.strides[1]}) + ww; 91 | if(ih >= 0 && ih < {H} && iw >= 0 && iw < {W}) {{ 92 | accum += X[ 93 | cc * {input_strides[1]} + 94 | ih * {input_strides[2]} + 95 | iw * {input_strides[3]} 96 | ] * W[ 97 | f * {kernel_strides[0]} + 98 | cc * {kernel_strides[1]} + 99 | hh * {kernel_strides[2]} + 100 | ww * {kernel_strides[3]} 101 | ]; 102 | }} 103 | }} 104 | }} 105 | }} 106 | 107 | OUT[ 108 | f * {output_strides[1]} + 109 | h * {output_strides[2]} + 110 | w * {output_strides[3]} 111 | ] = accum; 112 | 
}} 113 | }} 114 | }} 115 | """ 116 | 117 | return OpImpl(lang="c", source=source) 118 | 119 | 120 | @Conv.variant(["im2col", "loop-tiling"], priority=0) 121 | class ConvIm2col(Conv): 122 | def impl(self) -> OpImpl: 123 | input_shape = self.X.shape 124 | weight_shape = self.W.shape 125 | has_bias = self.B is not None 126 | pads, dilations, strides = self.pads, self.dilations, self.strides 127 | 128 | assert len(pads) == 4 or np.allclose( 129 | pads, 0 130 | ), "expected padding only in two dimensions" 131 | 132 | # onnx is NCHW 133 | N = input_shape[0] 134 | C = input_shape[1] 135 | H = input_shape[2] 136 | W = input_shape[3] 137 | F = weight_shape[0] # filters 138 | KC = weight_shape[1] 139 | KH = weight_shape[2] 140 | KW = weight_shape[3] 141 | 142 | input_strides = compute_strides(input_shape) 143 | kernel_strides = compute_strides([KC, KH, KW]) 144 | pads_start = [pads[0], pads[1]] 145 | pads_end = [pads[2], pads[3]] 146 | patch_stride = KC * KH * KW 147 | num_patches = ceil( 148 | (H - KH + 1 + pads_start[0] + pads_end[0] - (dilations[0] - 1) * (KH - 1)) 149 | / strides[0] 150 | ) * ceil( 151 | (W - KW + 1 + pads_start[1] + pads_end[1] - (dilations[1] - 1) * (KW - 1)) 152 | / strides[0] 153 | ) 154 | im2col_shape = [patch_stride, num_patches] 155 | 156 | bias_code = ( 157 | f""" 158 | // bias 159 | for (int f = 0; f < {F}; f++) {{ 160 | for (int i = 0; i < {num_patches}; i++) {{ 161 | OUT[f * {num_patches} + i] += B[f]; 162 | }} 163 | }} 164 | """ 165 | if has_bias 166 | else "" 167 | ) 168 | 169 | _N = F # weight_shape[0] 170 | _M = patch_stride # weight_shape[1] 171 | _K = im2col_shape[1] 172 | 173 | source = f""" 174 | // padding, dilations, strides 175 | // im2col 176 | // float im2col[{np.prod(im2col_shape)}]; 177 | int patch = 0; 178 | for(int c = 0; c < {C - KC + 1}; c++) {{ 179 | for(int h = {-pads_start[0]}; h < {H - KH + 1 + pads_end[0] - (dilations[0] - 1) * (KH - 1)}; h += {strides[0]}) {{ 180 | for(int w = {-pads_start[1]}; w < {W - KW + 1 + pads_end[1] - (dilations[1] - 1) * (KW - 1)}; w += {strides[1]}) {{ 181 | // copy patch 182 | for(int cc = 0; cc < {KC}; cc++) {{ 183 | for(int hh = 0; hh < {KH}; hh++) {{ 184 | for(int ww = 0; ww < {KW}; ww++) {{ 185 | const int ih = h + hh * {dilations[0]}; 186 | const int iw = w + ww * {dilations[1]}; 187 | float value; 188 | if(ih < 0 || ih >= {H} || iw < 0 || iw >= {W}) {{ 189 | value = 0.0f; 190 | }} else {{ 191 | value = X[ 192 | (c + cc) * {input_strides[1]} + 193 | ih * {input_strides[2]} + 194 | iw * {input_strides[3]} 195 | ]; 196 | }} 197 | im2col[ 198 | (cc * {kernel_strides[0]} + 199 | hh * {kernel_strides[1]} + 200 | ww * {kernel_strides[2]}) * {num_patches} + 201 | patch 202 | ] = value; 203 | }} 204 | }} 205 | }} 206 | patch++; 207 | }} 208 | }} 209 | }} 210 | // gemm ({self.Y.shape}) 211 | for(int row = 0; row < {_N}; row++) {{ 212 | for(int col = 0; col < {_K}; col++) {{ 213 | float sum = 0; 214 | for(int i = 0; i < {_M}; i++) {{ 215 | sum += W[row * {_M} + i] * im2col[i * {_K} + col]; 216 | }} 217 | OUT[row * {_K} + col] = sum; 218 | }} 219 | }} 220 | //{call_GEMM(_N, _M, _K,"W, im2col, OUT")} 221 | {bias_code} 222 | """ 223 | 224 | return OpImpl(lang="c", source=source, external_paths=external_paths_GEMM) 225 | -------------------------------------------------------------------------------- /evaluation/results_gemm/10th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-gemm-naive,3.032503173333333,0.3963114916526603 3 | 
256,onnx2code-loop-tiling,0.6197000233333334,0.10369838811603 4 | 256,onnx2code-libxsmm,0.7196239033333334,0.10866221183291207 5 | 256,tensorflow,0.8746342333333333,0.1925181370934322 6 | 256,onnxruntime,0.20683064666666667,0.03900188116251773 7 | 288,onnx2code-gemm-naive,4.024115666666667,0.5194440929148348 8 | 288,onnx2code-loop-tiling,0.80517784,0.13400563886996597 9 | 288,onnx2code-libxsmm,0.93845815,0.14355573278716818 10 | 288,tensorflow,0.9673922599999999,0.1858061522799655 11 | 288,onnxruntime,0.2888854766666667,0.024172225820890434 12 | 320,onnx2code-gemm-naive,6.331535613333334,0.5509802762758547 13 | 320,onnx2code-loop-tiling,0.9538450333333333,0.13810027369501804 14 | 320,onnx2code-libxsmm,1.2905169033333335,0.16415241675008624 15 | 320,tensorflow,1.2187085466666667,0.2594019197373344 16 | 320,onnxruntime,0.3680314866666667,0.057412645192644524 17 | 352,onnx2code-gemm-naive,7.8856523366666655,0.6560213203748767 18 | 352,onnx2code-loop-tiling,1.27658195,0.20538526113336575 19 | 352,onnx2code-libxsmm,1.7584370266666671,0.21596043915740515 20 | 352,tensorflow,1.4465645866666668,0.221910820228568 21 | 352,onnxruntime,0.46340960333333325,0.11418972732270034 22 | 384,onnx2code-gemm-naive,11.550311796666666,1.0403060429752689 23 | 384,onnx2code-loop-tiling,1.5058879933333333,0.20728834380472683 24 | 384,onnx2code-libxsmm,2.2619934833333333,0.32138281326548307 25 | 384,tensorflow,1.76205783,0.22727102523365014 26 | 384,onnxruntime,0.6222279099999999,0.04984593144796507 27 | 416,onnx2code-gemm-naive,14.391191236666668,0.7851056160948203 28 | 416,onnx2code-loop-tiling,1.8807214333333335,0.23426841520328534 29 | 416,onnx2code-libxsmm,2.9243486800000005,0.39641411833061346 30 | 416,tensorflow,2.1911857866666664,0.26867450219499395 31 | 416,onnxruntime,0.78398472,0.07031717088934679 32 | 448,onnx2code-gemm-naive,18.20398146333333,0.9198346380517328 33 | 448,onnx2code-loop-tiling,2.31714408,0.25164997428852165 34 | 448,onnx2code-libxsmm,3.5859564033333338,0.4675003390828581 35 | 448,tensorflow,2.68536728,0.27450274408191555 36 | 448,onnxruntime,1.0722129066666668,0.05506651126305917 37 | 480,onnx2code-gemm-naive,22.210734633333335,0.8697894314733723 38 | 480,onnx2code-loop-tiling,2.82071765,0.3375718366785962 39 | 480,onnx2code-libxsmm,4.63586339,0.525838292893875 40 | 480,tensorflow,3.28927851,0.3330544670676804 41 | 480,onnxruntime,1.12585362,0.2527811935213976 42 | 512,onnx2code-gemm-naive,37.82177006333333,1.7695390811710392 43 | 512,onnx2code-loop-tiling,3.387950926666667,0.36442807124270943 44 | 512,onnx2code-libxsmm,7.333030716666667,0.6106197815637948 45 | 512,tensorflow,3.90125637,0.37029067510700264 46 | 512,onnxruntime,1.392510446666667,0.33206850390999215 47 | 544,onnx2code-gemm-naive,32.94581162666667,1.4858811647637689 48 | 544,onnx2code-loop-tiling,4.131661676666667,0.656923921812614 49 | 544,onnx2code-libxsmm,6.612609873333334,0.5843372144030853 50 | 544,tensorflow,4.522499783333333,0.39981151880847937 51 | 544,onnxruntime,1.8869457600000001,0.10023161830857434 52 | 576,onnx2code-gemm-naive,38.92624099333334,1.3423770376674533 53 | 576,onnx2code-loop-tiling,4.74182933,0.5772705808930342 54 | 576,onnx2code-libxsmm,7.704783719999999,0.9048697276132155 55 | 576,tensorflow,5.206054753333333,0.3989466387608822 56 | 576,onnxruntime,2.1019028766666663,0.38584045712099896 57 | 608,onnx2code-gemm-naive,45.28636947333334,1.2418235486734779 58 | 608,onnx2code-loop-tiling,5.535877246666667,0.68465347702584 59 | 608,onnx2code-libxsmm,9.329710803333334,0.6821022386722522 60 | 
608,tensorflow,6.038077293333333,0.4066848204053199 61 | 608,onnxruntime,2.3380995933333333,0.5150944124831821 62 | 640,onnx2code-gemm-naive,53.979821606666675,1.5732381435772353 63 | 640,onnx2code-loop-tiling,6.528255413333333,0.7382412954780949 64 | 640,onnx2code-libxsmm,11.105261486666668,0.6702049593273811 65 | 640,tensorflow,6.867506293333332,0.33479984655450573 66 | 640,onnxruntime,2.8644599933333335,0.44012363370755264 67 | 672,onnx2code-gemm-naive,61.797607606666666,2.0206490238507837 68 | 672,onnx2code-loop-tiling,7.335917296666667,0.48413999592309587 69 | 672,onnx2code-libxsmm,12.526532143333334,0.7823617295629833 70 | 672,tensorflow,7.629762806666666,0.3806301394771867 71 | 672,onnxruntime,3.3707401600000004,0.4525826643422036 72 | 704,onnx2code-gemm-naive,70.44579822666667,1.8285376918963692 73 | 704,onnx2code-loop-tiling,8.239208566666667,0.689555869604747 74 | 704,onnx2code-libxsmm,14.119220446666665,0.845159265942012 75 | 704,tensorflow,8.895071413333334,0.34419289040192147 76 | 704,onnxruntime,3.5427747,0.7727582562802354 77 | 736,onnx2code-gemm-naive,79.81430747333333,1.7413606458959807 78 | 736,onnx2code-loop-tiling,9.634022176666667,0.6114007260691514 79 | 736,onnx2code-libxsmm,16.302030413333334,1.007909693988049 80 | 736,tensorflow,9.830212666666666,0.4567337411348858 81 | 736,onnxruntime,3.7305446866666663,1.0182194903883912 82 | 768,onnx2code-gemm-naive,122.00736332333331,2.1067141418493156 83 | 768,onnx2code-loop-tiling,10.923692083333334,0.5637555127495397 84 | 768,onnx2code-libxsmm,25.253630956666665,2.3042468585120885 85 | 768,tensorflow,11.239431676666667,0.4128013705502185 86 | 768,onnxruntime,5.13070866,0.47881221418431297 87 | 800,onnx2code-gemm-naive,104.17442842,1.6762800916184926 88 | 800,onnx2code-loop-tiling,12.447658143333335,1.4992577998217194 89 | 800,onnx2code-libxsmm,21.749256313333333,1.0426478679011155 90 | 800,tensorflow,12.379860203333335,0.7496123154140425 91 | 800,onnxruntime,5.28670129,1.1996524549301877 92 | 832,onnx2code-gemm-naive,116.08897560333335,2.0299157631398157 93 | 832,onnx2code-loop-tiling,13.754875326666667,0.6784085029189272 94 | 832,onnx2code-libxsmm,24.601902156666664,2.062709708078124 95 | 832,tensorflow,13.695079760000002,0.3706590754333004 96 | 832,onnxruntime,6.141160866666667,1.1566683448185664 97 | 864,onnx2code-gemm-naive,132.48205046333334,3.008030542358048 98 | 864,onnx2code-loop-tiling,15.861842113333331,0.821118800358304 99 | 864,onnx2code-libxsmm,26.891542533333336,1.554060923792188 100 | 864,tensorflow,15.222112973333333,0.6294818808192755 101 | 864,onnxruntime,6.738747506666667,1.3725968574633352 102 | 896,onnx2code-gemm-naive,152.92541574333333,1.9424146789867993 103 | 896,onnx2code-loop-tiling,17.463121376666663,1.4855081415663267 104 | 896,onnx2code-libxsmm,31.479977646666665,1.7297347422679645 105 | 896,tensorflow,16.688846016666666,1.38853294476488 106 | 896,onnxruntime,7.102040583333334,1.7228627994158145 107 | 928,onnx2code-gemm-naive,161.55684296666666,1.4504057033650344 108 | 928,onnx2code-loop-tiling,19.451436123333334,1.7354361387323423 109 | 928,onnx2code-libxsmm,33.236639976666666,1.796100522607714 110 | 928,tensorflow,18.249888,0.45181684454396637 111 | 928,onnxruntime,8.673702366666667,1.3608428073642937 112 | 960,onnx2code-gemm-naive,179.54494265,2.20539800179465 113 | 960,onnx2code-loop-tiling,21.493540293333332,0.8316535835294889 114 | 960,onnx2code-libxsmm,37.78634877,1.8713101890560713 115 | 960,tensorflow,20.094944476666665,1.540206518013887 116 | 
960,onnxruntime,9.514956133333333,1.5622842941740966 117 | 992,onnx2code-gemm-naive,199.19028681,2.460440855555875 118 | 992,onnx2code-loop-tiling,22.859786763333336,1.1154943400971729 119 | 992,onnx2code-libxsmm,40.267736176666666,1.4405508791241584 120 | 992,tensorflow,22.12998395,1.6512399342279749 121 | 992,onnxruntime,9.939381456666666,2.22546314880591 122 | 1024,onnx2code-gemm-naive,302.44768983999995,16.24694655356355 123 | 1024,onnx2code-loop-tiling,25.54218734,0.7281526073349307 124 | 1024,onnx2code-libxsmm,71.78343102333332,2.9600656701989223 125 | 1024,tensorflow,24.584593536666667,0.8282364742632156 126 | 1024,onnxruntime,11.35924066,2.136863288270128 127 | 1056,onnx2code-gemm-naive,243.8744718566667,2.4895748810929565 128 | 1056,onnx2code-loop-tiling,28.703636126666666,0.8993973898595125 129 | 1056,onnx2code-libxsmm,56.09443612333334,2.1420622768167505 130 | 1056,tensorflow,26.570579916666667,0.6059130481320978 131 | 1056,onnxruntime,12.543689193333334,2.1933724458049437 132 | 1088,onnx2code-gemm-naive,266.70388051333333,2.330864442781358 133 | 1088,onnx2code-loop-tiling,30.41403294,0.8988800675524125 134 | 1088,onnx2code-libxsmm,58.41972651333332,2.797254098747936 135 | 1088,tensorflow,29.04282031666667,1.0215327865643453 136 | 1088,onnxruntime,14.144601546666665,2.299014144461868 137 | 1120,onnx2code-gemm-naive,289.6749728433333,4.023731298223004 138 | 1120,onnx2code-loop-tiling,34.07599961333333,0.8902777108117541 139 | 1120,onnx2code-libxsmm,65.24443668333333,3.581385816881316 140 | 1120,tensorflow,31.45872342,0.5698554580779852 141 | 1120,onnxruntime,15.405860513333334,1.828299272079097 142 | 1152,onnx2code-gemm-naive,343.24906158666664,3.7112365451323703 143 | 1152,onnx2code-loop-tiling,36.349583179999996,0.9711783836167112 144 | 1152,onnx2code-libxsmm,91.12284834666666,2.9376563991947924 145 | 1152,tensorflow,34.17130621333334,0.7050820407134677 146 | 1152,onnxruntime,15.90146292,3.121942005048774 147 | 1184,onnx2code-gemm-naive,348.46309920333334,4.062900327154884 148 | 1184,onnx2code-loop-tiling,38.731178416666665,1.2522460082867726 149 | 1184,onnx2code-libxsmm,84.33077342666667,3.665204177579058 150 | 1184,tensorflow,37.23460804,1.1237919123701319 151 | 1184,onnxruntime,17.54617040666667,3.289721269978182 152 | 1216,onnx2code-gemm-naive,397.16544792,14.727324074322256 153 | 1216,onnx2code-loop-tiling,43.77018736,0.9206742296714242 154 | 1216,onnx2code-libxsmm,107.27316728666668,2.795479668821692 155 | 1216,tensorflow,40.17801400333334,0.6536839254514244 156 | 1216,onnxruntime,19.293283923333334,3.392666877767465 157 | 1248,onnx2code-gemm-naive,410.9592286366667,6.704456245496791 158 | 1248,onnx2code-loop-tiling,45.56106598333333,0.9108245501411985 159 | 1248,onnx2code-libxsmm,108.07985238333332,2.3859204441571227 160 | 1248,tensorflow,42.84209676333333,0.6350184200923578 161 | 1248,onnxruntime,19.562509316666663,4.389890480237362 162 | -------------------------------------------------------------------------------- /evaluation/results_gemm/6th.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-gemm-naive,2.93099801,0.21223372618581754 3 | 256,onnx2code-loop-tiling,0.6115743366666666,0.06599749912481524 4 | 256,onnx2code-libxsmm,0.6862000033333334,0.09256254610444041 5 | 256,tensorflow,0.83879467,0.2111202157956325 6 | 256,onnxruntime,0.17502199999999998,0.042453192844198025 7 | 288,onnx2code-gemm-naive,4.013051343333333,0.1922353570042171 8 | 
288,onnx2code-loop-tiling,0.8613543366666666,0.08961090262274632 9 | 288,onnx2code-libxsmm,0.9549310066666667,0.115530932781052 10 | 288,tensorflow,0.9110693400000001,0.15713004746802695 11 | 288,onnxruntime,0.242907,0.06730584212632165 12 | 320,onnx2code-gemm-naive,5.73331939,0.27483905993038527 13 | 320,onnx2code-loop-tiling,1.1365396766666667,0.13217515031366067 14 | 320,onnx2code-libxsmm,1.2718516766666665,0.13979089496648958 15 | 320,tensorflow,1.1791876766666667,0.20658304444656664 16 | 320,onnxruntime,0.3266730033333333,0.05591295871724779 17 | 352,onnx2code-gemm-naive,7.659213810000001,0.31697362694935244 18 | 352,onnx2code-loop-tiling,1.4359520333333333,0.13807107627317253 19 | 352,onnx2code-libxsmm,1.7002643733333334,0.19519460629189345 20 | 352,tensorflow,1.4036383666666665,0.27107338265993375 21 | 352,onnxruntime,0.4308296766666666,0.09773607186690145 22 | 384,onnx2code-gemm-naive,11.107058206666666,1.4121811533334494 23 | 384,onnx2code-loop-tiling,1.8372090099999998,0.20677005859850672 24 | 384,onnx2code-libxsmm,2.311948676666667,0.25727827390834773 25 | 384,tensorflow,1.6829836733333334,0.2696682622089221 26 | 384,onnxruntime,0.538148,0.11062774047528344 27 | 416,onnx2code-gemm-naive,14.297038139999998,2.729676676434938 28 | 416,onnx2code-loop-tiling,2.4032359733333326,0.43364645287365833 29 | 416,onnx2code-libxsmm,2.9514771200000003,0.3241932617549172 30 | 416,tensorflow,2.00626849,0.322824547564065 31 | 416,onnxruntime,0.6981953366666667,0.14003255632503223 32 | 448,onnx2code-gemm-naive,17.374831833333335,0.6144123549041031 33 | 448,onnx2code-loop-tiling,2.872743363333333,0.2822886150100956 34 | 448,onnx2code-libxsmm,3.7628733766666667,0.41442373074013844 35 | 448,tensorflow,2.3588540266666667,0.3072846262026125 36 | 448,onnxruntime,0.8693563433333332,0.1435663590984048 37 | 480,onnx2code-gemm-naive,21.50225923,0.7819840128890342 38 | 480,onnx2code-loop-tiling,3.50825207,0.31527052039920433 39 | 480,onnx2code-libxsmm,4.428268086666667,0.42702809932715097 40 | 480,tensorflow,2.62927905,0.3265735363913425 41 | 480,onnxruntime,1.0414200200000001,0.19804321140402564 42 | 512,onnx2code-gemm-naive,34.906725013333336,2.4372404714513394 43 | 512,onnx2code-loop-tiling,4.392616076666666,0.34323584249067596 44 | 512,onnx2code-libxsmm,6.58534778,0.5006035942198693 45 | 512,tensorflow,3.917376396666667,0.6595506638626123 46 | 512,onnxruntime,1.2402080100000001,0.16426971820915917 47 | 544,onnx2code-gemm-naive,30.517709726666666,1.4655694722498689 48 | 544,onnx2code-loop-tiling,5.456461906666667,0.47872229454143655 49 | 544,onnx2code-libxsmm,6.595671356666667,0.6647671986123534 50 | 544,tensorflow,3.88902766,0.46820230109173716 51 | 544,onnxruntime,1.482888666666667,0.17757750778619336 52 | 576,onnx2code-gemm-naive,36.852594706666665,2.328737123134206 53 | 576,onnx2code-loop-tiling,6.3206076200000005,0.4519496564714359 54 | 576,onnx2code-libxsmm,7.905415273333334,0.6909014338298326 55 | 576,tensorflow,4.415751966666667,0.5194307622421865 56 | 576,onnxruntime,1.73132532,0.1933088669553924 57 | 608,onnx2code-gemm-naive,42.66949377666666,1.2014143562159671 58 | 608,onnx2code-loop-tiling,7.304635963333335,0.7251682138322634 59 | 608,onnx2code-libxsmm,9.214234646666666,0.778034726474529 60 | 608,tensorflow,5.144455656666667,0.5631055999529266 61 | 608,onnxruntime,2.047746663333333,0.1852386781655944 62 | 640,onnx2code-gemm-naive,55.89334453333334,2.273516001627766 63 | 640,onnx2code-loop-tiling,8.369001973333333,0.5048504896278362 64 | 
640,onnx2code-libxsmm,11.598928710000001,0.9318555946571332 65 | 640,tensorflow,6.01065204,0.7255272663531527 66 | 640,onnxruntime,2.3855636833333334,0.2919221096476188 67 | 672,onnx2code-gemm-naive,58.554538380000004,2.8564249775566233 68 | 672,onnx2code-loop-tiling,9.613490160000001,0.6311975762601341 69 | 672,onnx2code-libxsmm,13.277355443333333,1.5214488973740832 70 | 672,tensorflow,6.44044367,0.6637398145560762 71 | 672,onnxruntime,2.745889,0.3140661292026463 72 | 704,onnx2code-gemm-naive,67.34064747000001,2.9076649778401698 73 | 704,onnx2code-loop-tiling,11.017596743333334,0.6963415985621837 74 | 704,onnx2code-libxsmm,14.665297196666664,1.2571258474802771 75 | 704,tensorflow,7.532525746666667,0.8175024675900715 76 | 704,onnxruntime,3.1209063366666667,0.30571192055678276 77 | 736,onnx2code-gemm-naive,77.81776146333333,3.994680527831363 78 | 736,onnx2code-loop-tiling,12.602153343333333,0.7199006589244326 79 | 736,onnx2code-libxsmm,17.260685663333334,1.3552461221698675 80 | 736,tensorflow,8.18517155,0.8955714304885909 81 | 736,onnxruntime,3.4846302700000003,0.29393547805562775 82 | 768,onnx2code-gemm-naive,114.22778595999999,8.203709334359631 83 | 768,onnx2code-loop-tiling,14.643412720000002,0.770094425828583 84 | 768,onnx2code-libxsmm,21.88221372,2.3626316578265025 85 | 768,tensorflow,9.402519983333335,1.0571307250323347 86 | 768,onnxruntime,3.9847046133333333,0.35216399742038346 87 | 800,onnx2code-gemm-naive,102.52447922666666,4.841998191757737 88 | 800,onnx2code-loop-tiling,16.721700436666666,0.7321350723210706 89 | 800,onnx2code-libxsmm,22.56168299333333,1.5385892624864095 90 | 800,tensorflow,10.610878073333334,1.2925829745715158 91 | 800,onnxruntime,4.526122559999999,0.394388227185878 92 | 832,onnx2code-gemm-naive,118.56218002333335,3.939953692180838 93 | 832,onnx2code-loop-tiling,18.922631803333335,1.4425692216179269 94 | 832,onnx2code-libxsmm,25.1037305,3.1527200230313914 95 | 832,tensorflow,11.518043606666668,1.147024838134664 96 | 832,onnxruntime,5.080414686666666,0.47586088191467807 97 | 864,onnx2code-gemm-naive,130.09689659666668,5.189265473084857 98 | 864,onnx2code-loop-tiling,20.914084600000002,0.9556391532755657 99 | 864,onnx2code-libxsmm,29.258843553333328,1.9307758241843858 100 | 864,tensorflow,13.143729653333333,2.0231548745389447 101 | 864,onnxruntime,5.760477336666666,1.477134492618634 102 | 896,onnx2code-gemm-naive,168.89975199666668,5.312927351664043 103 | 896,onnx2code-loop-tiling,23.185644933333332,1.0045551482742974 104 | 896,onnx2code-libxsmm,34.172522423333334,1.9982843256822413 105 | 896,tensorflow,14.184290020000002,1.8862201389964413 106 | 896,onnxruntime,6.194402336666667,0.5051867525439248 107 | 928,onnx2code-gemm-naive,165.13455939666667,3.6329771619842033 108 | 928,onnx2code-loop-tiling,25.830488803333335,1.6850606511951327 109 | 928,onnx2code-libxsmm,37.66786822666667,3.800566116722251 110 | 928,tensorflow,15.979504666666665,1.5224369501662662 111 | 928,onnxruntime,6.822730663333334,0.8137015051617086 112 | 960,onnx2code-gemm-naive,183.91971043000004,3.7863765477388824 113 | 960,onnx2code-loop-tiling,27.901448323333334,1.140211974885468 114 | 960,onnx2code-libxsmm,40.32194981999999,4.608950302465135 115 | 960,tensorflow,17.55200400333333,3.1222178969760126 116 | 960,onnxruntime,7.582337316666665,1.3798306560698488 117 | 992,onnx2code-gemm-naive,202.18115277666666,4.9461951688977575 118 | 992,onnx2code-loop-tiling,30.831881576666667,1.2410191207267294 119 | 992,onnx2code-libxsmm,46.472281063333334,3.5206640553037913 120 | 
992,tensorflow,19.609973956666668,1.9594430432812078 121 | 992,onnxruntime,8.354810469999999,1.224331820426381 122 | 1024,onnx2code-gemm-naive,311.7934103333333,6.877924793102408 123 | 1024,onnx2code-loop-tiling,34.00422953666667,1.65053201542282 124 | 1024,onnx2code-libxsmm,56.044168510000006,4.313170511153236 125 | 1024,tensorflow,21.53622228,3.0112313016520313 126 | 1024,onnxruntime,9.185121386666667,1.5104097708390252 127 | 1056,onnx2code-gemm-naive,254.14210583000002,6.641041915904256 128 | 1056,onnx2code-loop-tiling,38.359650099999996,1.305478721310599 129 | 1056,onnx2code-libxsmm,59.21067334000001,4.997676490874754 130 | 1056,tensorflow,22.382056223333333,2.0500099528156066 131 | 1056,onnxruntime,9.777039993333332,0.7107724979235902 132 | 1088,onnx2code-gemm-naive,292.2926318666667,9.702008574620628 133 | 1088,onnx2code-loop-tiling,42.027352369999996,1.749770538149051 134 | 1088,onnx2code-libxsmm,67.41834954333333,4.8664865627514065 135 | 1088,tensorflow,26.594176163333334,2.0905433452247073 136 | 1088,onnxruntime,10.497412073333331,0.8809854735765934 137 | 1120,onnx2code-gemm-naive,338.5233844933333,10.164223333108087 138 | 1120,onnx2code-loop-tiling,45.16896703,2.3841331282466762 139 | 1120,onnx2code-libxsmm,76.79927917333333,5.7278331098052515 140 | 1120,tensorflow,28.83353773333333,2.563366119458253 141 | 1120,onnxruntime,11.659171516666666,1.8744377435576556 142 | 1152,onnx2code-gemm-naive,403.66976146333326,9.52804313250809 143 | 1152,onnx2code-loop-tiling,49.305075323333334,3.73440690467644 144 | 1152,onnx2code-libxsmm,86.43004374333333,6.511510521143248 145 | 1152,tensorflow,30.32093784666667,2.2242678975026324 146 | 1152,onnxruntime,12.255513253333332,0.9498209524135391 147 | 1184,onnx2code-gemm-naive,416.8149896066667,16.73399445325064 148 | 1184,onnx2code-loop-tiling,53.818254503333335,4.412354834290176 149 | 1184,onnx2code-libxsmm,95.98643558,6.660626441719355 150 | 1184,tensorflow,32.997734083333334,3.954762346023894 151 | 1184,onnxruntime,13.61072438,0.9931182019533201 152 | 1216,onnx2code-gemm-naive,449.9982161666666,11.71757127478388 153 | 1216,onnx2code-loop-tiling,57.79504892,3.78510541217508 154 | 1216,onnx2code-libxsmm,113.02410599,6.211236515543298 155 | 1216,tensorflow,33.57050304333334,3.4069599306920897 156 | 1216,onnxruntime,14.462370676666664,1.1351487701683944 157 | 1248,onnx2code-gemm-naive,533.9181180966667,19.01682602485039 158 | 1248,onnx2code-loop-tiling,61.468259516666656,2.3679392876943126 159 | 1248,onnx2code-libxsmm,128.34663120999997,6.3470097444951215 160 | 1248,tensorflow,36.713297913333335,2.8748869668129955 161 | 1248,onnxruntime,15.35723634,1.0219832136925788 162 | -------------------------------------------------------------------------------- /evaluation/results_gemm.csv: -------------------------------------------------------------------------------- 1 | MNK,runtime,time_mean,time_std 2 | 256,onnx2code-gemm-naive,3.3703890066666666,0.26234957107102974 3 | 256,onnx2code-loop-tiling,0.4769373333333333,0.11200040330086118 4 | 256,onnx2code-libxsmm,0.7685216666666665,0.1347120594597562 5 | 256,tensorflow,0.8927413366666667,0.21565150082934734 6 | 256,onnxruntime,0.19556233333333334,0.10832180150469352 7 | 288,onnx2code-gemm-naive,4.213650009999999,0.2412096531602261 8 | 288,onnx2code-loop-tiling,0.699732,0.1757824672030746 9 | 288,onnx2code-libxsmm,1.0847346666666666,0.1939209450563697 10 | 288,tensorflow,0.9772826666666666,0.22258221978905282 11 | 288,onnxruntime,0.26156667,0.13187345940122966 12 | 
320,onnx2code-gemm-naive,6.044232376666667,0.3223155900942348 13 | 320,onnx2code-loop-tiling,0.9018526733333332,0.21311764860484914 14 | 320,onnx2code-libxsmm,1.38776568,0.2599831027845546 15 | 320,tensorflow,1.24392801,0.27125448357114745 16 | 320,onnxruntime,0.40802100333333335,0.2441359848681263 17 | 352,onnx2code-gemm-naive,7.893988396666667,0.367508493608097 18 | 352,onnx2code-loop-tiling,1.3560120133333333,0.7348494896233467 19 | 352,onnx2code-libxsmm,1.79440235,0.2837481352934221 20 | 352,tensorflow,1.49494068,0.3057390412554759 21 | 352,onnxruntime,0.46045267000000006,0.20686798562821274 22 | 384,onnx2code-gemm-naive,11.524608453333334,0.5201314323649308 23 | 384,onnx2code-loop-tiling,1.4596343366666666,0.2841688073669065 24 | 384,onnx2code-libxsmm,2.4336043433333336,0.32883703427758676 25 | 384,tensorflow,1.72980867,0.33966093636865935 26 | 384,onnxruntime,0.5861753366666667,0.1912006903240414 27 | 416,onnx2code-gemm-naive,14.006902730000002,0.48876344668009947 28 | 416,onnx2code-loop-tiling,1.830579023333333,0.33124209183315273 29 | 416,onnx2code-libxsmm,3.1111713733333333,0.37648462074904726 30 | 416,tensorflow,2.12371503,0.3163907697054321 31 | 416,onnxruntime,0.7455333433333334,0.25015353024405923 32 | 448,onnx2code-gemm-naive,18.138151503333333,0.9224213063871142 33 | 448,onnx2code-loop-tiling,2.343757016666667,0.44236630097742496 34 | 448,onnx2code-libxsmm,3.98062436,0.4661748076261562 35 | 448,tensorflow,2.5604050199999997,0.3854315060294798 36 | 448,onnxruntime,0.93208034,0.26121660448528994 37 | 480,onnx2code-gemm-naive,22.125118099999998,2.159088927144462 38 | 480,onnx2code-loop-tiling,2.8662290200000005,0.5107667355572988 39 | 480,onnx2code-libxsmm,4.9646453699999995,0.5687715558677898 40 | 480,tensorflow,3.0218440199999996,0.4649805323483191 41 | 480,onnxruntime,1.1394096733333334,0.2579315916047637 42 | 512,onnx2code-gemm-naive,35.79122658,1.4332680004341185 43 | 512,onnx2code-loop-tiling,3.3537350266666666,0.5237410889073206 44 | 512,onnx2code-libxsmm,6.718171383333333,0.6435436307087524 45 | 512,tensorflow,4.010103033333333,0.3728290496767013 46 | 512,onnxruntime,1.3507263299999999,0.29469939469919926 47 | 544,onnx2code-gemm-naive,31.681916779999998,1.4659728759093686 48 | 544,onnx2code-loop-tiling,4.23727131,0.6526400838989082 49 | 544,onnx2code-libxsmm,6.9747556433333315,0.6103129442386007 50 | 544,tensorflow,4.051602676666667,0.4291065866271986 51 | 544,onnxruntime,1.6376596700000001,0.35580045226994084 52 | 576,onnx2code-gemm-naive,37.99052364,1.0874292602662623 53 | 576,onnx2code-loop-tiling,5.001330343333333,0.7604748176479122 54 | 576,onnx2code-libxsmm,8.230971016666667,0.7198893260045716 55 | 576,tensorflow,4.502336343333333,0.586047683102526 56 | 576,onnxruntime,1.94110367,0.3945149318786313 57 | 608,onnx2code-gemm-naive,44.638641256666666,1.215741187119763 58 | 608,onnx2code-loop-tiling,5.787086216666666,0.8303883343886781 59 | 608,onnx2code-libxsmm,9.801832463333334,0.7914461282502653 60 | 608,tensorflow,5.3438368899999995,0.6125494605928336 61 | 608,onnxruntime,2.1587026366666664,0.3883709388781272 62 | 640,onnx2code-gemm-naive,58.05911371,2.694715017241604 63 | 640,onnx2code-loop-tiling,6.634438333333333,0.8345216391226107 64 | 640,onnx2code-libxsmm,12.054989756666666,1.5028348949102772 65 | 640,tensorflow,5.99051355,0.7330592824266312 66 | 640,onnxruntime,2.5221542833333332,0.4573304520249769 67 | 672,onnx2code-gemm-naive,60.40745225999999,1.4315425880311647 68 | 672,onnx2code-loop-tiling,7.762591996666667,0.9044419656465619 69 | 
672,onnx2code-libxsmm,13.663411669999999,0.9982044028997674 70 | 672,tensorflow,6.9099103333333325,0.8983987900295478 71 | 672,onnxruntime,2.9145486666666667,0.3943143716607629 72 | 704,onnx2code-gemm-naive,71.40485638000001,2.114370430434871 73 | 704,onnx2code-loop-tiling,8.846936676666667,1.0275345510924836 74 | 704,onnx2code-libxsmm,15.719211346666665,1.1871685584464966 75 | 704,tensorflow,7.95713067,0.7963579281085575 76 | 704,onnxruntime,3.3527723366666664,0.4527388447792796 77 | 736,onnx2code-gemm-naive,81.41985087,2.243184882505461 78 | 736,onnx2code-loop-tiling,10.761460323333335,1.799618396172303 79 | 736,onnx2code-libxsmm,18.336470986666665,1.1059623489511625 80 | 736,tensorflow,8.837726993333332,1.3609673276859735 81 | 736,onnxruntime,3.77466933,0.4777523148543477 82 | 768,onnx2code-gemm-naive,116.2303691,3.6316318818898234 83 | 768,onnx2code-loop-tiling,11.265402053333334,1.2191776120246236 84 | 768,onnx2code-libxsmm,22.767142543333332,2.118667472431719 85 | 768,tensorflow,10.311439233333333,1.0654921718879553 86 | 768,onnxruntime,4.192945856666667,0.4530428803391309 87 | 800,onnx2code-gemm-naive,108.29446495,2.314271972973089 88 | 800,onnx2code-loop-tiling,13.067479950000001,1.2708659041292807 89 | 800,onnx2code-libxsmm,23.835922146666665,1.3408881566847868 90 | 800,tensorflow,11.30956228,1.0025326876216665 91 | 800,onnxruntime,4.919521963333334,1.3622212610655369 92 | 832,onnx2code-gemm-naive,124.03364023333333,3.910260580697564 93 | 832,onnx2code-loop-tiling,14.569716833333334,1.2364054854243702 94 | 832,onnx2code-libxsmm,26.099881983333333,1.4702241860694976 95 | 832,tensorflow,12.40567943,1.0726871141114627 96 | 832,onnxruntime,5.281970426666667,0.5238231226909117 97 | 864,onnx2code-gemm-naive,138.47043131333334,4.4882056117959355 98 | 864,onnx2code-loop-tiling,16.571924443333334,1.3590672205781729 99 | 864,onnx2code-libxsmm,31.187495433333332,2.3485493769686947 100 | 864,tensorflow,13.782888929999999,1.1931003490375507 101 | 864,onnxruntime,5.9023876333333325,0.5151141892303416 102 | 896,onnx2code-gemm-naive,177.52883427333333,3.7420385927551454 103 | 896,onnx2code-loop-tiling,18.290402886666666,1.4539367671765442 104 | 896,onnx2code-libxsmm,36.61725613,2.8889872890932375 105 | 896,tensorflow,15.695179583333335,1.7116501377966362 106 | 896,onnxruntime,6.5521933,0.6331403943611638 107 | 928,onnx2code-gemm-naive,176.81010817,4.868150399428317 108 | 928,onnx2code-loop-tiling,20.290687633333334,1.536577118268534 109 | 928,onnx2code-libxsmm,39.80128498,2.0321174807328157 110 | 928,tensorflow,16.660500926666664,1.2334493721870932 111 | 928,onnxruntime,7.147731570000001,0.5679753831886013 112 | 960,onnx2code-gemm-naive,206.60193386333333,6.203880944471459 113 | 960,onnx2code-loop-tiling,22.565377373333334,2.748720403237248 114 | 960,onnx2code-libxsmm,44.72559841333334,2.207069747547238 115 | 960,tensorflow,18.77272903,1.831075804896297 116 | 960,onnxruntime,7.780628350000001,0.6423223296151973 117 | 992,onnx2code-gemm-naive,224.6577771866667,5.341322166873297 118 | 992,onnx2code-loop-tiling,24.939347386666668,1.644355930024445 119 | 992,onnx2code-libxsmm,47.02947108666667,2.8481977420198152 120 | 992,tensorflow,20.788774706666665,1.7194106633212887 121 | 992,onnxruntime,8.619736683333334,0.7481155531326805 122 | 1024,onnx2code-gemm-naive,333.87419574,5.810992600601528 123 | 1024,onnx2code-loop-tiling,27.680156300000004,1.742366038172522 124 | 1024,onnx2code-libxsmm,64.00978282,4.674831253523885 125 | 1024,tensorflow,22.40708031,3.0494422123010727 126 | 
1024,onnxruntime,10.346731420000001,1.7225040124799624 127 | 1056,onnx2code-gemm-naive,301.18202313666666,12.973498944847293 128 | 1056,onnx2code-loop-tiling,31.540917860000004,2.5664617374331913 129 | 1056,onnx2code-libxsmm,69.36479905,6.49757857067899 130 | 1056,tensorflow,23.45093885333333,2.0352275444335666 131 | 1056,onnxruntime,10.495233546666666,2.052902058341192 132 | 1088,onnx2code-gemm-naive,330.22071242000004,12.747447046267114 133 | 1088,onnx2code-loop-tiling,33.63197182333333,3.225165893332042 134 | 1088,onnx2code-libxsmm,76.57936520666668,6.61680822684353 135 | 1088,tensorflow,26.416019476666666,2.408813440294934 136 | 1088,onnxruntime,11.387930626666666,1.1355637328804493 137 | 1120,onnx2code-gemm-naive,371.04348910333334,14.914355425716268 138 | 1120,onnx2code-loop-tiling,36.63864172666666,3.522334801326663 139 | 1120,onnx2code-libxsmm,85.83883283666667,4.976313906974456 140 | 1120,tensorflow,28.497101723333333,2.975764558625859 141 | 1120,onnxruntime,12.064457016666667,0.9149140815299451 142 | 1152,onnx2code-gemm-naive,428.82500173333335,11.09360301945586 143 | 1152,onnx2code-loop-tiling,38.82765699666666,1.980450791346449 144 | 1152,onnx2code-libxsmm,106.76464112,4.759159336346207 145 | 1152,tensorflow,30.691326269999998,3.6969038110381796 146 | 1152,onnxruntime,13.085166249999999,0.9673633238490873 147 | 1184,onnx2code-gemm-naive,453.3688591733334,12.117019376194405 148 | 1184,onnx2code-loop-tiling,50.478685796666674,10.577480851929527 149 | 1184,onnx2code-libxsmm,133.68964269333335,29.239221574794968 150 | 1184,tensorflow,34.335070269999996,2.725595283464426 151 | 1184,onnxruntime,14.219474640000001,1.209578083563947 152 | 1216,onnx2code-gemm-naive,564.1206916233333,61.78638406885801 153 | 1216,onnx2code-loop-tiling,47.49215579333333,4.236138557224631 154 | 1216,onnx2code-libxsmm,157.28695468,39.71233912090336 155 | 1216,tensorflow,40.785947220000004,6.183618098802194 156 | 1216,onnxruntime,18.38131386,4.049771021552129 157 | 1248,onnx2code-gemm-naive,696.7071773133334,114.50236946627746 158 | 1248,onnx2code-loop-tiling,50.30315509666666,4.852534986694046 159 | 1248,onnx2code-libxsmm,152.39833104666667,25.41443230058589 160 | 1248,tensorflow,39.194375599999994,2.5711971444149073 161 | 1248,onnxruntime,16.34784598,1.2673714435523076 162 | 1280,onnx2code-gemm-naive,709.38608777,94.8956988090892 163 | 1280,onnx2code-loop-tiling,54.26350280999999,4.968193434869949 164 | 1280,onnx2code-libxsmm,190.09006949333335,32.978411602973054 165 | 1280,tensorflow,45.37788102,6.081976934479801 166 | 1280,onnxruntime,18.46193592666667,3.544112383502158 167 | 1312,onnx2code-gemm-naive,866.76174079,102.58889044736809 168 | 1312,onnx2code-loop-tiling,61.21315318333334,5.923573162301649 169 | 1312,onnx2code-libxsmm,213.33526460000002,26.399091248320676 170 | 1312,tensorflow,47.08061651,5.576627014049706 171 | 1312,onnxruntime,21.336600666666666,4.77412035508444 172 | -------------------------------------------------------------------------------- /preliminar/build_dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "82cc7d46", 6 | "metadata": {}, 7 | "source": [ 8 | "Clone `onnx/models` and then pull all ONNX models to some folder (~50GB at the time of writing)\n", 9 | "\n", 10 | "`git clone https://github.com/onnx/models.git`\n", 11 | "\n", 12 | "`git lfs pull --include=\"*.onnx\" --exclude=\"\"`\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | 
"id": "7b7587b1", 19 | "metadata": {}, 20 | "outputs": [], 21 | "source": [ 22 | "import os\n", 23 | "from tqdm import tqdm\n", 24 | "import pandas as pd\n", 25 | "import numpy as np\n", 26 | "from pathlib import Path\n", 27 | "import matplotlib\n", 28 | "import onnx\n", 29 | "import matplotlib.pyplot as plt" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 2, 35 | "id": "18e30425", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "PATH = Path(\"C:/ONNX/models\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "id": "8c2f56a8", 46 | "metadata": { 47 | "scrolled": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "models = []\n", 52 | "\n", 53 | "for area in [\"text\", \"vision\"]:\n", 54 | " for problem in os.scandir(PATH / area):\n", 55 | " for model in os.scandir(PATH / area / problem):\n", 56 | " for root, _, files in os.walk(PATH / area / problem / model):\n", 57 | " for file in files:\n", 58 | " if file.endswith('.onnx'):\n", 59 | " file_path = os.path.join(root, file)\n", 60 | " models.append({\n", 61 | " \"area\": area,\n", 62 | " \"problem\": problem.name,\n", 63 | " \"model\": model.name,\n", 64 | " \"version\": file,\n", 65 | " \"size\": os.path.getsize(file_path),\n", 66 | " \"path\": file_path\n", 67 | " })\n", 68 | " \n", 69 | "models = pd.DataFrame(models)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "id": "b3ccff1c", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "168it [00:37, 4.44it/s]\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "model_ops = []\n", 88 | "\n", 89 | "for index, row in tqdm(models.iterrows()):\n", 90 | " model = onnx.load(row[\"path\"])\n", 91 | " ops_present = set()\n", 92 | "\n", 93 | " for node in model.graph.node:\n", 94 | " ops_present.add(node.op_type)\n", 95 | " \n", 96 | " model_ops.append(ops_present)\n", 97 | "\n", 98 | "models[\"ops\"] = model_ops" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 5, 104 | "id": "fc97c123", 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "data": { 109 | "text/html": [ 110 | "
\n", 111 | "\n", 124 | "\n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | "
areaproblemmodelversionsizepathops
0textmachine_comprehensionbert-squadbertsquad-10.onnx435852734C:\\ONNX\\models\\text\\machine_comprehension\\bert...{ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas...
1textmachine_comprehensionbert-squadbertsquad-12-int8.onnx124565601C:\\ONNX\\models\\text\\machine_comprehension\\bert...{ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas...
2textmachine_comprehensionbert-squadbertsquad-12.onnx435852736C:\\ONNX\\models\\text\\machine_comprehension\\bert...{ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas...
3textmachine_comprehensionbert-squadbertsquad-8.onnx435882893C:\\ONNX\\models\\text\\machine_comprehension\\bert...{Tanh, Sub, Softmax, Tile, Mul, Cast, Gather, ...
4textmachine_comprehensionbidirectional_attention_flowbidaf-9.onnx43522228C:\\ONNX\\models\\text\\machine_comprehension\\bidi...{ConstantOfShape, Relu, CategoryMapper, Sub, S...
........................
163visionstyle_transferfast_neural_stylerain-princess-8.onnx6726529C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Add, Relu, Upsample, Conv, Pad, InstanceNorma...
164visionstyle_transferfast_neural_stylerain-princess-9.onnx6728029C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Shape, Add, Cast, Div, Relu, Constant, Gather...
165visionstyle_transferfast_neural_styleudnie-8.onnx6726529C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Add, Relu, Upsample, Conv, Pad, InstanceNorma...
166visionstyle_transferfast_neural_styleudnie-9.onnx6728029C:\\ONNX\\models\\vision\\style_transfer\\fast_neur...{Shape, Add, Cast, Div, Relu, Constant, Gather...
167visionsuper_resolutionsub_pixel_cnn_2016super-resolution-10.onnx240078C:\\ONNX\\models\\vision\\super_resolution\\sub_pix...{Relu, Constant, Conv, Reshape, Transpose}
\n", 250 | "

168 rows × 7 columns

\n", 251 | "
" 252 | ], 253 | "text/plain": [ 254 | " area problem model \\\n", 255 | "0 text machine_comprehension bert-squad \n", 256 | "1 text machine_comprehension bert-squad \n", 257 | "2 text machine_comprehension bert-squad \n", 258 | "3 text machine_comprehension bert-squad \n", 259 | "4 text machine_comprehension bidirectional_attention_flow \n", 260 | ".. ... ... ... \n", 261 | "163 vision style_transfer fast_neural_style \n", 262 | "164 vision style_transfer fast_neural_style \n", 263 | "165 vision style_transfer fast_neural_style \n", 264 | "166 vision style_transfer fast_neural_style \n", 265 | "167 vision super_resolution sub_pixel_cnn_2016 \n", 266 | "\n", 267 | " version size \\\n", 268 | "0 bertsquad-10.onnx 435852734 \n", 269 | "1 bertsquad-12-int8.onnx 124565601 \n", 270 | "2 bertsquad-12.onnx 435852736 \n", 271 | "3 bertsquad-8.onnx 435882893 \n", 272 | "4 bidaf-9.onnx 43522228 \n", 273 | ".. ... ... \n", 274 | "163 rain-princess-8.onnx 6726529 \n", 275 | "164 rain-princess-9.onnx 6728029 \n", 276 | "165 udnie-8.onnx 6726529 \n", 277 | "166 udnie-9.onnx 6728029 \n", 278 | "167 super-resolution-10.onnx 240078 \n", 279 | "\n", 280 | " path \\\n", 281 | "0 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 282 | "1 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 283 | "2 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 284 | "3 C:\\ONNX\\models\\text\\machine_comprehension\\bert... \n", 285 | "4 C:\\ONNX\\models\\text\\machine_comprehension\\bidi... \n", 286 | ".. ... \n", 287 | "163 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 288 | "164 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 289 | "165 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 290 | "166 C:\\ONNX\\models\\vision\\style_transfer\\fast_neur... \n", 291 | "167 C:\\ONNX\\models\\vision\\super_resolution\\sub_pix... \n", 292 | "\n", 293 | " ops \n", 294 | "0 {ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas... \n", 295 | "1 {ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas... \n", 296 | "2 {ConstantOfShape, Tanh, Sub, Softmax, Mul, Cas... \n", 297 | "3 {Tanh, Sub, Softmax, Tile, Mul, Cast, Gather, ... \n", 298 | "4 {ConstantOfShape, Relu, CategoryMapper, Sub, S... \n", 299 | ".. ... \n", 300 | "163 {Add, Relu, Upsample, Conv, Pad, InstanceNorma... \n", 301 | "164 {Shape, Add, Cast, Div, Relu, Constant, Gather... \n", 302 | "165 {Add, Relu, Upsample, Conv, Pad, InstanceNorma... \n", 303 | "166 {Shape, Add, Cast, Div, Relu, Constant, Gather... 
\n", 304 | "167 {Relu, Constant, Conv, Reshape, Transpose} \n", 305 | "\n", 306 | "[168 rows x 7 columns]" 307 | ] 308 | }, 309 | "execution_count": 5, 310 | "metadata": {}, 311 | "output_type": "execute_result" 312 | } 313 | ], 314 | "source": [ 315 | "models.sort_values(\"size\", ascending=False)\n", 316 | "models" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 6, 322 | "id": "a3a6c4ef", 323 | "metadata": {}, 324 | "outputs": [], 325 | "source": [ 326 | "models.to_pickle(\"models-df.pkl\")" 327 | ] 328 | } 329 | ], 330 | "metadata": { 331 | "kernelspec": { 332 | "display_name": "onnx2code-ufxMYK0j", 333 | "language": "python", 334 | "name": "python3" 335 | }, 336 | "language_info": { 337 | "codemirror_mode": { 338 | "name": "ipython", 339 | "version": 3 340 | }, 341 | "file_extension": ".py", 342 | "mimetype": "text/x-python", 343 | "name": "python", 344 | "nbconvert_exporter": "python", 345 | "pygments_lexer": "ipython3", 346 | "version": "3.10.6 (main, Nov 14 2022, 16:10:14) [GCC 11.3.0]" 347 | }, 348 | "vscode": { 349 | "interpreter": { 350 | "hash": "c1aa1ee17a9068633b0ad7418d6283b8ec82042b46ede1c27bec8ef59eb01779" 351 | } 352 | } 353 | }, 354 | "nbformat": 4, 355 | "nbformat_minor": 5 356 | } 357 | -------------------------------------------------------------------------------- /onnx2code/generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import warnings 4 | from collections import defaultdict 5 | from itertools import chain 6 | from pathlib import Path 7 | from textwrap import dedent, indent 8 | 9 | import numpy as np 10 | import onnx 11 | import onnxsim.onnx_simplifier as onnx_simplifier 12 | 13 | from .memory import TensorUsageRecord, find_best_layout 14 | from .ops.operation import OpCall, Operation, OpImpl 15 | from .result import ModelResult 16 | from .tensor import TensorData, parse_tensors 17 | from .util import get_fixed_input_shapes 18 | 19 | REGISTER_ORDER = ["rdi", "rsi", "rdx", "rcx", "r8", "r9"] 20 | INFERENCE_SIGNATURE = "void __attribute__ ((noinline)) inference(const float* weights, const float* inputs, float* outputs)" 21 | 22 | 23 | class Generator: 24 | """ 25 | Code generator 26 | 27 | Proto ref: https://github.com/onnx/onnx/blob/main/docs/IR.md 28 | """ 29 | 30 | def __init__(self, _model_proto: onnx.ModelProto, variations: list[str] = []): 31 | try: 32 | model_proto, check = onnx_simplifier.simplify( 33 | model=_model_proto, 34 | overwrite_input_shapes=get_fixed_input_shapes(_model_proto), 35 | ) 36 | assert check, "ONNX model could not be simplified" 37 | except Exception as e: 38 | model_proto = _model_proto 39 | warnings.warn("Model could not be simplified, using as is (" + str(e) + ")") 40 | 41 | # save model for later inspection 42 | if os.getenv("ONNX2CODE_DEBUG", "0") == "1": 43 | tmp = Path(__file__).parent.parent / "tmp" 44 | tmp.mkdir(exist_ok=True) 45 | onnx.save_model(model_proto, (tmp / "model.onnx").__str__()) 46 | 47 | self.model_proto = model_proto 48 | self.tensors = {tensor.name: tensor for tensor in parse_tensors(model_proto)} 49 | self.variations = variations + ["c", "asm"] 50 | 51 | self.impls: dict[OpImpl, OpCall] = {} 52 | self.calls: list[OpCall] = [] 53 | 54 | def weld_tensors(self, name_from: str, name_to: str) -> None: 55 | """ 56 | Weld tensors together 57 | This means they should point to the same variable in runtime 58 | 59 | :param name_from: Name of the origin tensor 60 | :param name_to: Name of the destination tensor 61 | 
:raises KeyError: If the tensor names are not found 62 | """ 63 | self.tensors[name_to].variable = self.tensors[name_from].variable 64 | 65 | if self.tensors[name_to].tag != "output": 66 | self.tensors[name_to].tag = "welded" 67 | 68 | def generate(self) -> ModelResult: 69 | """ 70 | Generate C and ASM code to run the model 71 | """ 72 | for node in self.model_proto.graph.node: 73 | if node.op_type in [ 74 | # Reshape/Squeeze/Unsqueeze operator ⚠️ SPECIAL CASE ⚠️ 75 | # 76 | # https://github.com/onnx/onnx/blob/main/docs/Operators.md#reshape 77 | # https://github.com/onnx/onnx/blob/main/docs/Operators.md#squeeze 78 | # https://github.com/onnx/onnx/blob/main/docs/Operators.md#unsqueeze 79 | "Reshape", 80 | "Squeeze", 81 | "Unsqueeze", 82 | # have no effect during inference 83 | "Dropout", 84 | "BatchNormalization", # are we sure about this one? 85 | # other kind of reshape 86 | "Flatten", 87 | ]: 88 | # Since it just reshapes the tensor, we don't need to do anything in runtime 89 | # But we must must be weld the input and output tensors (variables/data) 90 | self.weld_tensors(node.input[0], node.output[0]) 91 | 92 | continue 93 | 94 | variants = Operation.get(node.op_type, self.variations) 95 | 96 | impl: (OpImpl | None) = None 97 | call: (OpCall | None) = None 98 | ex: (Exception | None) = None 99 | 100 | # we try all the variants we have available, in the order specified 101 | # if one throws NotImplemented, we try the next one 102 | for var in variants: 103 | try: 104 | op = var( 105 | node, 106 | [self.tensors[name] for name in node.input], 107 | [self.tensors[name] for name in node.output], 108 | ) 109 | impl = op.impl() 110 | call = op.call() 111 | break 112 | except NotImplementedError as _ex: 113 | # keep first 114 | if ex is None: 115 | ex = _ex 116 | 117 | if impl is None or call is None: 118 | assert ex is not None 119 | raise ex 120 | 121 | if call is not None and impl is not None: 122 | if impl in self.impls: 123 | new_name = call.fn_name() 124 | prev_name = self.impls[impl].fn_name() 125 | assert ( 126 | new_name == prev_name 127 | ), "function name should coincide if the implementation is the same" 128 | 129 | self.impls[impl] = call 130 | self.calls.append(call) 131 | 132 | self._compute_memory_layout() 133 | 134 | inputs = [tensor for tensor in self.tensors.values() if tensor.tag == "input"] 135 | outputs = [tensor for tensor in self.tensors.values() if tensor.tag == "output"] 136 | 137 | return ModelResult( 138 | input_shapes={tensor.name: tensor.shape for tensor in inputs}, 139 | output_shapes={tensor.name: tensor.shape for tensor in outputs}, 140 | source_c=self._gen_c_source(), 141 | source_h=f"extern {INFERENCE_SIGNATURE};", 142 | source_asm=self._gen_asm_source(), 143 | weights=self._gen_weights(), 144 | ) 145 | 146 | def _compute_memory_layout(self) -> None: 147 | """ 148 | Finds a good memory layout for intermediate tensors 149 | """ 150 | MAX = 999999999 151 | MIN = -1 152 | 153 | inter_tensors: dict[str, TensorUsageRecord] = {} 154 | 155 | # add all intermediate tensors 156 | for t in self.tensors.values(): 157 | if t.tag == "intermediate": 158 | inter_tensors[t.variable] = TensorUsageRecord(MAX, MIN, t.size) 159 | 160 | # build usage records knowing the order of calls and data dependencies 161 | for index, call in enumerate(self.calls): 162 | # for inputs, make sure we reserve the tensor up to index 163 | for tensor in call.inputs: 164 | if tensor.tag == "intermediate": 165 | rec = inter_tensors[tensor.variable] 166 | rec.last_op = max(rec.last_op, index) 167 | 
168 | # for outputs, make sure we reserve the tensor from at least index 169 | for tensor in call.outputs: 170 | if tensor.tag == "intermediate": 171 | rec = inter_tensors[tensor.variable] 172 | rec.first_op = min(rec.first_op, index) 173 | 174 | # tensors that connect with the output don't have last_op set 175 | # set to first_op + 1 176 | for var, rec in inter_tensors.items(): 177 | assert rec.first_op != MAX, "tensor is never used" 178 | 179 | if rec.last_op == -1: 180 | rec.last_op = rec.first_op + 1 181 | 182 | self.inter_size, offsets = find_best_layout(list(inter_tensors.values())) 183 | self.inter_offsets = {} 184 | 185 | # map tensor names to variables 186 | for var, offset in zip(inter_tensors.keys(), offsets): 187 | self.inter_offsets[var] = offset 188 | 189 | def _gen_weights(self) -> TensorData: 190 | return np.concatenate( 191 | [ 192 | tensor.data.reshape(-1) 193 | for tensor in self.tensors.values() 194 | if tensor.tag == "weight" 195 | and tensor.data is not None 196 | and tensor.data.dtype == np.float32 197 | ] 198 | # concatenate needs at least one array 199 | + [np.array([], dtype=np.float32)], 200 | ) 201 | 202 | def _gen_c_source(self) -> str: 203 | source = "\n".join( 204 | [ 205 | "#include ", 206 | "#include ", 207 | "#include ", 208 | "#include ", 209 | "#define min(a,b) ((a)<(b)?(a):(b))", 210 | "#define max(a,b) ((a)>(b)?(a):(b))", 211 | "float im2col[50000000]; // TODO: do this correctly...", 212 | "", 213 | ] 214 | ) 215 | 216 | # asm auxiliary function declarations 217 | 218 | source += "// Auxiliary functions (ASM):\n\n" 219 | 220 | asm_aux_declarations = [ 221 | f"{asm_aux_function.signature};" 222 | for impl in self.impls.keys() 223 | for asm_aux_function in impl.asm_aux_functions 224 | ] 225 | 226 | source += 'extern "C" {\n' + "\n\n".join(asm_aux_declarations) + "\n}\n\n" 227 | 228 | # loading external files 229 | source += "// External files:\n\n" 230 | 231 | efp = [path for impl in self.impls.keys() for path in impl.external_paths] 232 | external_file_paths = sorted(set(efp), key=efp.index) 233 | 234 | for path in external_file_paths: 235 | source += f"// {path}\n\n" 236 | with open(path, "r") as f: 237 | source += f.read() + "\n" 238 | 239 | source += "\n" * 2 240 | 241 | # c++ auxiliary functions 242 | 243 | source += "// Auxiliary functions (C++):\n\n" 244 | 245 | cpp_aux_functions = list( 246 | dict.fromkeys( 247 | chain.from_iterable( 248 | impl.cpp_aux_functions for impl in self.impls.keys() 249 | ) 250 | ) 251 | ) 252 | 253 | source += "\n".join(cpp_aux_functions) + "\n" * 2 254 | 255 | # define ASM functions in C 256 | 257 | source += "// ASM functions:\n\n" 258 | 259 | for impl, call in self.impls.items(): 260 | if impl.lang == "asm": 261 | source += f"extern {call.signature()};" 262 | 263 | source += "\n" * 2 264 | 265 | # implementations 266 | 267 | source += "// Implementations:\n\n" 268 | 269 | for impl, call in self.impls.items(): 270 | if impl.lang == "c": 271 | source += call.signature() + " {\n" 272 | source += indent(impl.full_source().strip(), prefix=" " * 4) 273 | source += "\n}\n" 274 | 275 | # define intermediate tensor 276 | # it is a shared buffer 277 | source += "\n" * 2 278 | source += f"float intermediates[{self.inter_size}];" 279 | source += "\n" * 2 280 | 281 | inference_source = "" 282 | io_offsets: defaultdict[str, int] = defaultdict(int) 283 | # build tensor variables 284 | for tensor in self.tensors.values(): 285 | if tensor.tag != "welded": 286 | if ( 287 | tensor.tag == "weight" 288 | and tensor.data is not None 289 | 
and tensor.data.dtype != np.float32 290 | ): 291 | # weight with no data or invalid, skip 292 | continue 293 | 294 | if tensor.tag == "intermediate": 295 | # IF an intermediate tensor is welded with the output 296 | # we want to preserve the output tensor instead of the intermediate one 297 | # so we skip the definition of the intermediate in favor of the output 298 | skip = False 299 | for other in self.tensors.values(): 300 | if other.tag == "output" and other.variable == tensor.variable: 301 | # already defined as output 302 | skip = True 303 | break 304 | if skip: 305 | continue 306 | 307 | if tensor.tag == "intermediate": 308 | offset = self.inter_offsets[tensor.variable] 309 | assert offset is not None, "invliad offset" 310 | else: # input, output or weight 311 | offset = io_offsets[tensor.tag] 312 | io_offsets[tensor.tag] += tensor.size 313 | 314 | decl = "const " if tensor.tag in ["input", "weight"] else "" 315 | decl += f"float* {tensor.variable} = " 316 | decl += f"{tensor.tag}s + {offset};" 317 | 318 | else: 319 | # welded 320 | continue 321 | 322 | decl = f"\n{decl : <34} // ({tensor.shape_str()}) {tensor.name}" 323 | inference_source += decl 324 | 325 | # make op calls 326 | inference_source += "\n" 327 | for call in self.calls: 328 | inference_source += f"\n{call.invocation()};" 329 | 330 | source += INFERENCE_SIGNATURE + " {" 331 | source += indent(inference_source, prefix=" " * 4) 332 | source += "\n}" 333 | 334 | return source 335 | 336 | def _gen_asm_source(self) -> str: 337 | source = "" 338 | 339 | # asm auxiliary functions 340 | 341 | for impl in self.impls.keys(): 342 | for asm_aux_function in impl.asm_aux_functions: 343 | # extract name from signature 344 | regex = re.compile(r"(\w+)\s*\(") 345 | match = regex.search(asm_aux_function.signature) 346 | assert match is not None, "invalid signature" 347 | name = match.group(1) 348 | 349 | function_source = indent( 350 | dedent(asm_aux_function.source), prefix=" " * 4 351 | ) 352 | 353 | source += f"global {name}\n{name}:{function_source}\n\n" 354 | 355 | for impl, call in self.impls.items(): 356 | if impl.lang == "asm": 357 | comments = [call.signature()] + [ 358 | f"{p}: {REGISTER_ORDER[i]}" 359 | for i, p in enumerate( 360 | call.input_names[: len(call.inputs)] 361 | + call.output_names[: len(call.outputs)] 362 | ) 363 | ] 364 | source += "\n\n".join( 365 | [ 366 | *[f";; {c}" for c in comments], 367 | f"global {call.fn_name()}", 368 | f"{call.fn_name()}:", 369 | indent(impl.full_source(), prefix=" " * 4), 370 | ] 371 | ) 372 | 373 | return source.strip() + "\n" 374 | --------------------------------------------------------------------------------
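A note on the memory planner in _compute_memory_layout above: every intermediate tensor is reduced to a TensorUsageRecord(first_op, last_op, size) and the list is handed to find_best_layout from onnx2code/memory.py (not reproduced in this dump), which returns the total size of the shared intermediates buffer together with one offset per record. Below is a minimal sketch of that contract with made-up lifetimes and sizes; the actual packing strategy and the size units are defined in memory.py.

    from onnx2code.memory import TensorUsageRecord, find_best_layout

    # Hypothetical usage records, built the way _compute_memory_layout builds them:
    # (first_op, last_op, size) — the index range of the op calls that touch the tensor.
    records = [
        TensorUsageRecord(0, 1, 1024),  # live while calls 0..1 execute
        TensorUsageRecord(1, 2, 2048),  # overlaps the first record in time
        TensorUsageRecord(2, 3, 1024),  # disjoint from the first, so its region may be reused
    ]

    total_size, offsets = find_best_layout(records)
    # total_size dimensions the shared "intermediates" buffer emitted in _gen_c_source;
    # offsets[i] is where the tensor behind records[i] is placed inside that buffer.
    print(total_size, offsets)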
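And a rough end-to-end sketch of driving the Generator class defined in onnx2code/generator.py. This is not a script from the repository: "model.onnx" and the output file names are placeholders, and onnx must be installed.

    import onnx
    from onnx2code.generator import Generator

    model_proto = onnx.load("model.onnx")        # placeholder path to an ONNX model
    gen = Generator(model_proto, variations=[])  # "c" and "asm" are always appended as fallbacks
    result = gen.generate()                      # ModelResult with sources, shapes and weights

    print(result.input_shapes, result.output_shapes)

    with open("model.cpp", "w") as f:            # generated C/C++ source (uses extern "C" blocks)
        f.write(result.source_c)
    with open("model.hpp", "w") as f:            # header declaring the inference() entry point
        f.write(result.source_h)
    with open("model.asm", "w") as f:            # NASM-style assembly for the asm implementations
        f.write(result.source_asm)

    result.weights.tofile("weights.bin")         # flat float32 weight buffer (a NumPy array) fed to inference()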