├── .clang-format ├── .github └── workflows │ ├── black-ruff.yml │ ├── check-release.yml │ ├── check-urls.yml │ ├── clang.yml │ ├── cmakelint.yml │ ├── documentation.yml │ ├── mypy.yml │ ├── rstcheck.yml │ ├── wheels-linux.yml │ ├── wheels-mac.yml │ └── wheels-windows.yml ├── .gitignore ├── CHANGELOGS.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── _cmake ├── CMakeLists.txt ├── clang_format.sh ├── constants.cmake ├── externals │ ├── CPM.cmake │ ├── FindCudaExtension.cmake │ ├── FindCython.cmake │ ├── FindLocalEigen.cmake │ ├── FindLocalMatX.cmake │ ├── FindLocalPyBind11.cmake │ ├── FindMyPython.cmake │ └── FindOrt.cmake ├── finalize.cmake ├── intrin.sh ├── load_externals.cmake ├── targets │ ├── _validation.cmake │ ├── _validation_cuda_example_py.cmake │ ├── _validation_cuda_monitor.cmake │ ├── c_op_conv_.cmake │ ├── c_op_svm_py_.cmake │ ├── c_op_tfidf_vectorizer_py_.cmake │ ├── c_op_tree_ensemble_py_.cmake │ ├── common.cmake │ ├── common_kernels.cmake │ ├── fp8_cy.cmake │ ├── ortinf.cmake │ ├── ortops_optim_cpu.cmake │ ├── ortops_optim_cuda.cmake │ ├── ortops_tutorial_cpu.cmake │ └── ortops_tutorial_cuda.cmake └── test_constants.h.in ├── _doc ├── _static │ ├── logo.png │ ├── profile.png │ ├── vector_sum6.png │ └── vector_sum6_results.png ├── api │ ├── check.rst │ ├── ext_test_case.rst │ ├── helper.rst │ ├── index.rst │ ├── memory_peak.rst │ ├── ortcy.rst │ ├── ortops.rst │ ├── ortops_optim_cpu.rst │ ├── ortops_optim_cuda.rst │ ├── ortops_tutorial_cpu.rst │ ├── ortops_tutorial_cuda.rst │ ├── plotting.rst │ ├── reference.rst │ ├── tools.rst │ ├── tools_einsum.rst │ ├── tools_graph.rst │ ├── tools_graph_transformer.rst │ ├── tools_inline.rst │ ├── tools_io.rst │ ├── tools_nodes.rst │ ├── tools_other.rst │ ├── tools_stats.rst │ ├── validation.rst │ ├── validation_cpu.rst │ ├── validation_cuda.rst │ ├── validation_sparse.rst │ └── validation_trees.rst ├── benchmarks.rst ├── command_lines.rst ├── conf.py ├── examples │ ├── README.txt │ ├── plot_bench_cpu.py │ ├── plot_bench_cypy_ort.py │ ├── plot_bench_gemm_f8.py │ ├── plot_bench_gemm_ort.py │ ├── plot_bench_sparse_access.py │ ├── plot_op_conv_denorm.py │ ├── plot_op_conv_py_vs_c.py │ ├── plot_op_einsum.py │ ├── plot_op_gemm2_cuda.py │ ├── plot_op_mul_cuda.py │ ├── plot_op_scatternd_cuda.py │ ├── plot_op_scatternd_mask_cuda.py │ ├── plot_op_tfidfvectorizer_sparse.py │ ├── plot_op_transpose_2d_cast_cuda.py │ ├── plot_op_tree_ensemble_implementations.py │ ├── plot_op_tree_ensemble_optim.py │ ├── plot_op_tree_ensemble_sparse.py │ └── plot_profile_gemm_ort.py ├── index.rst ├── license.rst ├── tech │ ├── 2023-09-05-glibc.rst │ ├── gemm.rst │ ├── index.rst │ ├── install_cuda_wsl.rst │ └── usefulcmd.rst └── tutorial │ ├── build.rst │ ├── build_cuda.rst │ ├── build_cython.rst │ ├── build_ortext.rst │ ├── build_pybind11.rst │ ├── custom_ops.rst │ ├── cython_binding.rst │ ├── external_data.rst │ ├── images │ └── plot_optim_tree_ensemble.png │ ├── index.rst │ ├── many_tools.rst │ ├── old_version.rst │ ├── onnx_manipulations.rst │ ├── ops.rst │ ├── ort_debug.rst │ ├── parallelization.rst │ ├── profiling.rst │ ├── quantize.rst │ ├── readings.rst │ ├── reference_evaluator.rst │ ├── statistics.rst │ └── trees.rst ├── _unittests ├── onnx_extended_test_common.h ├── ut_helper │ └── test_make_helper.py ├── ut_ortcy │ ├── data │ │ └── add.onnx │ ├── test_inference.cpp │ └── test_ortcy.py ├── ut_ortops │ ├── data │ │ ├── plot_op_tree_ensemble_implementations_custom.onnx │ │ └── plot_op_tree_ensemble_implementations_sparse.onnx │ ├── test_inference_tree.cpp │ 
├── test_optim_cuda.py │ ├── test_optim_py.py │ ├── test_optim_sparse.py │ ├── test_optim_svm.py │ ├── test_optim_tfidf_vectorizer.py │ ├── test_optim_tfidf_vectorizer_sparse.py │ ├── test_optim_tree_ensemble.py │ ├── test_optim_tree_ensemble_sparse.py │ ├── test_optim_tree_ensemble_sparse_xgboost.py │ ├── test_tutorial_cpu.py │ ├── test_tutorial_cpu_tree.py │ ├── test_tutorial_gemm_cpu.py │ └── test_tutorial_gemm_cuda.py ├── ut_plotting │ └── test_plotting_benchmark.py ├── ut_reference │ ├── test_backend_c_reference_evaluator.py │ ├── test_c_op_conv.cpp │ ├── test_c_reference_evaluator.py │ ├── test_c_reference_evaluator_save.py │ ├── test_c_svm.py │ ├── test_c_tfidf_vectorizer.py │ ├── test_c_tree_ensemble.py │ └── test_sparse_tensor.py ├── ut_tools │ ├── bench │ │ ├── model.onnx │ │ └── test_data_set_0 │ │ │ ├── input_0.pb │ │ │ ├── input_1.pb │ │ │ └── output_0.pb │ ├── bench_rf │ │ ├── model.onnx │ │ └── test_data_set_0 │ │ │ ├── input_0.pb │ │ │ └── output_0.pb │ ├── data │ │ └── debug_4700-CPUep.onnx │ ├── test_einsum.py │ ├── test_einsum_benchmark.py │ ├── test_einsum_blas_lapack.py │ ├── test_einsum_bug.py │ ├── test_einsum_einsum.py │ ├── test_einsum_generic_dot.py │ ├── test_einsum_ml.py │ ├── test_einsum_onnx_micro_runtime.py │ ├── test_js_profile.py │ ├── test_onnx_inline.py │ ├── test_onnx_tools.py │ ├── test_onnx_tools_graph.py │ ├── test_onnx_tools_quantize_fp8.py │ ├── test_optim_onnx_unused.py │ ├── test_ort_debug.py │ ├── test_run_onnx.py │ ├── test_simple.py │ └── test_stats_nodes.py ├── ut_validation │ ├── test_bench_tree.py │ ├── test_cpu_fpemu.cpp │ ├── test_cpu_fpemu.py │ ├── test_cuda_fpemu.py │ ├── test_cuda_gemm.py │ ├── test_cuda_monitor.py │ ├── test_fp8.py │ ├── test_hash.py │ ├── test_sparse_struct.py │ └── test_speed_metrics.py └── ut_xrun_doc │ ├── test_args.py │ ├── test_command_lines1.py │ ├── test_command_lines2.py │ ├── test_documentation_examples.py │ ├── test_memory_peak.py │ └── test_version.py ├── azure-pipelines.yml ├── clean_build.sh ├── clean_onnx.sh ├── onnx_extended ├── __init__.py ├── __main__.py ├── _command_lines.py ├── _command_lines_parser.py ├── _common.py ├── args.py ├── cpp │ ├── __init__.py │ ├── c_op_allocation.cpp │ ├── c_op_common_parameters.cpp │ ├── cpu │ │ └── __init__.py │ ├── cuda │ │ └── __init__.py │ ├── include │ │ ├── __init__.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── c_op_allocation.h │ │ │ ├── c_op_common_parallel.hpp │ │ │ ├── c_op_common_parameters.h │ │ │ ├── c_op_helpers.h │ │ │ ├── c_op_math.h │ │ │ ├── c_op_status.h │ │ │ ├── common_kernels.h │ │ │ ├── simple_span.h │ │ │ └── sparse_tensor.h │ │ ├── cpu │ │ │ ├── __init__.py │ │ │ ├── c_op_conv.h │ │ │ ├── c_op_conv_common.h │ │ │ ├── c_op_svm_common_.hpp │ │ │ ├── c_op_tfidf_vectorizer_.hpp │ │ │ ├── c_op_tree_ensemble_common_.hpp │ │ │ ├── c_op_tree_ensemble_common_agg_.hpp │ │ │ ├── c_op_tree_ensemble_common_classifier_.hpp │ │ │ └── cast_fp8.h │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ └── common_kernels_cuda.h │ │ ├── onnx_extended_helpers.h │ │ ├── ortapi_c_api_header.h │ │ └── ortapi_version.h │ └── onnx_extended_helpers.cpp ├── ext_test_case.py ├── helper │ ├── __init__.py │ ├── make_dynamic_quantize_linear.py │ └── make_reshape_transpose.py ├── memory_peak.py ├── ortcy │ ├── __init__.py │ └── wrap │ │ ├── __init__.py │ │ ├── ortapi.cpp │ │ ├── ortapi.h │ │ ├── ortapi_inline.h │ │ └── ortinf.pyx ├── ortops │ ├── __init__.py │ ├── optim │ │ ├── __init__.py │ │ ├── cpu │ │ │ ├── __init__.py │ │ │ ├── ort_optim_cpu_lib.cc │ │ │ ├── ort_optim_cpu_lib.h │ │ 
│ ├── ort_sparse.h │ │ │ ├── ort_sparse.hpp │ │ │ ├── ort_svm.h │ │ │ ├── ort_svm.hpp │ │ │ ├── ort_tfidf_vectorizer.h │ │ │ ├── ort_tfidf_vectorizer.hpp │ │ │ ├── ort_tree_ensemble.h │ │ │ └── ort_tree_ensemble.hpp │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── add_or_mul_shared_input.cu │ │ │ ├── add_or_mul_shared_input.h │ │ │ ├── addaddaddmulmulmul.cu │ │ │ ├── addaddaddmulmulmul.h │ │ │ ├── addaddmulmul.cu │ │ │ ├── addaddmulmul.h │ │ │ ├── addmul.cu │ │ │ ├── addmul.h │ │ │ ├── mul_mul_sigmoid.cu │ │ │ ├── mul_mul_sigmoid.h │ │ │ ├── mul_sigmoid.cu │ │ │ ├── mul_sigmoid.h │ │ │ ├── negxplus1.cu │ │ │ ├── negxplus1.h │ │ │ ├── ort_optim_cuda_lib.cc │ │ │ ├── ort_optim_cuda_lib.h │ │ │ ├── replace_zero.cu │ │ │ ├── replace_zero.h │ │ │ ├── rotary.cu │ │ │ ├── rotary.h │ │ │ ├── scatter_nd_of_shape.cu │ │ │ ├── scatter_nd_of_shape.h │ │ │ ├── scatter_nd_of_shape_common.h │ │ │ ├── scatter_nd_of_shape_masked.cu │ │ │ ├── scatter_nd_of_shape_masked.h │ │ │ ├── submul.cu │ │ │ ├── submul.h │ │ │ ├── transpose_cast_2d.cu │ │ │ ├── transpose_cast_2d.h │ │ │ ├── tri_matrix.cu │ │ │ └── tri_matrix.h │ │ └── optimize.py │ └── tutorial │ │ ├── __init__.py │ │ ├── cpu │ │ ├── __init__.py │ │ ├── custom_gemm.cc │ │ ├── custom_gemm.h │ │ ├── custom_tree_assembly.cc │ │ ├── custom_tree_assembly.h │ │ ├── dynamic_quantize_linear.cc │ │ ├── dynamic_quantize_linear.h │ │ ├── my_kernel.cc │ │ ├── my_kernel.h │ │ ├── my_kernel_attr.cc │ │ ├── my_kernel_attr.h │ │ ├── ort_tutorial_cpu_lib.cc │ │ └── ort_tutorial_cpu_lib.h │ │ └── cuda │ │ ├── __init__.py │ │ ├── custom_gemm.cu │ │ ├── custom_gemm.h │ │ ├── matx_matmul.cu │ │ ├── matx_matmul.h │ │ ├── ort_tutorial_cuda_lib.cc │ │ └── ort_tutorial_cuda_lib.h ├── plotting │ ├── __init__.py │ ├── benchmark.py │ └── data.py ├── reference │ ├── __init__.py │ ├── c_custom_ops │ │ ├── __init__.py │ │ └── custom_op_tree_ensemble_regressor.py │ ├── c_ops │ │ ├── __init__.py │ │ ├── _op_classifier_common.py │ │ ├── c_op_conv.py │ │ ├── c_op_svm_classifier.py │ │ ├── c_op_svm_regressor.py │ │ ├── c_op_tfidf_vectorizer.py │ │ ├── c_op_tree_ensemble_classifier.py │ │ ├── c_op_tree_ensemble_regressor.py │ │ └── cpu │ │ │ ├── __init__.py │ │ │ ├── c_op_conv_.cpp │ │ │ ├── c_op_conv_pybind11.h │ │ │ ├── c_op_svm_py_.cpp │ │ │ ├── c_op_tfidf_vectorizer_py_.cpp │ │ │ ├── c_op_tree_ensemble_py_.cpp │ │ │ ├── c_op_tree_ensemble_py_.hpp │ │ │ └── c_op_tree_ensemble_py_classifier_.hpp │ ├── c_reference_backend.py │ ├── c_reference_evaluator.py │ └── other_ops │ │ ├── __init__.py │ │ ├── op_scatternd_of_shape.py │ │ └── op_tokenizer.py ├── tools │ ├── __init__.py │ ├── einsum │ │ ├── __init__.py │ │ ├── blas_lapack.py │ │ ├── einsum_bench.py │ │ ├── einsum_config.py │ │ ├── einsum_fct.py │ │ ├── einsum_impl.py │ │ ├── einsum_impl_classes.py │ │ ├── einsum_impl_ext.py │ │ └── einsum_ml.py │ ├── graph │ │ ├── __init__.py │ │ ├── errors.py │ │ ├── onnx_custom_ops.py │ │ ├── onnx_graph_struct.py │ │ └── onnx_graph_transformer.py │ ├── js_profile.py │ ├── onnx_inline.py │ ├── onnx_io.py │ ├── onnx_nodes.py │ ├── ort_debug.py │ ├── run_onnx.py │ ├── run_onnx_main.py │ └── stats_nodes.py └── validation │ ├── __init__.py │ ├── _tree_d14_f100.py │ ├── bench_trees.py │ ├── cpu │ ├── __init__.py │ ├── _validation.cpp │ ├── cpu_fpemu.hpp │ ├── murmur_hash3.cpp │ ├── murmur_hash3.h │ ├── speed_metrics.cpp │ ├── speed_metrics.h │ ├── vector_sparse.cpp │ └── vector_sparse.h │ ├── cuda │ ├── __init__.py │ ├── cuda_example_py.cpp │ ├── cuda_fpemu.cu │ ├── cuda_fpemu.cuh │ ├── cuda_gemm.cu │ ├── 
cuda_gemm.cuh │ ├── cuda_monitor.cpp │ ├── cuda_nvtx.cuh │ ├── cuda_tensor.cu │ ├── cuda_tensor.cuh │ └── cuda_utils.h │ └── cython │ ├── __init__.py │ └── fp8.pyx ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── setup.py /.clang-format: -------------------------------------------------------------------------------- 1 | ColumnLimit: 96 2 | -------------------------------------------------------------------------------- /.github/workflows/black-ruff.yml: -------------------------------------------------------------------------------- 1 | name: Black + Ruff Format Checker 2 | on: [push, pull_request] 3 | jobs: 4 | black-format-check: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: psf/black@stable 9 | with: 10 | options: "--diff --check" 11 | src: "." 12 | ruff-format-check: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: chartboost/ruff-action@v1 17 | -------------------------------------------------------------------------------- /.github/workflows/check-urls.yml: -------------------------------------------------------------------------------- 1 | name: Check URLs 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | schedule: 7 | # ┌───────────── minute (0 - 59) 8 | # │ ┌───────────── hour (0 - 23) 9 | # │ │ ┌───────────── day of the month (1 - 31) 10 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) 11 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) 12 | # │ │ │ │ │ 13 | # │ │ │ │ │ 14 | # │ │ │ │ │ 15 | # * * * * * 16 | - cron: '30 1 * * 0' 17 | 18 | jobs: 19 | check-urls: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - name: urls-checker-code 26 | uses: urlstechie/urlchecker-action@master 27 | with: 28 | subfolder: onnx_extended 29 | file_types: .md,.py,.rst,.ipynb 30 | print_all: false 31 | timeout: 2 32 | # retry_count : 2 33 | exclude_urls: https://github.com/microsoft/onnxruntime/blob/ 34 | exclude_patterns: https://github.com/microsoft/onnxruntime/blob/ 35 | # force_pass : true 36 | 37 | - name: urls-checker-docs 38 | uses: urlstechie/urlchecker-action@master 39 | with: 40 | subfolder: _doc 41 | file_types: .md,.py,.rst,.ipynb 42 | print_all: false 43 | timeout: 2 44 | # retry_count : 2 45 | exclude_urls: https://github.com/Kitware/CMake/releases/download/v${cmake_version}/cmake-$,https://developer.download.nvidia.com/compute/cuda/$ 46 | exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/ 47 | # force_pass : true 48 | -------------------------------------------------------------------------------- /.github/workflows/clang.yml: -------------------------------------------------------------------------------- 1 | name: Clang Format Checker 2 | on: [push] 3 | jobs: 4 | clang-format-checking: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: RafikFarhad/clang-format-github-action@v3 9 | with: 10 | sources: "src/**/*.h,src/**/*.c,test/**/*.c" 11 | -------------------------------------------------------------------------------- /.github/workflows/cmakelint.yml: -------------------------------------------------------------------------------- 1 | name: Cmake Format Checker 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Format CMake files 14 | id: cmake-format 15 | uses: 
PuneetMatharu/cmake-format-lint-action@v1.0.0 16 | with: 17 | args: --check 18 | 19 | - name: Commit changes 20 | uses: stefanzweifel/git-auto-commit-action@v4 21 | with: 22 | commit_user_name: cmake-format-bot 23 | commit_message: 'Automated commit of cmake-format changes.' 24 | -------------------------------------------------------------------------------- /.github/workflows/mypy.yml: -------------------------------------------------------------------------------- 1 | name: Type annotation with mypy 2 | on: [push, pull_request] 3 | jobs: 4 | mypy: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v3 8 | - uses: actions/setup-python@v4 9 | with: 10 | python-version: '3.11' 11 | - name: Install mypy 12 | run: pip install mypy 13 | - name: Run mypy 14 | run: mypy 15 | -------------------------------------------------------------------------------- /.github/workflows/rstcheck.yml: -------------------------------------------------------------------------------- 1 | name: RST Check 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build_wheels: 7 | name: rstcheck ${{ matrix.os }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [ubuntu-latest] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | # Used to host cibuildwheel 17 | - uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.11' 20 | 21 | - name: Install requirements 22 | run: python -m pip install -r requirements.txt 23 | 24 | - name: Install rstcheck 25 | run: python -m pip install sphinx tomli rstcheck[toml,sphinx] 26 | 27 | - name: rstcheck 28 | run: rstcheck -r _doc onnx_extended 29 | -------------------------------------------------------------------------------- /.github/workflows/wheels-linux.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheel Linux 2 | 3 | on: 4 | push: 5 | # branches: 6 | # - main 7 | # - 'releases/**' 8 | pull_request: 9 | # types: 10 | # - closed 11 | # branches: 12 | # - main 13 | #on: 14 | # push: 15 | # branches: 16 | # - main 17 | # - 'releases/**' 18 | 19 | jobs: 20 | build_wheels: 21 | name: Build wheels on ${{ matrix.os }} 22 | runs-on: ${{ matrix.os }} 23 | strategy: 24 | matrix: 25 | os: [ubuntu-latest] 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | # Used to host cibuildwheel 31 | - uses: actions/setup-python@v4 32 | with: 33 | python-version: '3.11' 34 | 35 | - name: Install cibuildwheel 36 | run: python -m pip install cibuildwheel 37 | 38 | - name: python version 39 | run: python -V 40 | 41 | - name: Build wheels 42 | run: python -m cibuildwheel --output-dir wheelhouse 43 | 44 | - uses: actions/upload-artifact@v4 45 | with: 46 | path: ./wheelhouse/*.whl 47 | -------------------------------------------------------------------------------- /.github/workflows/wheels-mac.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheel MacOS 2 | 3 | on: 4 | push: 5 | # branches: 6 | # - main 7 | # - 'releases/**' 8 | pull_request: 9 | # types: 10 | # - closed 11 | # branches: 12 | # - main 13 | 14 | jobs: 15 | build_wheels: 16 | name: Build wheels on ${{ matrix.os }} 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [macOS-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | # Used to host cibuildwheel 26 | - uses: actions/setup-python@v4 27 | with: 28 | python-version: '3.11' 29 | 30 | - name: Install cibuildwheel 31 | run: python -m pip install cibuildwheel 32 | 33 | - name: python version 34 | run: 
python -V 35 | 36 | - name: Build wheels 37 | run: python -m cibuildwheel --output-dir wheelhouse 38 | continue-on-error: true 39 | 40 | - uses: actions/upload-artifact@v4 41 | with: 42 | path: ./wheelhouse/*.whl 43 | -------------------------------------------------------------------------------- /.github/workflows/wheels-windows.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheel Windows 2 | 3 | on: 4 | push: 5 | # branches: 6 | # - main 7 | # - 'releases/**' 8 | pull_request: 9 | # types: 10 | # - closed 11 | # branches: 12 | # - main 13 | 14 | jobs: 15 | build_wheels: 16 | name: Build wheels on ${{ matrix.os }} 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [windows-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | # Used to host cibuildwheel 26 | - uses: actions/setup-python@v4 27 | with: 28 | python-version: '3.11' 29 | 30 | - name: Install cibuildwheel 31 | run: python -m pip install cibuildwheel 32 | 33 | - name: python version 34 | run: python -V 35 | 36 | - name: Build wheels 37 | run: python -m cibuildwheel 38 | 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | path: ./wheelhouse/*.whl 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyd 3 | *.dylib 4 | *.so 5 | *.so.* 6 | *.dll 7 | *.vcxproj* 8 | *.tcl 9 | *.sln 10 | *.cmake 11 | *.whl 12 | *.def 13 | *.ll 14 | *.pdb 15 | *.s 16 | /*.png 17 | /*.onnx 18 | .build_path.txt 19 | .hypothesis/* 20 | coverage.html/* 21 | _cache/* 22 | _deps/* 23 | .vs/* 24 | *.dir/* 25 | Release/* 26 | Testing/* 27 | plot_*.csv 28 | plot_*.xlsx 29 | *.data 30 | test_ort_version* 31 | x64/* 32 | CMakeFiles/* 33 | dist/* 34 | build/* 35 | .eggs/* 36 | *egg-info/* 37 | .coverage 38 | CMakeCache.txt 39 | onnxruntime_*.json 40 | _doc/LICENSE.rst 41 | _doc/LICENSE.txt 42 | _doc/CHANGELOGS.rst 43 | _doc/examples/_cache/* 44 | _doc/sg_execution_times.rst 45 | _doc/auto_examples/* 46 | _doc/examples/*.xlsx 47 | _doc/examples/plot*.csv 48 | _doc/examples/plot*.onnx 49 | _doc/examples/plot_*.png 50 | _doc/examples/plot_*.csv 51 | _doc/examples/plot_*.onnx 52 | _doc/examples/plot_*.xlsx 53 | _doc/_static/require.js 54 | _doc/_static/viz.js 55 | _unittests/ut__main/*.png 56 | _unittests/test_constants.h 57 | onnx_extended/_config.py 58 | onnx_extended/validation/cython/*.c 59 | onnx_extended/validation/cython/*.cpp 60 | onnx_extended/validation/cython/vector_function_cy.c* 61 | onnx_extended/ortcy/wrap/ortinf.c* 62 | onnx_extended/ortcy/wrap/*.lib 63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023-2024, Xavier Dupré 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include onnx_extended *.c *.cpp *.h *.pyx *.pxd *.pxi *.py 2 | recursive-include _cmake *.cmake *.in *.txt *.sh *.in 3 | include pyproject.toml 4 | include MANIFEST.in 5 | include setup.cfg 6 | prune _doc 7 | prune _unittests 8 | exclude *.yml 9 | exclude *.git* 10 | # cython files to exclude 11 | exclude onnx_extended/ortcy/ortinf.cpp 12 | exclude onnx_extended/validation/cython/fp8.cpp 13 | -------------------------------------------------------------------------------- /_cmake/clang_format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | clear 3 | echo "--ruff--" 4 | ruff check . 5 | echo "--cython-lint--" 6 | cython-lint . 7 | echo "--clang-format--" 8 | find onnx_extended -type f \( -name "*.h" -o -name "*.hpp" -o -name "*.cuh" -o -name "*.cpp" -o -name "*.cc" -o -name "*.cu" \) | while read f; do 9 | echo "clang-format -i $f"; 10 | clang-format -i $f; 11 | done 12 | echo "--cmake-lint--" 13 | find _cmake -type f \( -name "*.cmake" -o -name "*.txt" \) | while read f; do 14 | echo "cmake-lint $f --line-width=88 --disabled-codes C0103 C0113"; 15 | cmake-lint $f --line-width=88 --disabled-codes C0103 C0113; 16 | done 17 | -------------------------------------------------------------------------------- /_cmake/externals/FindLocalEigen.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # initialization 3 | # 4 | # function eigen_add_dependency 5 | # output variables LOCAL_EIGEN_FOUND, LOCAL_EIGEN_TARGET 6 | 7 | if(NOT LOCAL_EIGEN_VERSION) 8 | set(LOCAL_EIGEN_VERSION "3.4.0") 9 | endif() 10 | string(SUBSTRING "${LOCAL_EIGEN_VERSION}" 0 3 SHORT_EIGEN_VERSION) 11 | set(LOCAL_EIGEN_ROOT https://gitlab.com/libeigen/eigen/-/archive/) 12 | set(LOCAL_EIGEN_NAME "eigen-${LOCAL_EIGEN_VERSION}.zip") 13 | set(LOCAL_EIGEN_URL "${LOCAL_EIGEN_ROOT}${LOCAL_EIGEN_VERSION}/${LOCAL_EIGEN_NAME}") 14 | set(LOCAL_EIGEN_DEST "${CMAKE_CURRENT_BINARY_DIR}/eigen-download/${LOCAL_EIGEN_NAME}") 15 | set(LOCAL_EIGEN_DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}/eigen-bin/") 16 | 17 | FetchContent_Declare(eigen URL ${LOCAL_EIGEN_URL}) 18 | 19 | # This instruction add all the available targets in eigen 20 | # including unit tests. 21 | # FetchContent_makeAvailable(eigen) 22 | 23 | FetchContent_Populate(eigen) 24 | 25 | list(APPEND CMAKE_MODULE_PATH "${eigen_SOURCE_DIR}/cmake") 26 | # find_package(Eigen3) 27 | 28 | set(LOCAL_EIGEN_SOURCE "${eigen_SOURCE_DIR}") 29 | 30 | # find_package(Eigen3 ${SHORT_EIGEN_VERSION} REQUIRED NO_MODULE) 31 | set(LOCAL_EIGEN_TARGET Eigen3::Eigen) 32 | set(LOCAL_EIGEN_VERSION ${Eigen3_VERSION}) 33 | set(EIGEN_INCLUDE_DIRS "${eigen_SOURCE_DIR}") 34 | 35 | # 36 | # !eigen_add_dependency: add a dependency to eigen. 
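# A hedged usage sketch (`my_target` is a hypothetical target name):
#   eigen_add_dependency(my_target)
# As the function body below shows, this only adds ${EIGEN_INCLUDE_DIRS}
# to the private include directories of the given target.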
37 | # 38 | # 39 | # \arg:name target name 40 | # 41 | function(eigen_add_dependency name) 42 | target_include_directories(${name} PRIVATE ${EIGEN_INCLUDE_DIRS}) 43 | endfunction() 44 | 45 | include(FindPackageHandleStandardArgs) 46 | find_package_handle_standard_args( 47 | LocalEigen 48 | VERSION_VAR LOCAL_EIGEN_VERSION 49 | REQUIRED_VARS LOCAL_EIGEN_TARGET LOCAL_EIGEN_URL LOCAL_EIGEN_SOURCE 50 | EIGEN_INCLUDE_DIRS) 51 | -------------------------------------------------------------------------------- /_cmake/externals/FindLocalMatX.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # initialization 3 | # 4 | # defines matx matx_SOURCE_DIR matx_BINARY_DIR 5 | 6 | # 7 | # matx 8 | # 9 | 10 | set(matx_TAG "v0.8.0") 11 | 12 | include(FetchContent) 13 | FetchContent_Declare( 14 | matx 15 | GIT_REPOSITORY https://github.com/NVIDIA/matx 16 | GIT_TAG ${matx_TAG}) 17 | 18 | FetchContent_MakeAvailable(matx) 19 | FetchContent_GetProperties(matx) 20 | 21 | set(matx_VERSION ${matx_TAG}) 22 | set(MATX_INCLUDE_DIR "${matx_SOURCE_DIR}/include") 23 | message(STATUS "matx_BINARY_DIR=${matx_BINARY_DIR}") 24 | message(STATUS "matx_SOURCE_DIR=${matx_SOURCE_DIR}") 25 | message(STATUS "MATX_INCLUDE_DIR=${MATX_INCLUDE_DIR}") 26 | message(STATUS "matx_VERSION=${matx_VERSION}") 27 | 28 | include(FindPackageHandleStandardArgs) 29 | find_package_handle_standard_args( 30 | LocalMatX 31 | VERSION_VAR matx_VERSION 32 | REQUIRED_VARS matx_SOURCE_DIR matx_BINARY_DIR) 33 | -------------------------------------------------------------------------------- /_cmake/finalize.cmake: -------------------------------------------------------------------------------- 1 | 2 | if(CUDA_AVAILABLE) 3 | set(config_content_cuda 4 | "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'" 5 | "\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}") 6 | else() 7 | set(config_content_cuda "HAS_CUDA = 0") 8 | endif() 9 | 10 | set(config_content_comma 11 | "${config_content_cuda}" 12 | "\nORT_VERSION = '${ORT_VERSION}'" 13 | "\nORT_VERSION_INT = ${ORT_VERSION_INT}" 14 | "\nCXX_FLAGS = '${CMAKE_CXX_FLAGS}'" 15 | "\nCMAKE_CXX_STANDARD_REQUIRED = '${CMAKE_CXX_STANDARD_REQUIRED}'" 16 | "\nCMAKE_CXX_EXTENSIONS = '${CMAKE_CXX_EXTENSIONS}'" 17 | "\nCMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}\n") 18 | 19 | string(REPLACE ";" "" config_content "${config_content_comma}") 20 | -------------------------------------------------------------------------------- /_cmake/intrin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | get_instruction () 4 | { 5 | [ -z "$1" ] && exit 6 | func_name="$1 " 7 | 8 | header_file=`grep --include=\*intrin.h -Rl "$func_name" /usr/lib/gcc | head -n1` 9 | [ -z "$header_file" ] && exit 10 | >&2 echo "find in: $header_file" 11 | 12 | target_directive=`grep "#pragma GCC target(\|$func_name" $header_file | grep -B 1 "$func_name" | head -n1` 13 | echo $target_directive | grep -o '"[^,]*[,"]' | sed 's/"//g' | sed 's/,//g' 14 | } 15 | 16 | instruction=`get_instruction $1` 17 | if [ -z "$instruction" ]; then 18 | echo "Error: function not found: $1" 19 | else 20 | echo "add this option to gcc: -m$instruction" 21 | fi 22 | -------------------------------------------------------------------------------- /_cmake/targets/_validation.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cpu._validation 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.validation.cpu._validation") 5 | 
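# This file follows a pattern reused by the other files in _cmake/targets:
# a static helper library (lib_validation_cpp) is built first, then linked
# into both the pybind11 module (_validation) and a standalone C++ test
# executable registered with add_test.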
6 | add_library(lib_validation_cpp STATIC 7 | ../onnx_extended/validation/cpu/murmur_hash3.cpp 8 | ../onnx_extended/validation/cpu/speed_metrics.cpp) 9 | target_compile_definitions(lib_validation_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 10 | target_include_directories(lib_validation_cpp PRIVATE "${ROOT_INCLUDE_PATH}") 11 | set_property(TARGET lib_validation_cpp PROPERTY POSITION_INDEPENDENT_CODE ON) 12 | 13 | local_pybind11_add_module( 14 | _validation OpenMP::OpenMP_CXX 15 | ../onnx_extended/validation/cpu/_validation.cpp 16 | ../onnx_extended/validation/cpu/vector_sparse.cpp) 17 | message(STATUS " LINK _validation <- lib_validation_cpp") 18 | target_include_directories(_validation PRIVATE "${ROOT_INCLUDE_PATH}") 19 | target_link_libraries(_validation PRIVATE lib_validation_cpp common) 20 | 21 | add_executable( 22 | test_validation_cpp 23 | ../_unittests/ut_validation/test_cpu_fpemu.cpp) 24 | target_compile_definitions(test_validation_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 25 | target_include_directories( 26 | test_validation_cpp 27 | PRIVATE 28 | "${ROOT_PROJECT_PATH}" 29 | "${ROOT_INCLUDE_PATH}" 30 | "${ROOT_UNITTEST_PATH}") 31 | message(STATUS " LINK test_validation_cpp <- lib_validation_cpp") 32 | target_link_libraries( 33 | test_validation_cpp 34 | PRIVATE 35 | lib_validation_cpp 36 | common) 37 | add_test(NAME test_validation_cpp COMMAND test_validation_cpp) 38 | -------------------------------------------------------------------------------- /_cmake/targets/_validation_cuda_example_py.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cuda.cuda_example_py 3 | # 4 | if(CUDA_AVAILABLE) 5 | 6 | message(STATUS "+ PYBIND11 CUDA onnx_extended.validation.cuda.cuda_example_py") 7 | 8 | cuda_pybind11_add_module( 9 | cuda_example_py 10 | ../onnx_extended/validation/cuda/cuda_example_py.cpp 11 | ../onnx_extended/validation/cuda/cuda_fpemu.cu 12 | ../onnx_extended/validation/cuda/cuda_tensor.cu 13 | ../onnx_extended/validation/cuda/cuda_gemm.cu) 14 | 15 | target_include_directories(cuda_example_py PRIVATE ${ROOT_INCLUDE_PATH}) 16 | target_link_libraries(cuda_example_py PRIVATE common) 17 | 18 | endif() 19 | -------------------------------------------------------------------------------- /_cmake/targets/_validation_cuda_monitor.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cuda.cuda_monitor 3 | # 4 | if(CUDA_AVAILABLE) 5 | 6 | message(STATUS "+ PYBIND11 CUDA onnx_extended.validation.cuda.cuda_monitor") 7 | 8 | cuda_pybind11_add_module( 9 | cuda_monitor 10 | ../onnx_extended/validation/cuda/cuda_monitor.cpp) 11 | 12 | target_include_directories(cuda_monitor PRIVATE ${ROOT_INCLUDE_PATH}) 13 | target_link_libraries(cuda_monitor PRIVATE common CUDA::nvml) 14 | 15 | endif() 16 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_conv_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_conv_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_conv_") 5 | 6 | local_pybind11_add_module( 7 | c_op_conv_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_conv_.cpp) 9 | eigen_add_dependency(c_op_conv_) 10 | 11 | target_link_libraries(c_op_conv_ PRIVATE common_kernels common) 12 | target_include_directories(c_op_conv_ PRIVATE 
${ROOT_INCLUDE_PATH}) 13 | 14 | add_executable(test_c_op_conv_cpp ../_unittests/ut_reference/test_c_op_conv.cpp) 15 | target_compile_definitions(test_c_op_conv_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 16 | target_link_libraries(test_c_op_conv_cpp PRIVATE common_kernels common) 17 | target_include_directories( 18 | test_c_op_conv_cpp 19 | PRIVATE 20 | ${ROOT_INCLUDE_PATH} 21 | ${ROOT_UNITTEST_PATH}) 22 | 23 | eigen_add_dependency(test_c_op_conv_cpp) 24 | 25 | add_test(NAME test_c_op_conv_cpp COMMAND test_c_op_conv_cpp) 26 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_svm_py_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_svm_py_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_svm_py_") 5 | 6 | local_pybind11_add_module( 7 | c_op_svm_py_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_svm_py_.cpp) 9 | 10 | target_link_libraries(c_op_svm_py_ PRIVATE common_kernels common) 11 | 12 | target_include_directories(c_op_svm_py_ PRIVATE ${ROOT_INCLUDE_PATH}) 13 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_tfidf_vectorizer_py_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_tfidf_vectorizer_py_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_tfidf_vectorizer_py_") 5 | 6 | local_pybind11_add_module( 7 | c_op_tfidf_vectorizer_py_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_tfidf_vectorizer_py_.cpp) 9 | 10 | target_link_libraries(c_op_tfidf_vectorizer_py_ PRIVATE common_kernels common) 11 | 12 | target_include_directories(c_op_tfidf_vectorizer_py_ PRIVATE ${ROOT_INCLUDE_PATH}) 13 | 14 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_tree_ensemble_py_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_tree_ensemble_py_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_tree_ensemble_py_") 5 | 6 | local_pybind11_add_module( 7 | c_op_tree_ensemble_py_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp) 9 | 10 | target_link_libraries(c_op_tree_ensemble_py_ PRIVATE common_kernels common) 11 | 12 | target_include_directories(c_op_tree_ensemble_py_ PRIVATE ${ROOT_INCLUDE_PATH}) 13 | 14 | -------------------------------------------------------------------------------- /_cmake/targets/common.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: common C++ libraries 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.common") 5 | add_library(common STATIC ../onnx_extended/cpp/onnx_extended_helpers.cpp) 6 | target_compile_definitions(common PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 7 | target_include_directories(common PRIVATE "${ROOT_INCLUDE_PATH}") 8 | -------------------------------------------------------------------------------- /_cmake/targets/common_kernels.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: common C++ libraries 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.common_kernels") 5 | add_library( 6 | common_kernels 7 | STATIC 8 | ../onnx_extended/cpp/c_op_allocation.cpp 
9 | ../onnx_extended/cpp/c_op_common_parameters.cpp) 10 | target_compile_definitions(common_kernels PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 11 | target_include_directories(common_kernels PRIVATE "${ROOT_INCLUDE_PATH}") 12 | -------------------------------------------------------------------------------- /_cmake/targets/fp8_cy.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cython.fp8 3 | # 4 | message(STATUS "+ CYTHON onnx_extended.validation.cython.fp8") 5 | 6 | cython_add_module( 7 | fp8 8 | ../onnx_extended/validation/cython/fp8.pyx 9 | OpenMP::OpenMP_CXX) 10 | 11 | target_include_directories(fp8 PRIVATE ${ROOT_INCLUDE_PATH}) 12 | -------------------------------------------------------------------------------- /_cmake/targets/ortinf.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortcy.wrap.ortapi 3 | # 4 | message(STATUS "+ CYTHON onnx_extended.ortcy.wrap.ortapi") 5 | 6 | add_library(lib_ortapi STATIC ../onnx_extended/ortcy/wrap/ortapi.cpp) 7 | target_compile_definitions(lib_ortapi PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 8 | target_include_directories( 9 | lib_ortapi PUBLIC 10 | ${ONNXRUNTIME_INCLUDE_DIR} 11 | ${ROOT_INCLUDE_PATH}) 12 | target_link_libraries(lib_ortapi PRIVATE common) 13 | 14 | set(ORTAPI_INCLUDE_DIR "${ROOT_PROJECT_PATH}/onnx_extended/ortcy/wrap") 15 | 16 | cython_add_module( 17 | ortinf 18 | ../onnx_extended/ortcy/wrap/ortinf.pyx 19 | OpenMP::OpenMP_CXX) 20 | 21 | message(STATUS " LINK ortinf <- lib_ortapi onnxruntime ${ORTAPI_INCLUDE_DIR}") 22 | 23 | ort_add_dependency( 24 | ortinf 25 | onnx_extended/ortcy/wrap) 26 | 27 | # If ONNXRUNTIME_LIB_DIR is used, then it seems a local installation 28 | # does not find the binaries anymore if they are removed. 
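# ort_add_dependency above (defined in _cmake/externals/FindOrt.cmake) is
# presumably what copies the onnxruntime binaries next to the built module
# in onnx_extended/ortcy/wrap so they can be located at runtime.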
29 | target_link_directories(ortinf PRIVATE ${ORTAPI_INCLUDE_DIR}) 30 | 31 | target_link_libraries( 32 | ortinf 33 | PRIVATE 34 | lib_ortapi 35 | onnxruntime 36 | common_kernels) 37 | target_include_directories(ortinf PRIVATE ${ROOT_INCLUDE_PATH}) 38 | 39 | add_executable(test_ortcy_inference_cpp ../_unittests/ut_ortcy/test_inference.cpp) 40 | target_compile_definitions(test_ortcy_inference_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 41 | target_include_directories( 42 | test_ortcy_inference_cpp 43 | PRIVATE 44 | ${ROOT_UNITTEST_PATH} 45 | ${ROOT_PROJECT_PATH} 46 | ${ROOT_INCLUDE_PATH} 47 | ${ORT_DIR}/include) 48 | message(STATUS " LINK test_ortcy_inference_cpp <- lib_ortapi onnxruntime") 49 | target_link_directories(test_ortcy_inference_cpp PRIVATE ${ONNXRUNTIME_LIB_DIR}) 50 | target_link_libraries( 51 | test_ortcy_inference_cpp 52 | PRIVATE 53 | lib_ortapi 54 | onnxruntime 55 | common_kernels) 56 | ort_add_dependency(test_ortcy_inference_cpp "") 57 | add_test(NAME test_ortcy_inference_cpp COMMAND test_ortcy_inference_cpp) 58 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_optim_cpu.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortops.optim.cpu 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.ortops.optim.cpu") 5 | 6 | ort_add_custom_op( 7 | ortops_optim_cpu 8 | "CPU" 9 | onnx_extended/ortops/optim/cpu 10 | ../onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc) 11 | 12 | target_include_directories(ortops_optim_cpu PRIVATE ${ROOT_INCLUDE_PATH}) 13 | 14 | target_include_directories( 15 | ortops_optim_cpu 16 | PRIVATE 17 | "${ROOT_INCLUDE_PATH}" 18 | "${ORTAPI_INCLUDE_DIR}" 19 | "${ORTOPS_INCLUDE_DIR}") 20 | 21 | target_link_libraries( 22 | ortops_optim_cpu 23 | PRIVATE 24 | OpenMP::OpenMP_CXX 25 | common_kernels 26 | common) 27 | 28 | add_executable(test_optops_inference_cpp ../_unittests/ut_ortops/test_inference_tree.cpp) 29 | target_compile_definitions( 30 | test_optops_inference_cpp 31 | PRIVATE 32 | PYTHON_MANYLINUX=${PYTHON_MANYLINUX} 33 | TESTED_CUSTOM_OPS_DLL="$<TARGET_FILE:ortops_optim_cpu>") 34 | target_include_directories( 35 | test_optops_inference_cpp 36 | PRIVATE 37 | ${ROOT_UNITTEST_PATH} 38 | ${ROOT_PROJECT_PATH} 39 | ${ROOT_INCLUDE_PATH} 40 | ${ORT_DIR}/include) 41 | message(STATUS " LINK test_optops_inference_cpp <- lib_ortapi onnxruntime") 42 | target_link_directories(test_optops_inference_cpp PRIVATE ${ONNXRUNTIME_LIB_DIR}) 43 | target_link_libraries( 44 | test_optops_inference_cpp 45 | PRIVATE 46 | lib_ortapi 47 | onnxruntime 48 | common_kernels) 49 | ort_add_dependency(test_optops_inference_cpp "") 50 | add_test(NAME test_optops_inference_cpp COMMAND test_optops_inference_cpp) 51 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_optim_cuda.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortops.optim.cuda 3 | # 4 | 5 | if(CUDA_AVAILABLE) 6 | 7 | message(STATUS "+ KERNEL onnx_extended.ortops.optim.cuda") 8 | 9 | ort_add_custom_op( 10 | ortops_optim_cuda 11 | CUDA 12 | onnx_extended/ortops/optim/cuda 13 | ../onnx_extended/cpp/onnx_extended_helpers.cpp 14 | ../onnx_extended/ortops/optim/cuda/addaddmulmul.cu 15 | ../onnx_extended/ortops/optim/cuda/addaddaddmulmulmul.cu 16 | ../onnx_extended/ortops/optim/cuda/addmul.cu 17 | ../onnx_extended/ortops/optim/cuda/add_or_mul_shared_input.cu 18 | 
../onnx_extended/ortops/optim/cuda/mul_sigmoid.cu 19 | ../onnx_extended/ortops/optim/cuda/mul_mul_sigmoid.cu 20 | ../onnx_extended/ortops/optim/cuda/negxplus1.cu 21 | ../onnx_extended/ortops/optim/cuda/replace_zero.cu 22 | ../onnx_extended/ortops/optim/cuda/rotary.cu 23 | ../onnx_extended/ortops/optim/cuda/scatter_nd_of_shape.cu 24 | ../onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_masked.cu 25 | ../onnx_extended/ortops/optim/cuda/submul.cu 26 | ../onnx_extended/ortops/optim/cuda/transpose_cast_2d.cu 27 | ../onnx_extended/ortops/optim/cuda/tri_matrix.cu 28 | ../onnx_extended/ortops/optim/cuda/ort_optim_cuda_lib.cc) 29 | 30 | # needed to include onnx_extended_helpers.h 31 | target_include_directories( 32 | ortops_optim_cuda 33 | PRIVATE 34 | "${ROOT_INCLUDE_PATH}" 35 | "${ORTAPI_INCLUDE_DIR}" 36 | "${ORTOPS_INCLUDE_DIR}") 37 | 38 | endif() 39 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_tutorial_cpu.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortops.tutorial.cpu 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.ortops.tutorial.cpu") 5 | 6 | ort_add_custom_op( 7 | ortops_tutorial_cpu 8 | "CPU" 9 | onnx_extended/ortops/tutorial/cpu 10 | ../onnx_extended/ortops/tutorial/cpu/custom_gemm.cc 11 | ../onnx_extended/ortops/tutorial/cpu/custom_tree_assembly.cc 12 | ../onnx_extended/ortops/tutorial/cpu/dynamic_quantize_linear.cc 13 | ../onnx_extended/ortops/tutorial/cpu/my_kernel.cc 14 | ../onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc 15 | ../onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc) 16 | 17 | # needed to include onnx_extended_helpers.h 18 | target_include_directories( 19 | ortops_tutorial_cpu 20 | PRIVATE 21 | "${ROOT_INCLUDE_PATH}" 22 | "${ORTAPI_INCLUDE_DIR}" 23 | "${ORTOPS_INCLUDE_DIR}") 24 | 25 | eigen_add_dependency(ortops_tutorial_cpu) 26 | 27 | target_link_libraries( 28 | ortops_tutorial_cpu 29 | PRIVATE 30 | OpenMP::OpenMP_CXX 31 | common_kernels 32 | common) 33 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_tutorial_cuda.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # custom ops: onnx_extended.ortops.tutorial.cuda 3 | # 4 | 5 | if(CUDA_AVAILABLE) 6 | 7 | message(STATUS "+ KERNEL onnx_extended.ortops.tutorial.cuda") 8 | 9 | ort_add_custom_op( 10 | ortops_tutorial_cuda 11 | CUDA 12 | onnx_extended/ortops/tutorial/cuda 13 | ../onnx_extended/cpp/onnx_extended_helpers.cpp 14 | ../onnx_extended/ortops/tutorial/cuda/custom_gemm.cu 15 | ../onnx_extended/ortops/tutorial/cuda/matx_matmul.cu 16 | ../onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc) 17 | 18 | # needed to include onnx_extended_helpers.h 19 | target_include_directories( 20 | ortops_tutorial_cuda 21 | PRIVATE 22 | "${ROOT_INCLUDE_PATH}" 23 | "${ORTAPI_INCLUDE_DIR}" 24 | "${ORTOPS_INCLUDE_DIR}" 25 | "${MATX_INCLUDE_DIR}") 26 | 27 | target_link_libraries(ortops_tutorial_cuda PRIVATE matx::matx) 28 | 29 | endif() 30 | -------------------------------------------------------------------------------- /_cmake/test_constants.h.in: -------------------------------------------------------------------------------- 1 | #define TEST_FOLDER "${TEST_FOLDER}" 2 | -------------------------------------------------------------------------------- /_doc/_static/logo.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/logo.png -------------------------------------------------------------------------------- /_doc/_static/profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/profile.png -------------------------------------------------------------------------------- /_doc/_static/vector_sum6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/vector_sum6.png -------------------------------------------------------------------------------- /_doc/_static/vector_sum6_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/vector_sum6_results.png -------------------------------------------------------------------------------- /_doc/api/check.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | onnx_extended.__init__.py 4 | ========================= 5 | 6 | check_installation 7 | ================== 8 | 9 | .. autofunction:: onnx_extended.check_installation 10 | 11 | compiled_with_cuda 12 | ================== 13 | 14 | .. autofunction:: onnx_extended.compiled_with_cuda 15 | 16 | cuda_version 17 | ============ 18 | 19 | .. autofunction:: onnx_extended.cuda_version 20 | 21 | cuda_version_int 22 | ================ 23 | 24 | .. autofunction:: onnx_extended.cuda_version_int 25 | 26 | get_cxx_flags 27 | ============= 28 | 29 | .. autofunction:: onnx_extended.get_cxx_flags 30 | 31 | get_stdcpp 32 | ========== 33 | 34 | .. autofunction:: onnx_extended.get_stdcpp 35 | 36 | has_cuda 37 | ======== 38 | 39 | .. autofunction:: onnx_extended.has_cuda 40 | -------------------------------------------------------------------------------- /_doc/api/ext_test_case.rst: -------------------------------------------------------------------------------- 1 | 2 | =========================== 3 | onnx_extended.ext_test_case 4 | =========================== 5 | 6 | Various helpers to help develop the package. 7 | 8 | ExtTestCase 9 | =========== 10 | 11 | .. autoclass:: onnx_extended.ext_test_case.ExtTestCase 12 | :members: 13 | 14 | ignore_warnings 15 | =============== 16 | 17 | .. autofunction:: onnx_extended.ext_test_case.ignore_warnings 18 | 19 | measure_time 20 | ============ 21 | 22 | .. autofunction:: onnx_extended.ext_test_case.measure_time 23 | -------------------------------------------------------------------------------- /_doc/api/helper.rst: -------------------------------------------------------------------------------- 1 | 2 | ==================== 3 | onnx_extended.helper 4 | ==================== 5 | 6 | make_dynamic_quantize_linear_function_proto 7 | =========================================== 8 | 9 | .. autofunction:: onnx_extended.helper.make_dynamic_quantize_linear_function_proto 10 | 11 | make_simple_dynamic_quantize_linear_function_proto 12 | ================================================== 13 | 14 | .. 
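The ``measure_time`` function documented in ``ext_test_case`` above can be
used as follows. This is a minimal sketch; the keyword arguments ``repeat``
and ``number`` are assumptions about the signature:

.. code-block:: python

    import numpy
    from onnx_extended.ext_test_case import measure_time

    m = numpy.random.rand(64, 64)
    # runs the callable several times and returns a dictionary of
    # statistics (average, deviation, ...); repeat/number are assumed kwargs
    stat = measure_time(lambda: m @ m, repeat=10, number=50)
    print(stat)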
autofunction:: onnx_extended.helper.make_simple_dynamic_quantize_linear_function_proto 15 | -------------------------------------------------------------------------------- /_doc/api/index.rst: -------------------------------------------------------------------------------- 1 | 2 | === 3 | API 4 | === 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | 9 | check 10 | ext_test_case 11 | memory_peak 12 | helper 13 | ortcy 14 | ortops 15 | plotting 16 | reference 17 | validation 18 | tools 19 | -------------------------------------------------------------------------------- /_doc/api/memory_peak.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | onnx_extended.memory_peak 4 | ========================= 5 | 6 | get_memory_rss 7 | ============== 8 | 9 | .. autofunction:: onnx_extended.memory_peak.get_memory_rss 10 | 11 | MemorySpy 12 | ========= 13 | 14 | .. autoclass:: onnx_extended.memory_peak.MemorySpy 15 | :members: 16 | 17 | start_spying_on 18 | =============== 19 | 20 | .. autofunction:: onnx_extended.memory_peak.start_spying_on 21 | -------------------------------------------------------------------------------- /_doc/api/ortcy.rst: -------------------------------------------------------------------------------- 1 | 2 | =================== 3 | onnx_extended.ortcy 4 | =================== 5 | 6 | It supports any onnxruntime C API greater than version: 7 | 8 | .. runpython:: 9 | :showcode: 10 | 11 | from onnx_extended.ortcy.wrap.ortinf import get_ort_c_api_supported_version 12 | 13 | print(get_ort_c_api_supported_version()) 14 | 15 | get_ort_c_api_supported_version 16 | +++++++++++++++++++++++++++++++ 17 | 18 | .. autofunction:: onnx_extended.ortcy.wrap.ortinf.get_ort_c_api_supported_version 19 | 20 | ort_get_available_providers 21 | =========================== 22 | 23 | .. autofunction:: onnx_extended.ortcy.wrap.ortinf.ort_get_available_providers 24 | 25 | OrtSession 26 | ========== 27 | 28 | .. autoclass:: onnx_extended.ortcy.wrap.ortinf.OrtSession 29 | :members: 30 | -------------------------------------------------------------------------------- /_doc/api/ortops.rst: -------------------------------------------------------------------------------- 1 | 2 | ==================== 3 | onnx_extended.ortops 4 | ==================== 5 | 6 | It supports any onnxruntime C API greater than version: 7 | 8 | .. runpython:: 9 | :showcode: 10 | 11 | from onnx_extended.ortcy.wrap.ortinf import get_ort_c_api_supported_version 12 | 13 | print(get_ort_c_api_supported_version()) 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | 18 | ortops_tutorial_cpu 19 | ortops_tutorial_cuda 20 | ortops_optim_cpu 21 | ortops_optim_cuda 22 | -------------------------------------------------------------------------------- /_doc/api/ortops_optim_cpu.rst: -------------------------------------------------------------------------------- 1 | 2 | ============================== 3 | onnx_extended.ortops.optim.cpu 4 | ============================== 5 | 6 | change_onnx_operator_domain 7 | =========================== 8 | 9 | .. autofunction:: onnx_extended.ortops.optim.optimize.change_onnx_operator_domain 10 | 11 | get_ort_ext_libs 12 | ================ 13 | 14 | .. autofunction:: onnx_extended.ortops.optim.cpu.get_ort_ext_libs 15 | 16 | **List of implemented kernels** 17 | 18 | .. 
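A minimal sketch showing how the library returned by ``get_ort_ext_libs``
can be registered with onnxruntime ("model.onnx" is a placeholder for a
model using one of the kernels listed just below):

.. code-block:: python

    from onnxruntime import InferenceSession, SessionOptions
    from onnx_extended.ortops.optim.cpu import get_ort_ext_libs

    opts = SessionOptions()
    # get_ort_ext_libs() returns the paths of the compiled shared libraries
    opts.register_custom_ops_library(get_ort_ext_libs()[0])
    sess = InferenceSession("model.onnx", opts, providers=["CPUExecutionProvider"])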
runpython:: 19 | :showcode: 20 | :rst: 21 | 22 | from onnx_extended.ortops.optim.cpu import documentation 23 | print("\n".join(documentation())) 24 | 25 | optimize_model 26 | ============== 27 | 28 | .. autofunction:: onnx_extended.ortops.optim.optimize.optimize_model 29 | -------------------------------------------------------------------------------- /_doc/api/ortops_optim_cuda.rst: -------------------------------------------------------------------------------- 1 | 2 | =============================== 3 | onnx_extended.ortops.optim.cuda 4 | =============================== 5 | 6 | get_ort_ext_libs 7 | ================ 8 | 9 | .. autofunction:: onnx_extended.ortops.optim.cuda.get_ort_ext_libs 10 | 11 | **List of implemented kernels** 12 | 13 | .. runpython:: 14 | :showcode: 15 | :rst: 16 | 17 | from onnx_extended.ortops.optim.cuda import documentation 18 | print("\n".join(documentation())) 19 | -------------------------------------------------------------------------------- /_doc/api/ortops_tutorial_cpu.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================= 3 | onnx_extended.ortops.tutorial.cpu 4 | ================================= 5 | 6 | get_ort_ext_libs 7 | ================ 8 | 9 | .. autofunction:: onnx_extended.ortops.tutorial.cpu.get_ort_ext_libs 10 | 11 | **List of implemented kernels** 12 | 13 | .. runpython:: 14 | :showcode: 15 | :rst: 16 | 17 | from onnx_extended.ortops.tutorial.cpu import documentation 18 | print("\n".join(documentation())) 19 | -------------------------------------------------------------------------------- /_doc/api/ortops_tutorial_cuda.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================== 3 | onnx_extended.ortops.tutorial.cuda 4 | ================================== 5 | 6 | get_ort_ext_libs 7 | ================ 8 | 9 | .. autofunction:: onnx_extended.ortops.tutorial.cuda.get_ort_ext_libs 10 | 11 | **List of implemented kernels** 12 | 13 | .. runpython:: 14 | :showcode: 15 | :rst: 16 | 17 | from onnx_extended.ortops.tutorial.cuda import documentation 18 | print("\n".join(documentation())) 19 | -------------------------------------------------------------------------------- /_doc/api/plotting.rst: -------------------------------------------------------------------------------- 1 | 2 | ====================== 3 | onnx_extended.plotting 4 | ====================== 5 | 6 | onnx_extended.plotting.benchmark.hhistograms 7 | ============================================ 8 | 9 | .. autofunction:: onnx_extended.plotting.benchmark.hhistograms 10 | 11 | onnx_extended.plotting.benchmark.vhistograms 12 | ============================================ 13 | 14 | .. autofunction:: onnx_extended.plotting.benchmark.vhistograms 15 | -------------------------------------------------------------------------------- /_doc/api/reference.rst: -------------------------------------------------------------------------------- 1 | 2 | ======================= 3 | onnx_extended.reference 4 | ======================= 5 | 6 | CReferenceEvaluator 7 | =================== 8 | 9 | .. autoclass:: onnx_extended.reference.CReferenceEvaluator 10 | :members: input_names, output_names, opsets, run 11 | 12 | Backend 13 | ======= 14 | 15 | .. autofunction:: onnx_extended.reference.c_reference_backend.create_reference_backend 16 | 17 | .. autoclass:: onnx_extended.reference.c_reference_backend.CReferenceEvaluatorBackend 18 | :members: 19 | 20 | .. 
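``CReferenceEvaluator`` above mirrors the API of
``onnx.reference.ReferenceEvaluator``. A minimal sketch, assuming ``onx`` is
a ModelProto with a single float input named "X":

.. code-block:: python

    import numpy
    from onnx_extended.reference import CReferenceEvaluator

    ref = CReferenceEvaluator(onx)
    x = numpy.random.rand(2, 2).astype(numpy.float32)
    # run returns the list of computed outputs
    got = ref.run(None, {"X": x})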
autoclass:: onnx_extended.reference.c_reference_backend.CReferenceEvaluatorBackendRep 21 | :members: 22 | 23 | .. autoclass:: onnx_extended.reference.c_reference_backend.Runner 24 | :members: 25 | 26 | Tools 27 | ===== 28 | 29 | .. autofunction:: onnx_extended.reference.from_array_extended 30 | 31 | .. autofunction:: onnx_extended.reference.to_array_extended 32 | 33 | Operators 34 | ========= 35 | 36 | ai.onnx 37 | +++++++ 38 | 39 | .. autoclass:: onnx_extended.reference.c_ops.c_op_conv.Conv 40 | 41 | ai.onnx.ml 42 | ++++++++++ 43 | 44 | .. autoclass:: onnx_extended.reference.c_ops.c_op_svm_classifier.SVMClassifier 45 | 46 | .. autoclass:: onnx_extended.reference.c_ops.c_op_svm_regressor.SVMRegressor 47 | 48 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tfidf_vectorizer.TfIdfVectorizer 49 | 50 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier.TreeEnsembleClassifier_1 51 | 52 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier.TreeEnsembleClassifier_3 53 | 54 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor.TreeEnsembleRegressor_1 55 | 56 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor.TreeEnsembleRegressor_3 57 | -------------------------------------------------------------------------------- /_doc/api/tools.rst: -------------------------------------------------------------------------------- 1 | 2 | ===== 3 | tools 4 | ===== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | tools_io 10 | tools_einsum 11 | tools_graph 12 | tools_graph_transformer 13 | tools_inline 14 | tools_nodes 15 | tools_stats 16 | tools_other 17 | -------------------------------------------------------------------------------- /_doc/api/tools_einsum.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | onnx_extended.tools.einsum 3 | ========================== 4 | 5 | Decomposition of Einsum into simple operations. 6 | 7 | analyse_einsum_equation 8 | ======================= 9 | 10 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl.analyse_einsum_equation 11 | 12 | apply_einsum_sequence 13 | ===================== 14 | 15 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl.apply_einsum_sequence 16 | 17 | CachedEinsum 18 | ============ 19 | 20 | .. autoclass:: onnx_extended.tools.einsum.einsum_fct.CachedEinsum 21 | :members: 22 | 23 | compute_transposition_features 24 | ============================== 25 | 26 | .. autofunction:: onnx_extended.tools.einsum.einsum_ml.compute_transposition_features 27 | 28 | decompose_einsum_equation 29 | ========================= 30 | 31 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl.decompose_einsum_equation 32 | 33 | einsum 34 | ====== 35 | 36 | .. autofunction:: onnx_extended.tools.einsum.einsum_fct.einsum 37 | 38 | einsum_benchmark 39 | ================ 40 | 41 | .. autofunction:: onnx_extended.tools.einsum.einsum_bench.einsum_benchmark 42 | 43 | numpy_extended_dot 44 | ================== 45 | 46 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl_ext.numpy_extended_dot 47 | 48 | numpy_extended_dot_matrix 49 | ========================= 50 | 51 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl_ext.numpy_extended_dot_matrix 52 | 53 | numpy_extended_dot_python 54 | ========================= 55 | 56 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl_ext.numpy_extended_dot_python 57 | 58 | EinsumSubOp 59 | =========== 60 | 61 | .. 
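A minimal sketch of the decomposition functions documented above, assuming
they accept the equation and the inputs in this way:

.. code-block:: python

    import numpy
    from onnx_extended.tools.einsum.einsum_impl import (
        apply_einsum_sequence,
        decompose_einsum_equation,
    )

    m1 = numpy.arange(6).reshape((2, 3)).astype(numpy.float32)
    m2 = numpy.arange(12).reshape((3, 4)).astype(numpy.float32)
    # builds a sequence of simple operations equivalent to the einsum equation
    seq = decompose_einsum_equation("ij,jk->ik")
    # applies it to concrete matrices
    res = apply_einsum_sequence(seq, m1, m2)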
autoclass:: onnx_extended.tools.einsum.einsum_impl_classes.EinsumSubOp 62 | :members: 63 | 64 | GraphEinsumSubOp 65 | ================ 66 | 67 | .. autoclass:: onnx_extended.tools.einsum.einsum_impl_classes.GraphEinsumSubOp 68 | :members: 69 | 70 | OnnxMicroRuntime 71 | ================ 72 | 73 | .. autoclass:: onnx_extended.tools.einsum.einsum_fct.OnnxMicroRuntime 74 | :members: 75 | 76 | optimize_decompose_einsum_equation 77 | ================================== 78 | 79 | .. autofunction:: onnx_extended.tools.einsum.einsum_fct.optimize_decompose_einsum_equation 80 | 81 | predict_transposition_cost 82 | ========================== 83 | 84 | .. autofunction:: onnx_extended.tools.einsum.einsum_ml.predict_transposition_cost 85 | 86 | -------------------------------------------------------------------------------- /_doc/api/tools_graph.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | onnx_extended.tools.graph 4 | ========================= 5 | 6 | NodeKind 7 | ======== 8 | 9 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.NodeKind 10 | 11 | Node 12 | ==== 13 | 14 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.Node 15 | :members: 16 | 17 | NodeWithSubGraph 18 | ================ 19 | 20 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.NodeWithSubGraph 21 | :members: 22 | 23 | NodeSet 24 | ======= 25 | 26 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.NodeSet 27 | :members: 28 | 29 | Graph 30 | ===== 31 | 32 | .. autoclass:: onnx_extended.tools.graph.Graph 33 | :members: 34 | -------------------------------------------------------------------------------- /_doc/api/tools_graph_transformer.rst: -------------------------------------------------------------------------------- 1 | ================================================ 2 | onnx_extended.tools.graph.onnx_graph_transformer 3 | ================================================ 4 | 5 | cast_constant 6 | ============= 7 | 8 | .. autofunction:: onnx_extended.tools.graph.cast_constant 9 | 10 | QuantizeOptions 11 | =============== 12 | 13 | .. autoclass:: onnx_extended.tools.graph.QuantizeOptions 14 | :members: 15 | 16 | quantize_float8 17 | =============== 18 | 19 | .. autofunction:: onnx_extended.tools.graph.quantize_float8 20 | 21 | TransformResults 22 | ================ 23 | 24 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_transformer.TransformResults 25 | :members: 26 | 27 | QuantizationError 28 | ================= 29 | 30 | .. autoclass:: onnx_extended.tools.graph.QuantizationError 31 | -------------------------------------------------------------------------------- /_doc/api/tools_inline.rst: -------------------------------------------------------------------------------- 1 | 2 | =============================== 3 | onnx_extended.tools.onnx_inline 4 | =============================== 5 | 6 | onnx_inline_function 7 | ==================== 8 | 9 | .. autofunction:: onnx_extended.tools.onnx_inline.onnx_inline_function 10 | -------------------------------------------------------------------------------- /_doc/api/tools_io.rst: -------------------------------------------------------------------------------- 1 | 2 | =========================== 3 | onnx_extended.tools.onnx_io 4 | =========================== 5 | 6 | enumerate_model_tensors 7 | ======================= 8 | 9 | .. autofunction:: onnx_extended.tools.enumerate_model_tensors 10 | 11 | load_external 12 | ============= 13 | 14 | .. 
autofunction:: onnx_extended.tools.load_external 15 | 16 | load_model 17 | ========== 18 | 19 | .. autofunction:: onnx_extended.tools.load_model 20 | 21 | onnx2string 22 | =========== 23 | 24 | .. autofunction:: onnx_extended.tools.onnx_io.onnx2string 25 | 26 | save_model 27 | ========== 28 | 29 | .. autofunction:: onnx_extended.tools.save_model 30 | 31 | string2onnx 32 | =========== 33 | 34 | .. autofunction:: onnx_extended.tools.onnx_io.string2onnx 35 | 36 | -------------------------------------------------------------------------------- /_doc/api/tools_nodes.rst: -------------------------------------------------------------------------------- 1 | 2 | ============================== 3 | onnx_extended.tools.onnx_nodes 4 | ============================== 5 | 6 | convert_onnx_model 7 | ================== 8 | 9 | .. autofunction:: onnx_extended.tools.onnx_nodes.convert_onnx_model 10 | 11 | enumerate_onnx_node_types 12 | ========================= 13 | 14 | .. autofunction:: onnx_extended.tools.onnx_nodes.enumerate_onnx_node_types 15 | 16 | get_hidden_inputs 17 | ================= 18 | 19 | .. autofunction:: onnx_extended.tools.onnx_nodes.get_hidden_inputs 20 | 21 | multiply_tree 22 | ============= 23 | 24 | .. autofunction:: onnx_extended.tools.onnx_nodes.multiply_tree 25 | 26 | onnx_merge_models 27 | ================= 28 | 29 | .. autofunction:: onnx_extended.tools.onnx_nodes.onnx_merge_models 30 | 31 | onnx_remove_node_unused 32 | ======================= 33 | 34 | .. autofunction:: onnx_extended.tools.onnx_nodes.onnx_remove_node_unused 35 | 36 | select_model_inputs_outputs 37 | =========================== 38 | 39 | .. autofunction:: onnx_extended.tools.onnx_nodes.select_model_inputs_outputs 40 | -------------------------------------------------------------------------------- /_doc/api/tools_other.rst: -------------------------------------------------------------------------------- 1 | 2 | =================== 3 | onnx_extended.tools 4 | =================== 5 | 6 | onnx_extended.tools.ort_debug 7 | ============================= 8 | 9 | enumerate_ort_run 10 | +++++++++++++++++ 11 | 12 | .. autofunction:: onnx_extended.tools.ort_debug.enumerate_ort_run 13 | 14 | onnx_extended.tools.js_profile 15 | ============================== 16 | 17 | js_profile_to_dataframe 18 | +++++++++++++++++++++++ 19 | 20 | .. autofunction:: onnx_extended.tools.js_profile.js_profile_to_dataframe 21 | 22 | plot_ort_profile 23 | ++++++++++++++++ 24 | 25 | .. autofunction:: onnx_extended.tools.js_profile.plot_ort_profile 26 | 27 | plot_ort_profile_timeline 28 | +++++++++++++++++++++++++ 29 | 30 | .. autofunction:: onnx_extended.tools.js_profile.plot_ort_profile_timeline 31 | 32 | onnx_extended.tools.run_onnx 33 | ============================ 34 | 35 | save_for_benchmark_or_test 36 | ++++++++++++++++++++++++++ 37 | 38 | .. autofunction:: onnx_extended.tools.run_onnx.save_for_benchmark_or_test 39 | 40 | bench_virtual 41 | +++++++++++++ 42 | 43 | .. autofunction:: onnx_extended.tools.run_onnx.bench_virtual 44 | 45 | TestRun 46 | +++++++ 47 | 48 | .. autoclass:: onnx_extended.tools.run_onnx.TestRun 49 | :members: 50 | -------------------------------------------------------------------------------- /_doc/api/tools_stats.rst: -------------------------------------------------------------------------------- 1 | 2 | =============================== 3 | onnx_extended.tools.stats_nodes 4 | =============================== 5 | 6 | enumerate_nodes 7 | =============== 8 | 9 | .. 
autofunction:: onnx_extended.tools.stats_nodes.enumerate_nodes 10 | 11 | enumerate_stats_nodes 12 | ===================== 13 | 14 | .. autofunction:: onnx_extended.tools.stats_nodes.enumerate_stats_nodes 15 | 16 | HistStatistics 17 | ============== 18 | 19 | .. autoclass:: onnx_extended.tools.stats_nodes.HistStatistics 20 | :members: 21 | 22 | HistTreeStatistics 23 | ================== 24 | 25 | .. autoclass:: onnx_extended.tools.stats_nodes.HistTreeStatistics 26 | :members: 27 | 28 | NodeStatistics 29 | ============== 30 | 31 | .. autoclass:: onnx_extended.tools.stats_nodes.NodeStatistics 32 | :members: 33 | 34 | stats_tree_ensemble 35 | =================== 36 | 37 | .. autofunction:: onnx_extended.tools.stats_nodes.stats_tree_ensemble 38 | 39 | TreeStatistics 40 | ============== 41 | 42 | .. autoclass:: onnx_extended.tools.stats_nodes.TreeStatistics 43 | :members: 44 | -------------------------------------------------------------------------------- /_doc/api/validation.rst: -------------------------------------------------------------------------------- 1 | 2 | ========== 3 | validation 4 | ========== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | validation_cpu 10 | validation_cuda 11 | validation_sparse 12 | validation_trees 13 | -------------------------------------------------------------------------------- /_doc/api/validation_cpu.rst: -------------------------------------------------------------------------------- 1 | 2 | ============== 3 | validation.cpu 4 | ============== 5 | 6 | 7 | C API 8 | ===== 9 | 10 | _validation 11 | +++++++++++ 12 | 13 | .. autoclass:: onnx_extended.validation.cpu._validation.ElementTime 14 | 15 | .. autofunction:: onnx_extended.validation.cpu._validation.benchmark_cache 16 | 17 | .. autofunction:: onnx_extended.validation.cpu._validation.benchmark_cache_tree 18 | 19 | .. autofunction:: onnx_extended.validation.cpu._validation.double2float_rn 20 | 21 | .. autofunction:: onnx_extended.validation.cpu._validation.murmurhash3_bytes_s32 22 | 23 | .. autofunction:: onnx_extended.validation.cpu._validation.float2half_rn 24 | 25 | .. autofunction:: onnx_extended.validation.cpu._validation.half2float 26 | 27 | .. autofunction:: onnx_extended.validation.cpu._validation.has_sse3 28 | -------------------------------------------------------------------------------- /_doc/api/validation_cuda.rst: -------------------------------------------------------------------------------- 1 | 2 | =============== 3 | validation.cuda 4 | =============== 5 | 6 | C API 7 | ===== 8 | 9 | cuda_example_py 10 | +++++++++++++++ 11 | 12 | .. runpython:: 13 | :rst: 14 | 15 | from onnx_extended import has_cuda 16 | 17 | if not has_cuda(): 18 | print( 19 | "The documentation was not compiled with CUDA enabled " 20 | "and cannot expose the CUDA functions." 21 | ) 22 | 23 | names = [ 24 | "cuda_device_count", 25 | "cuda_device_memory", 26 | "cuda_devices_memory", 27 | "cuda_version", 28 | "gemm_benchmark_test", 29 | "FpemuMode", 30 | "fpemu_cuda_forward", 31 | ] 32 | names.sort() 33 | classes = {"FpemuMode"} 34 | noindex = {"gemm_benchmark_test"} 35 | 36 | prefix = "onnx_extended.validation.cuda.cuda_example_py." 37 | if has_cuda(): 38 | fct_template = f".. autofunction:: {prefix}%s" 39 | fct_template_no = f".. autofunction:: {prefix}%s\n :noindex:" 40 | cls_template = f".. 
autoclass:: {prefix}%s\n    :members:"
41 |     else:
42 |         fct_template = f"Unable to document function `{prefix}%s`"
43 |         fct_template_no = fct_template
44 |         cls_template = f"Unable to document class `{prefix}%s`"
45 | 
46 |     for name in names:
47 |         tpl = cls_template if name in classes else (
48 |             fct_template_no if name in noindex else fct_template
49 |         )
50 |         print(tpl % name)
51 |         print()
52 | 
53 | cuda_monitor
54 | ++++++++++++
55 | 
56 | .. runpython::
57 |     :rst:
58 | 
59 |     from onnx_extended import has_cuda
60 | 
61 |     if not has_cuda():
62 |         print(
63 |             "The documentation was not compiled with CUDA enabled "
64 |             "and cannot expose the CUDA functions."
65 |         )
66 | 
67 |     names = [
68 |         "cuda_version",
69 |         "nvml_device_get_count",
70 |         "nvml_device_get_memory_info",
71 |         "nvml_init",
72 |         "nvml_shutdown",
73 |     ]
74 |     names.sort()
75 |     noindex = set()
76 |     classes = set()
77 | 
78 |     prefix = "onnx_extended.validation.cuda.cuda_monitor."
79 |     if has_cuda():
80 |         fct_template = f".. autofunction:: {prefix}%s"
81 |         fct_template_no = f".. autofunction:: {prefix}%s\n    :noindex:"
82 |         cls_template = f".. autoclass:: {prefix}%s\n    :members:"
83 |     else:
84 |         fct_template = f"Unable to document function `{prefix}%s`"
85 |         fct_template_no = fct_template
86 |         cls_template = f"Unable to document class `{prefix}%s`"
87 | 
88 |     for name in names:
89 |         tpl = cls_template if name in classes else (
90 |             fct_template_no if name in noindex else fct_template
91 |         )
92 |         print(tpl % name)
93 |         print()
94 | 
--------------------------------------------------------------------------------
/_doc/api/validation_sparse.rst:
--------------------------------------------------------------------------------
1 | 
2 | =================
3 | validation.sparse
4 | =================
5 | 
6 | Design
7 | ======
8 | 
9 | The sparse format defined here is a structure storing indices and values (float)
10 | in a single float array. The beginning of the structure
11 | stores the shape (1D to 5D), the element type and the number of stored
12 | elements. The functions below are used to convert
13 | from dense to sparse and back.
14 | 
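A minimal round trip looks like the following sketch. It assumes
``dense_to_sparse_struct`` takes a 2D float32 array and returns the flat
float array described above, and that ``sparse_struct_to_dense`` converts
it back; check the signatures below for the exact arguments.

.. code-block:: python

    import numpy as np
    from onnx_extended.validation.cpu._validation import (
        dense_to_sparse_struct,
        sparse_struct_to_dense,
    )

    # a mostly empty dense matrix
    dense = np.zeros((4, 8), dtype=np.float32)
    dense[0, 1] = 1.0
    dense[3, 5] = -2.5

    sparse = dense_to_sparse_struct(dense)  # flat float32 buffer
    back = sparse_struct_to_dense(sparse)   # dense array again
    assert np.allclose(dense, back)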
15 | Functions
16 | =========
17 | 
18 | onnx_extended.validation.cpu._validation.dense_to_sparse_struct
19 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
20 | 
21 | .. autofunction:: onnx_extended.validation.cpu._validation.dense_to_sparse_struct
22 | 
23 | onnx_extended.validation.cpu._validation.evaluate_sparse
24 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
25 | 
26 | .. autofunction:: onnx_extended.validation.cpu._validation.evaluate_sparse
27 | 
28 | onnx_extended.validation.cpu._validation.sparse_struct_indices_values
29 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
30 | 
31 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_indices_values
32 | 
33 | onnx_extended.validation.cpu._validation.sparse_struct_to_dense
34 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
35 | 
36 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_to_dense
37 | 
38 | onnx_extended.validation.cpu._validation.sparse_struct_to_csr
39 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
40 | 
41 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_to_csr
42 | 
43 | onnx_extended.validation.cpu._validation.sparse_struct_to_maps
44 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
45 | 
46 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_to_maps
47 | 
--------------------------------------------------------------------------------
/_doc/api/validation_trees.rst:
--------------------------------------------------------------------------------
1 | 
2 | ======================
3 | validation.bench_trees
4 | ======================
5 | 
6 | onnx_extended.validation.bench_trees.bench_trees
7 | ================================================
8 | 
9 | .. autofunction:: onnx_extended.validation.bench_trees.bench_trees
10 | 
11 | onnx_extended.validation.bench_trees.create_decision_tree
12 | =========================================================
13 | 
14 | .. autofunction:: onnx_extended.validation.bench_trees.create_decision_tree
15 | 
16 | onnx_extended.validation.bench_trees.create_engine
17 | ==================================================
18 | 
19 | .. autofunction:: onnx_extended.validation.bench_trees.create_engine
20 | 
21 | onnx_extended.validation.bench_trees.Engine
22 | ===========================================
23 | 
24 | .. autoclass:: onnx_extended.validation.bench_trees.Engine
25 |     :members:
26 | 
27 | onnx_extended.validation.bench_trees.EngineCython
28 | =================================================
29 | 
30 | .. autoclass:: onnx_extended.validation.bench_trees.EngineCython
31 |     :members:
--------------------------------------------------------------------------------
/_doc/examples/README.txt:
--------------------------------------------------------------------------------
1 | .. _l-example-gallery:
2 | 
3 | Examples Gallery
4 | ================
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/_doc/license.rst:
--------------------------------------------------------------------------------
1 | LICENSE
2 | =======
3 | 
4 | .. literalinclude:: LICENSE.txt
5 |     :language: none
--------------------------------------------------------------------------------
/_doc/tech/2023-09-05-glibc.rst:
--------------------------------------------------------------------------------
1 | 2023-09-05 - version GLIBCXX_3.4.30 not found
2 | =============================================
3 | 
4 | Some weird issue occurred when importing :epkg:`onnxruntime` after importing :epkg:`pandas`.
5 | 
6 | ::
7 | 
8 |     Python 3.11.4 (main, Jul 5 2023, 13:45:01) [GCC 11.2.0] on linux
9 |     Type "help", "copyright", "credits" or "license" for more information.
10 |     >>> import pandas
11 |     >>> import onnxruntime
12 |     Traceback (most recent call last):
13 |       File "<stdin>", line 1, in <module>
14 |       File ".../github/onnxruntime/build/linux_cuda/Release/onnxruntime/__init__.py", line 56, in <module>
15 |         raise import_capi_exception
16 |       File ".../github/onnxruntime/build/linux_cuda/Release/onnxruntime/__init__.py", line 23, in <module>
17 |         from onnxruntime.capi._pybind_state import ExecutionMode  # noqa: F401
18 |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
19 |       File ".../github/onnxruntime/build/linux_cuda/Release/onnxruntime/capi/_pybind_state.py", line 32, in <module>
20 |         from .onnxruntime_pybind11_state import *  # noqa
21 |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
22 |     ImportError: .../miniconda3/lib/python3.11/site-packages/numexpr/../../../libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by onnxruntime/build/linux_cuda/Release/onnxruntime/capi/onnxruntime_pybind11_state.so)
23 |     >>>
24 | 
25 | But the reverse works:
26 | 
27 | ::
28 | 
29 |     import onnxruntime
30 |     import pandas
31 | 
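Importing :epkg:`onnxruntime` before :epkg:`pandas` (or anything pulling in
numexpr) is the simplest fix. Another possible workaround, assuming the
issue comes from the older ``libstdc++.so.6`` shipped with the conda
environment, is to preload the system one (the path may differ):

::

    LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 python script.py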
--------------------------------------------------------------------------------
/_doc/tech/gemm.rst:
--------------------------------------------------------------------------------
1 | Gemm and storage order
2 | ======================
3 | 
4 | **Gemm**
5 | means general matrix multiplication. It is a common routine in linear algebra.
6 | 
7 | 
8 | .. math::
9 | 
10 |     Gemm(A, B, C, tA, tB, \alpha, \beta) = \alpha A^{tA} B^{tB} + \beta C
11 | 
12 | Where :math:`A^{tA}` means *A* if *tA* is 0 and :math:`A'` if *tA* is 1.
13 | The coefficients of a matrix are stored in memory in a one dimension
14 | array *T*: :math:`A(i,j) = T[i * C + j]` where *C* is the number of columns
15 | of matrix A. In that case, the storage is said to be *row major*. In case
16 | :math:`A(i,j) = T[j * R + i]` where *R* is the number of rows,
17 | the storage is *column major*.
18 | 
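For example, with a :math:`2 \times 3` matrix (:math:`R = 2` rows,
:math:`C = 3` columns), the coefficient :math:`A(0, 2)` is stored at
:math:`T[0 * 3 + 2] = T[2]` in row major order but at
:math:`T[2 * 2 + 0] = T[4]` in column major order.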
19 | We define a matrix *A* by :math:`(I, J, M, R)`: it has *I*
20 | rows, *J* columns, the memory buffer is *M* and the matrix order is
21 | *R*. In that case, we can express the transpose of this matrix by:
22 | If :math:`A=(I,J,M,R)`, then :math:`A' = (J,I,M,C)`.
23 | 
24 | Let's use that notation for :math:`A=(I,J,M_A,R)`, :math:`B=(J,K,M_B,R)`
25 | and :math:`C=(I,K,M_C,R)`. We note :math:`D = A^{tA} B^{tB} = (I, K, M_D, R)`.
26 | 
27 | .. math::
28 | 
29 |     \begin{array}{rcl}
30 |     \alpha A^{tA} B^{tB} + \beta C &=& \alpha (I,J,M_A,R)^{tA} (J,K,M_B,R)^{tB} + \beta (I,K,M_C,R) \\
31 |     &=& \left( \alpha (I,J,M_A,R)^{tA} (J,K,M_B,R)^{tB} + \beta (I,K,M_C,R) \right)'' \\
32 |     &=& \left( \alpha (J,K,M_B,R)^{1-tB} (I,J,M_A,R)^{1-tA} + \beta (I,K,M_C,R)' \right)' \\
33 |     &=& \left( \alpha (K,J,M_B,C)^{tB} (J,I,M_A,C)^{tA} + \beta (K,I,M_C,C) \right)' (*)\\
34 |     &=& \left( (K,I,M_D,C) + \beta (K,I,M_C,C) \right)' \\
35 |     &=& (I,K,M_D,R) + \beta (I,K,M_C,R)
36 |     \end{array}
37 | 
38 | This trick can be used to run the computation of matrices using
39 | a column major algorithm instead of a row major algorithm
40 | by using line `(*)` as a replacement.
41 | 
42 | .. math::
43 | 
44 |     \begin{array}{rcl}
45 |     &&\alpha (I,J,M_A,R)^{tA} (J,K,M_B,R)^{tB} + \beta (I,K,M_C,R) \\
46 |     &=& \left( \alpha (K,J,M_B,C)^{tB} (J,I,M_A,C)^{tA} + \beta (K,I,M_C,C) \right)'\\
47 |     &=& \alpha (J,I,M_A,C)^{1-tA}(K,J,M_B,C)^{1-tB} + \beta (K,I,M_C,C)'
48 |     \end{array}
--------------------------------------------------------------------------------
/_doc/tech/index.rst:
--------------------------------------------------------------------------------
1 | Technical Details
2 | =================
3 | 
4 | .. toctree::
5 |     :maxdepth: 1
6 |     :caption: Maths
7 | 
8 |     install_cuda_wsl
9 |     usefulcmd
10 |     gemm
11 | 
12 | .. toctree::
13 |     :maxdepth: 1
14 |     :caption: Issues
15 | 
16 |     2023-09-05-glibc
--------------------------------------------------------------------------------
/_doc/tech/usefulcmd.rst:
--------------------------------------------------------------------------------
1 | Useful commands on Linux
2 | ========================
3 | 
4 | Git
5 | +++
6 | 
7 | * clone: `git clone <repository_url>`
8 | * create a new branch: `git checkout -b <branch_name>`
9 | * add a remote repository: `git remote add <name> <url>`
10 | * merge modifications: `git pull <remote> <branch>`
11 | * add modified files: `git add <file>`
12 | * commit added files: `git commit -m "commit message"`
13 | * push modifications to the remote repository: `git push`
14 | * remove all current modifications: `git reset --hard`
15 | * show modified files: `git status`
16 | 
17 | Retrieve information about the CPU
18 | ++++++++++++++++++++++++++++++++++
19 | 
20 | ::
21 | 
22 |     cat /proc/cpuinfo
23 |     lscpu
24 | 
25 | Retrieve information about the GPU
26 | ++++++++++++++++++++++++++++++++++
27 | 
28 | ::
29 | 
30 |     nvidia-smi
31 | 
32 | Dependencies of a shared library
33 | ++++++++++++++++++++++++++++++++
34 | 
35 | ::
36 | 
37 |     ldd <shared_library>
--------------------------------------------------------------------------------
/_doc/tutorial/build.rst:
--------------------------------------------------------------------------------
1 | 
2 | Build from source
3 | =================
4 | 
5 | The package relies on :epkg:`cmake` to build the C++ extensions,
6 | whether they are wrapped with :epkg:`pybind11` or :epkg:`cython`.
7 | Both options are available and can be linked with :epkg:`openmp`,
8 | :epkg:`eigen`, :epkg:`onnxruntime`, :epkg:`CUDA`.
9 | *cmake* is called from ``setup.py``
10 | with two instructions:
11 | 
12 | * ``python setup.py build_ext --inplace``, the legacy way
13 | * ``pip install -e .``, the new way
14 | 
15 | By default, *cmake* builds with CUDA if it is available. It can be disabled:
16 | 
17 | * ``python setup.py build_ext -v --inplace --with-cuda=0``, the legacy way
18 | * ``pip install -e . -v --config-settings="--with-cuda=0"``, the new way (not fully working yet)
19 | * ``pip install -e . -v --global-option "--with-cuda=0"``, the deprecated way
20 | * ``USE_CUDA=0 pip install -e . -v``, the workaround way
21 | 
22 | In case there are multiple versions of CUDA installed, option `cuda-version`
23 | can be specified:
24 | 
25 | ::
26 | 
27 |     python setup.py build_ext --inplace --cuda-version=12.6
28 | 
29 | A development version of :epkg:`onnxruntime` can be used if it was already built,
30 | with ``--ort-version=<build_folder>``. Example:
31 | 
32 | ::
33 | 
34 |     python setup.py build_ext --inplace --cuda-version=12.6 --ort-version=/home/github/onnxruntime/build/linux_cuda/Release
35 | 
36 | ``--cuda-link=SHARED`` helps reduce the binary size.
37 | 
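Both options can be combined; for instance (assuming CUDA 12.6 is installed):

::

    python setup.py build_ext --inplace --cuda-version=12.6 --cuda-link=SHARED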
38 | .. toctree::
39 |     :maxdepth: 1
40 | 
41 |     build_cython
42 |     build_pybind11
43 |     build_cuda
44 |     build_ortext
45 |     readings
--------------------------------------------------------------------------------
/_doc/tutorial/build_cuda.rst:
--------------------------------------------------------------------------------
1 | Build with CUDA
2 | ===============
3 | 
4 | The build may include pybind11 extensions built with CUDA.
5 | The setup is more complex as CUDA is not always available.
6 | The profiler may be enabled as well.
7 | 
8 | cmake
9 | +++++
10 | 
11 | The first step is to load the extension ``FindCudaExtension.cmake``
12 | with `find_package(CudaExtension)`. This file exposes the function
13 | `cuda_pybind11_add_module(name pybindfile)`, called for
14 | every extension to build and used as follows:
15 | 
16 | ::
17 | 
18 |     if(CUDA_AVAILABLE)
19 | 
20 |       cuda_pybind11_add_module(
21 |         cuda_example_py                                           # name
22 |         ../onnx_extended/validation/cuda/cuda_example_py.cpp      # pybind11 file
23 |         ../onnx_extended/validation/cuda/cuda_example.cu          # CUDA code
24 |         ../onnx_extended/validation/cuda/cuda_example_reduce.cu)  # CUDA code
25 | 
26 |     endif()
27 | 
28 | The function accepts many source files whether they have extension c, cpp, cc, cu.
29 | Other link dependencies can be added as well
30 | by adding an instruction like `target_link_libraries(name PRIVATE lib_name)`.
31 | These projects define the constant `CUDA_VERSION`. For example, version 11.8 becomes
32 | `11080`.
33 | 
34 | setup.py
35 | ++++++++
36 | 
37 | ``setup.py``
38 | defines a custom command to call cmake. Another line must be added
39 | to register the extension in the setup.
40 | 
41 | ::
42 | 
43 |     if platform.system() == "Windows":
44 |         ext = "pyd"
45 |     elif platform.system() == "Darwin":
46 |         ext = "dylib"
47 |     else:
48 |         ext = "so"
49 | 
50 |     if find_cuda():
51 | 
52 |         setup(
53 |             ...
54 |             ext_modules = [
55 |                 ...
56 |                 CMakeExtension(
57 |                     "onnx_extended.validation.cuda.cuda_example_py",
58 |                     f"onnx_extended/validation/cuda/cuda_example_py.{ext}",
59 |                 ),
60 |             ]
61 |         )
62 | 
63 | Function `find_cuda()` executes :epkg:`nvidia-smi` to check
64 | the installation of CUDA.
65 | 
66 | Possible errors
67 | +++++++++++++++
68 | 
69 | CMAKE_CUDA_COMPILER_VERSION=11.5.119 < 12.1, nvcc is not setup properly
70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
71 | 
72 | On Linux, the following error may happen:
73 | 
74 | ::
75 | 
76 |     CMake Error at externals/FindCudaExtension.cmake:60 (message):
77 |       CMAKE_CUDA_COMPILER_VERSION=11.5.119 < 12.1, nvcc is not setup properly.
78 |       Try 'whereis nvcc' and chack the version.
79 |     Call Stack (most recent call first):
80 |       load_externals.cmake:9 (find_package)
81 |       CMakeLists.txt:19 (include)
82 | 
83 | It can be fixed by adding ``--cuda-nvcc=<path_to_nvcc>``. An example:
84 | ``--cuda-nvcc=/usr/local/cuda-12.1/bin/nvcc``.
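The suggestion in the error message is the quickest way to find the value to pass:

::

    whereis nvcc
    /usr/local/cuda-12.1/bin/nvcc --version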
--------------------------------------------------------------------------------
/_doc/tutorial/build_cython.rst:
--------------------------------------------------------------------------------
1 | Build with cython
2 | =================
3 | 
4 | Any :epkg:`cython` extension is built by cmake.
5 | It first calls cython to convert a pyx file into a C++ file
6 | before it is compiled and linked. Using cmake + cython
7 | instead of cython alone makes it easier to link with static
8 | libraries and write unit tests in C++.
9 | 
10 | cmake
11 | +++++
12 | 
13 | The first step is to load the extension ``FindCython.cmake``
14 | with `find_package(Cython REQUIRED)`. This file exposes the function
15 | `cython_add_module(name pyx_file omp_lib)`, called for
16 | every extension to build and used as follows:
17 | 
18 | ::
19 | 
20 |     cython_add_module(
21 |       vector_function_cy                                          # name
22 |       ../onnx_extended/validation/cython/vector_function_cy.pyx   # pyx_file
23 |       OpenMP::OpenMP_CXX                                          # link with this target
24 |       ../onnx_extended/validation/cpu/vector_function.cpp)        # source files
25 | 
26 | The function accepts many source files. Other link dependencies can be added as well
27 | by adding an instruction like `target_link_libraries(name PRIVATE lib_name)`.
28 | This function *cythonizes* the *pyx_file* into a cpp file before building
29 | the dynamic library.
30 | 
31 | setup.py
32 | ++++++++
33 | 
34 | ``setup.py``
35 | defines a custom command to call cmake. Another line must be added
36 | to register the extension in the setup.
37 | 
38 | ::
39 | 
40 |     if platform.system() == "Windows":
41 |         ext = "pyd"
42 |     elif platform.system() == "Darwin":
43 |         ext = "dylib"
44 |     else:
45 |         ext = "so"
46 | 
47 |     setup(
48 |         ...
49 |         ext_modules = [
50 |             ...
51 |             CMakeExtension(
52 |                 "onnx_extended.validation.cython.vector_function_cy",
53 |                 f"onnx_extended/validation/cython/vector_function_cy.{ext}",
54 |             ),
55 |         ]
56 |     )
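Once built in place, a quick sanity check is to import the generated module
(a sketch; the module only exists after a successful build):

.. code-block:: python

    import onnx_extended.validation.cython.vector_function_cy as vfc

    # should point to the compiled extension (.so, .pyd, .dylib), not a .py file
    print(vfc.__file__)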
--------------------------------------------------------------------------------
/_doc/tutorial/build_ortext.rst:
--------------------------------------------------------------------------------
1 | Build with onnxruntime
2 | ======================
3 | 
4 | This package includes a wrapper for :epkg:`onnxruntime` based on
5 | :epkg:`cython`. The standard one relies on :epkg:`pybind11`.
6 | For that purpose, it includes the onnxruntime binaries released
7 | on github (see :epkg:`onnxruntime releases`).
8 | 
9 | build onnxruntime
10 | +++++++++++++++++
11 | 
12 | ::
13 | 
14 |     clear&&CUDA_VERSION=12.6 CUDACXX=/usr/local/cuda-12.6/bin/nvcc python ./tools/ci_build/build.py \
15 |         --config Release --build_wheel --build_dir ./build/linux_cuda \
16 |         --build_shared_lib --use_cuda --cuda_home /usr/local/cuda-12.6/ \
17 |         --cudnn_home /usr/local/cuda-12.6/ --cuda_version=12.6 --enable_training --enable_training_ops \
18 |         --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=61" \
19 |         --parallel --skip_tests
20 | 
21 |     clear&&CUDA_VERSION=12.1 CUDACXX=/usr/local/cuda-12.1/bin/nvcc python ./tools/ci_build/build.py \
22 |         --config Release --build_wheel --build_dir ./build/linux_cuda \
23 |         --build_shared_lib --use_cuda --cuda_home /usr/local/cuda-12.1/ \
24 |         --cudnn_home /usr/local/cuda-12.1/ --cuda_version=12.1 --enable_training --enable_training_ops \
25 |         --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=70;72" \
26 |         --parallel --skip_tests
27 | 
28 | cmake
29 | +++++
30 | 
31 | The first step is to load the extension ``FindOrt.cmake``
32 | with `find_package(Ort REQUIRED)`. This file exposes two functions.
33 | The first one, `ort_add_dependency(name folder_copy)`, copies the binaries
34 | into folder *folder_copy* and links target *name* with onnxruntime.
35 | 
36 | The second function, `ort_add_custom_op(name folder "CPU")`, creates a library with
37 | several custom kernels for onnxruntime and links it with onnxruntime.
38 | *name* is the project name, *folder* its location.
39 | 
40 | ::
41 | 
42 |     ort_add_custom_op(
43 |       ortops_tutorial_cpu                                            # name
44 |       "CPU"
45 |       ../onnx_extended/ortops/tutorial/cpu                           # folder
46 |       ../onnx_extended/ortops/tutorial/cpu/my_kernel.cc              # source file
47 |       ../onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc         # source file
48 |       ../onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc)  # source file
49 | 
50 | Every new kernel can be added by adding a new source file. A line must be added
51 | in file `ort_tutorial_cpu_lib.cc` to register the kernel. That file also defines
52 | the domain the kernel belongs to.
53 | These projects define the constant `ORT_VERSION`. For example, version 1.15 becomes
54 | `1150`.
--------------------------------------------------------------------------------
/_doc/tutorial/build_pybind11.rst:
--------------------------------------------------------------------------------
1 | Build with pybind11
2 | ===================
3 | 
4 | Any :epkg:`pybind11` extension is built by cmake.
5 | Using cmake + pybind11 instead of pybind11 alone
6 | makes it easier to link with static
7 | libraries and write unit tests in C++.
8 | 
9 | cmake
10 | +++++
11 | 
12 | The first step is to load the extension ``FindLocalPyBind11.cmake``
13 | with ``find_package(LocalPyBind11 REQUIRED)``.
14 | This extension fetches the content of pybind11 and builds it with
15 | `FetchContent_Populate(pybind11)`. The version is registered there.
16 | It must be done once.
17 | It defines a function `local_pybind11_add_module(name omp_lib)` called for
18 | every extension to build and used as follows:
19 | 
20 | ::
21 | 
22 |     local_pybind11_add_module(
23 |       _validation                                        # name
24 |       OpenMP::OpenMP_CXX                                 # link with this library
25 |       ../onnx_extended/validation/cpu/_validation.cpp    # source file
26 |       ../onnx_extended/validation/cpu/vector_sum.cpp)    # source file
27 | 
28 | Additional libraries can be added with `target_link_libraries(name PRIVATE lib_name)`.
29 | 
30 | setup.py
31 | ++++++++
32 | 
33 | ``setup.py``
34 | defines a custom command to call cmake. Another line must be added
35 | to register the extension in the setup.
36 | 
37 | ::
38 | 
39 |     if platform.system() == "Windows":
40 |         ext = "pyd"
41 |     elif platform.system() == "Darwin":
42 |         ext = "dylib"
43 |     else:
44 |         ext = "so"
45 | 
46 |     setup(
47 |         ...
48 |         ext_modules = [
49 |             ...
50 |             CMakeExtension(
51 |                 "onnx_extended.validation.cpu._validation",
52 |                 f"onnx_extended/validation/cpu/_validation.{ext}",
53 |             ),
54 |         ]
55 |     )
--------------------------------------------------------------------------------
/_doc/tutorial/custom_ops.rst:
--------------------------------------------------------------------------------
1 | Custom Kernels for onnxruntime
2 | ==============================
3 | 
4 | :epkg:`onnxruntime` implements a C API which allows the user
5 | to add a custom implementation for any new operator.
6 | This mechanism is described in the onnxruntime documentation
7 | under *Custom operators*.
8 | This package implements a couple of custom operators for CPU and
9 | GPU (NVIDIA). The first step is to register the assembly to let
10 | onnxruntime use them.
11 | 
12 | .. code-block:: python
13 | 
14 |     from onnxruntime import InferenceSession, SessionOptions
15 |     from onnx_extended.ortops.optim.cpu import get_ort_ext_libs
16 | 
17 |     opts = SessionOptions()
18 |     opts.register_custom_ops_library(get_ort_ext_libs()[0])
19 | 
20 |     sess = InferenceSession(
21 |         "<model.onnx>", opts, providers=[..., "CPUExecutionProvider"]
22 |     )
23 | 
24 | It supports any onnxruntime C API greater than version:
25 | 
26 | .. runpython::
27 |     :showcode:
28 | 
29 |     from onnx_extended.ortcy.wrap.ortinf import get_ort_c_api_supported_version
30 | 
31 |     print(get_ort_c_api_supported_version())
32 | 
33 | The next sections introduce the list of operators and assemblies this package
34 | implements.
35 | 
36 | onnx_extended.ortops.tutorial.cpu
37 | +++++++++++++++++++++++++++++++++
38 | 
39 | .. runpython::
40 |     :showcode:
41 | 
42 |     from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs
43 | 
44 |     print(get_ort_ext_libs())
45 | 
46 | .. runpython::
47 |     :rst:
48 | 
49 |     from onnx_extended.ortops.tutorial.cpu import documentation
50 | 
51 |     print("\n".join(documentation()))
52 | 
53 | onnx_extended.ortops.tutorial.cuda
54 | ++++++++++++++++++++++++++++++++++
55 | 
56 | .. runpython::
57 |     :showcode:
58 | 
59 |     from onnx_extended.ortops.tutorial.cuda import get_ort_ext_libs
60 | 
61 |     try:
62 |         print(get_ort_ext_libs())
63 |     except AssertionError as e:
64 |         print(f"CUDA is not enabled: {e}")
65 | 
66 | .. runpython::
67 |     :rst:
68 | 
69 |     from onnx_extended.ortops.tutorial.cuda import documentation
70 | 
71 |     print("\n".join(documentation()))
72 | 
73 | onnx_extended.ortops.optim.cpu
74 | ++++++++++++++++++++++++++++++
75 | 
76 | .. runpython::
77 |     :showcode:
78 | 
79 |     from onnx_extended.ortops.optim.cpu import get_ort_ext_libs
80 | 
81 |     print(get_ort_ext_libs())
82 | 
83 | .. runpython::
84 |     :rst:
85 | 
86 |     from onnx_extended.ortops.optim.cpu import documentation
87 | 
88 |     print("\n".join(documentation()))
89 | 
--------------------------------------------------------------------------------
/_doc/tutorial/cython_binding.rst:
--------------------------------------------------------------------------------
1 | Cython Binding of onnxruntime
2 | =============================
3 | 
4 | :epkg:`onnxruntime` implements a python API based on :epkg:`pybind11`.
5 | This API is custom and does not leverage the C API.
6 | This package implements the class
7 | :class:`OrtSession <onnx_extended.ortcy.wrap.ortinf.OrtSession>`.
8 | The binding is based on :epkg:`cython`, which is faster.
9 | The difference is significant when onnxruntime deals with small tensors.
10 | 
11 | .. runpython::
12 |     :showcode:
13 | 
14 |     import numpy
15 |     from onnx import TensorProto
16 |     from onnx.helper import (
17 |         make_model,
18 |         make_node,
19 |         make_graph,
20 |         make_tensor_value_info,
21 |         make_opsetid,
22 |     )
23 |     from onnx_extended.ortcy.wrap.ortinf import OrtSession
24 | 
25 |     X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None])
26 |     Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None])
27 |     Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None, None])
28 |     node = make_node("Add", ["X", "Y"], ["Z"])
29 |     graph = make_graph([node], "add", [X, Y], [Z])
30 |     onnx_model = make_model(
31 |         graph, opset_imports=[make_opsetid("", 18)], ir_version=8
32 |     )
33 | 
34 |     with open("model.onnx", "wb") as f:
35 |         f.write(onnx_model.SerializeToString())
36 | 
37 |     session = OrtSession("model.onnx")
38 |     x = numpy.random.randn(2, 3).astype(numpy.float32)
39 |     y = numpy.random.randn(2, 3).astype(numpy.float32)
40 |     got = session.run([x, y])
41 | 
42 |     print(got)
43 | 
44 | 
45 | The signature is different compared to onnxruntime's
46 | ``session.run(None, {"X": x, "Y": y})``, to increase performance.
47 | This binding supports custom operators as well.
48 | A benchmark, :ref:`l-cython-pybind11-ort-bindings`, compares
49 | :epkg:`onnxruntime` to this new binding.
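The two calls side by side (a sketch reusing ``model.onnx`` and the arrays
``x`` and ``y`` from the example above):

.. code-block:: python

    from onnxruntime import InferenceSession
    from onnx_extended.ortcy.wrap.ortinf import OrtSession

    # pybind11-based binding: inputs are passed by name, outputs selected by name
    sess = InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    expected = sess.run(None, {"X": x, "Y": y})

    # cython-based binding: inputs are passed positionally
    session = OrtSession("model.onnx")
    got = session.run([x, y])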
50 | 
--------------------------------------------------------------------------------
/_doc/tutorial/images/plot_optim_tree_ensemble.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/tutorial/images/plot_optim_tree_ensemble.png
--------------------------------------------------------------------------------
/_doc/tutorial/index.rst:
--------------------------------------------------------------------------------
1 | 
2 | Tutorial
3 | ========
4 | 
5 | This package is mostly used to validate custom implementations
6 | of a specific onnx operator or **kernel**.
7 | The same code is used either to implement a custom kernel for the
8 | reference implementation of the :epkg:`onnx` package or a custom kernel
9 | for :epkg:`onnxruntime`. The last section
10 | describes how to build the package and how to add a new implementation
11 | depending on the technology it relies on (CPU, openmp, CUDA, eigen, ...).
12 | Another section is a sorted index of the examples.
13 | 
14 | .. toctree::
15 |     :maxdepth: 1
16 |     :caption: Kernels
17 | 
18 |     reference_evaluator
19 |     cython_binding
20 |     custom_ops
21 |     ops
22 |     many_tools
23 |     build
24 | 
25 | .. toctree::
26 |     :maxdepth: 1
27 |     :caption: Deprecated
28 | 
29 |     parallelization
--------------------------------------------------------------------------------
/_doc/tutorial/many_tools.rst:
--------------------------------------------------------------------------------
1 | 
2 | =======================================
3 | Many Tools to help investigating issues
4 | =======================================
5 | 
6 | Developers write many lines of code; many are part of a package,
7 | and many others are written to investigate what the former produce.
8 | This section gathers some tools occasionally needed
9 | to write converters in :epkg:`sklearn-onnx`, to implement
10 | kernels in :epkg:`onnxruntime`, or to add new operators in :epkg:`onnx`.
11 | The first series is used to play with :epkg:`onnx` files.
12 | A couple of the helpers described below are available
13 | through command lines.
14 | 
15 | .. toctree::
16 |     :maxdepth: 1
17 |     :caption: onnx
18 | 
19 |     external_data
20 |     onnx_manipulations
21 |     quantize
22 |     statistics
23 | 
24 | The second series is used to investigate C++ implementations
25 | in :epkg:`onnxruntime`.
26 | 
27 | .. toctree::
28 |     :maxdepth: 1
29 |     :caption: onnxruntime
30 | 
31 |     profiling
32 |     ort_debug
33 |     old_version
34 |     trees
--------------------------------------------------------------------------------
/_doc/tutorial/onnx_manipulations.rst:
--------------------------------------------------------------------------------
1 | 
2 | Onnx Manipulations
3 | ==================
4 | 
5 | Extract a subgraph
6 | ++++++++++++++++++
7 | 
8 | Both functions below are usually used to extract a small piece of an existing
9 | model to create unit tests.
10 | 
11 | Function :func:`onnx_remove_node_unused
12 | <onnx_extended.tools.onnx_nodes.onnx_remove_node_unused>`
13 | removes every node whose outputs are not used.
14 | 
15 | Function :func:`select_model_inputs_outputs
16 | <onnx_extended.tools.onnx_nodes.select_model_inputs_outputs>`
17 | creates an onnx graph taking any intermediate results as new inputs
18 | or new outputs.
19 | 
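A minimal sketch chaining both functions (it assumes the model has an
intermediate result named ``z1`` and that the function accepts the list of
output names to keep; see the signatures for the exact arguments):

.. code-block:: python

    from onnx import load
    from onnx_extended.tools.onnx_nodes import (
        onnx_remove_node_unused,
        select_model_inputs_outputs,
    )

    model = load("model.onnx")

    # keep only what is needed to compute the intermediate result "z1"
    sub = select_model_inputs_outputs(model, ["z1"])

    # then drop every node whose outputs are not used anymore
    sub = onnx_remove_node_unused(sub)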
These results are part of the local 26 | context but they are not explicit mentioned and that sometimes 27 | makes it difficult to understand what subgraph is doing or needs. 28 | Function :func:`get_hidden_inputs 29 | ` 30 | retrieves that information. 31 | 32 | Function :func:`enumerate_onnx_node_types 33 | ` 34 | quickly gives the list of operators a model uses. 35 | -------------------------------------------------------------------------------- /_doc/tutorial/ops.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Focus on operators optimization 3 | =============================== 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | :caption: Conv 8 | 9 | ../auto_examples/plot_op_conv_py_vs_c 10 | ../auto_examples/plot_op_conv_denorm 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Gemm 15 | 16 | ../auto_examples/plot_bench_gemm_f8 17 | ../auto_examples/plot_bench_gemm_ort 18 | ../auto_examples/plot_profile_gemm_ort 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | :caption: Einsum 23 | 24 | ../auto_examples/plot_op_einsum 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | :caption: Mul 29 | 30 | ../auto_examples/plot_op_mul_cuda 31 | 32 | .. toctree:: 33 | :maxdepth: 1 34 | :caption: TreeEnsemble 35 | 36 | ../auto_examples/plot_op_tree_ensemble_optim 37 | ../auto_examples/plot_op_tree_ensemble_sparse 38 | -------------------------------------------------------------------------------- /_doc/tutorial/ort_debug.rst: -------------------------------------------------------------------------------- 1 | 2 | Debug Intermediate Results 3 | ========================== 4 | 5 | The reference evaluation (:class:`onnx_extended.reference.CReferenceEvaluator`) 6 | can return all intermediate results. :epkg:`onnxruntime` does not 7 | unless the onnx model is split to extract the intermediate results. 8 | Function :func:`enumerate_ort_run ` 9 | creates many models, inputs are always the same, new outputs are intermediate 10 | results of an original model. 11 | 12 | .. 
12 | .. runpython::
13 |     :showcode:
14 | 
15 |     import logging
16 |     import numpy as np
17 |     from onnx import TensorProto
18 |     from onnx.helper import (
19 |         make_model,
20 |         make_node,
21 |         make_graph,
22 |         make_tensor_value_info,
23 |         make_opsetid,
24 |     )
25 |     from onnx.checker import check_model
26 |     from onnx_extended.tools.ort_debug import enumerate_ort_run
27 | 
28 |     logging.getLogger("onnx-extended").setLevel(logging.ERROR)
29 | 
30 |     def get_model():
31 |         X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None])
32 |         Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None])
33 |         Z = make_tensor_value_info("Z", TensorProto.INT64, [None, None])
34 |         graph = make_graph(
35 |             [
36 |                 make_node("Add", ["X", "Y"], ["z1"]),
37 |                 make_node("Mul", ["X", "z1"], ["z2"]),
38 |                 make_node("Cast", ["z2"], ["Z"], to=TensorProto.INT64),
39 |             ],
40 |             "add",
41 |             [X, Y],
42 |             [Z],
43 |         )
44 |         onnx_model = make_model(
45 |             graph, opset_imports=[make_opsetid("", 18)], ir_version=8
46 |         )
47 |         check_model(onnx_model)
48 |         return onnx_model
49 | 
50 |     model = get_model()
51 |     feeds = {
52 |         "X": np.arange(4).reshape((2, 2)).astype(np.float32),
53 |         "Y": np.arange(4).reshape((2, 2)).astype(np.float32),
54 |     }
55 | 
56 |     for names, outs, node in enumerate_ort_run(model, feeds, verbose=2):
57 |         print(f"NODE: {node.op_type}")
58 |         for n, o in zip(names, outs):
59 |             print(f"  {n}:{o.dtype}:{o.shape}")
--------------------------------------------------------------------------------
/_doc/tutorial/parallelization.rst:
--------------------------------------------------------------------------------
1 | 
2 | Experiments about parallelization
3 | =================================
4 | 
5 | .. toctree::
6 |     :maxdepth: 1
7 | 
8 |     ../auto_examples/plot_bench_cpu
--------------------------------------------------------------------------------
/_doc/tutorial/quantize.rst:
--------------------------------------------------------------------------------
1 | 
2 | Quantization
3 | ============
4 | 
5 | *to be completed*
--------------------------------------------------------------------------------
/_doc/tutorial/readings.rst:
--------------------------------------------------------------------------------
1 | ========
2 | Readings
3 | ========
4 | 
5 | Some helpful articles, papers, and documents.
6 | 
7 | Build
8 | =====
9 | 
10 | * Compiler Options Hardening Guide for C and C++
11 | * Build a custom ONNX Runtime package
12 | 
13 | Custom Operators with onnxruntime
14 | =================================
15 | 
16 | * Custom operators (onnxruntime documentation)
17 | * custom_op_lib.cc (onnxruntime repository)
--------------------------------------------------------------------------------
/_doc/tutorial/reference_evaluator.rst:
--------------------------------------------------------------------------------
1 | 
2 | CReferenceEvaluator
3 | ===================
4 | 
5 | Class :class:`CReferenceEvaluator <onnx_extended.reference.CReferenceEvaluator>`
6 | extends :class:`onnx.reference.ReferenceEvaluator` with custom operators implemented
7 | in C++ in order to speed up the evaluation of this python runtime.
8 | This class inherits from :class:`onnx.reference.ReferenceEvaluator` and automatically
9 | adds the C++ implementation of these operators.
10 | It rewrites the following kernels and can be used as follows.
11 | 
12 | .. runpython::
13 |     :showcode:
14 | 
15 |     import numpy as np
16 |     from onnx import TensorProto
17 |     from onnx.helper import (
18 |         make_graph,
19 |         make_model,
20 |         make_node,
21 |         make_opsetid,
22 |         make_tensor_value_info,
23 |     )
24 |     from onnx.reference import ReferenceEvaluator
25 |     from onnxruntime import InferenceSession
26 |     from onnx_extended.ext_test_case import measure_time
27 |     from onnx_extended.reference import CReferenceEvaluator
28 | 
29 |     X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None, None, None])
30 |     Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None, None, None])
31 |     B = make_tensor_value_info("B", TensorProto.FLOAT, [None, None, None, None])
32 |     W = make_tensor_value_info("W", TensorProto.FLOAT, [None, None, None, None])
33 |     node = make_node(
34 |         "Conv",
35 |         ["X", "W", "B"],
36 |         ["Y"],
37 |         pads=[1, 1, 1, 1],
38 |         dilations=[1, 1],
39 |         strides=[2, 2],
40 |     )
41 |     graph = make_graph([node], "g", [X, W, B], [Y])
42 |     onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)])
43 | 
44 |     sH, sW = 64, 64
45 |     X = np.arange(sW * sH).reshape((1, 1, sH, sW)).astype(np.float32)
46 |     W = np.ones((1, 1, 3, 3), dtype=np.float32)
47 |     B = np.array([[[[0]]]], dtype=np.float32)
48 | 
49 |     sess1 = ReferenceEvaluator(onnx_model)
50 |     sess2 = CReferenceEvaluator(onnx_model)  # 10 to 100 times faster
51 | 
52 |     expected = sess1.run(None, {"X": X, "W": W, "B": B})[0]
53 |     got = sess2.run(None, {"X": X, "W": W, "B": B})[0]
54 |     diff = np.abs(expected - got).max()
55 |     print(f"difference: {diff}")
56 | 
57 | The list of rewritten kernels follows.
58 | 
59 | .. runpython::
60 |     :showcode:
61 | 
62 |     import pprint
63 |     from onnx_extended.reference import CReferenceEvaluator
64 | 
65 |     pprint.pprint(
66 |         [cl.__name__ for cl in CReferenceEvaluator.default_ops()]
67 |     )
--------------------------------------------------------------------------------
/_doc/tutorial/statistics.rst:
--------------------------------------------------------------------------------
1 | 
2 | Statistics
3 | ==========
4 | 
5 | To be completed.
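In the meantime, functions such as :func:`enumerate_onnx_node_types
<onnx_extended.tools.onnx_nodes.enumerate_onnx_node_types>` already give basic
statistics on a model. A minimal sketch (assuming the function accepts a loaded
model; see its documentation for the exact signature):

.. code-block:: python

    import pprint
    from onnx import load
    from onnx_extended.tools.onnx_nodes import enumerate_onnx_node_types

    model = load("model.onnx")
    # one entry per node type used by the model
    pprint.pprint(list(enumerate_onnx_node_types(model)))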
6 | -------------------------------------------------------------------------------- /_unittests/ut_ortcy/data/add.onnx: -------------------------------------------------------------------------------- 1 | :H 2 |  3 | X 4 | YZ"AddaddZ 5 | X 6 | 7 |  8 | 9 | Z 10 | Y 11 | 12 |  13 | 14 | b 15 | Z 16 | 17 |  18 | 19 | B 20 |  -------------------------------------------------------------------------------- /_unittests/ut_ortcy/test_inference.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended_helpers.h" 2 | #include "onnx_extended_test_common.h" 3 | // #include "onnx_extended/ortcy/wrap/ortapi.h" 4 | #include "onnxruntime_cxx_api.h" 5 | #if __cplusplus >= 201703L 6 | #include 7 | #endif 8 | #ifdef _WIN32 9 | #include 10 | #include 11 | #endif 12 | 13 | void testAssertTrue() { ASSERT_THROW(true); } 14 | 15 | void test_inference() { 16 | const OrtApi *api = OrtGetApiBase()->GetApi(ORT_API_VERSION); 17 | ASSERT_THROW(api != nullptr); 18 | Ort::Env env; 19 | auto ort_env = &env; // std::make_unique(ORT_LOGGING_LEVEL_WARNING, "Default"); 20 | Ort::SessionOptions session_options; 21 | session_options.SetIntraOpNumThreads(1); 22 | session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); 23 | session_options.SetLogSeverityLevel(0); 24 | 25 | // requires C++ 17 26 | std_string_type model = get_data_path("ut_ortcy/data/add.onnx"); 27 | 28 | Ort::Session session(*ort_env, model.c_str(), session_options); 29 | 30 | const char *input_names[] = {"X", "Y"}; 31 | const char *output_names[] = {"Z"}; 32 | 33 | float vector_1_value[] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f}; 34 | int64_t vector_1_dim[] = {6, 1}; 35 | 36 | float vector_2_value[] = {0.f, 1.f, 2.f, 3.f, 4.f, 50.f}; 37 | int64_t vector_2_dim[] = {6, 1}; 38 | 39 | auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 40 | 41 | Ort::Value input_tensors[] = { 42 | Ort::Value::CreateTensor(memory_info, vector_1_value, 6, vector_1_dim, 2), 43 | Ort::Value::CreateTensor(memory_info, vector_2_value, 6, vector_2_dim, 2)}; 44 | 45 | Ort::RunOptions run_options; 46 | auto output_tensors = 47 | session.Run(run_options, input_names, input_tensors, 2, output_names, 1); 48 | const auto &vector_filterred = output_tensors.at(0); 49 | auto type_shape_info = vector_filterred.GetTensorTypeAndShapeInfo(); 50 | const float *floats_output = static_cast(vector_filterred.GetTensorRawData()); 51 | ASSERT_EQUAL(floats_output[0], 0); 52 | ASSERT_EQUAL(floats_output[1], 2); 53 | ASSERT_EQUAL(floats_output[2], 4); 54 | ASSERT_EQUAL(floats_output[3], 6); 55 | ASSERT_EQUAL(floats_output[4], 8); 56 | ASSERT_EQUAL(floats_output[5], 55); 57 | } 58 | 59 | int main(int, char **) { 60 | testAssertTrue(); 61 | test_inference(); 62 | } 63 | -------------------------------------------------------------------------------- /_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_custom.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_custom.onnx -------------------------------------------------------------------------------- /_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_sparse.onnx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_sparse.onnx -------------------------------------------------------------------------------- /_unittests/ut_ortops/test_inference_tree.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended_helpers.h" 2 | #include "onnx_extended_test_common.h" 3 | // #include "onnx_extended/ortcy/wrap/ortapi.h" 4 | #include "onnxruntime_cxx_api.h" 5 | 6 | void test_inference_tree_ensemble() { 7 | #if !defined(_WIN32) && (ORT_API_VERSION >= 17) 8 | const OrtApi *api = OrtGetApiBase()->GetApi(ORT_API_VERSION); 9 | ASSERT_THROW(api != nullptr); 10 | Ort::Env env; 11 | auto ort_env = &env; 12 | Ort::SessionOptions session_options; 13 | session_options.RegisterCustomOpsLibrary(to_std_string_path(TESTED_CUSTOM_OPS_DLL).c_str()); 14 | 15 | // requires C++ 17 16 | std_string_type model = 17 | get_data_path("ut_ortops/data/plot_op_tree_ensemble_implementations_custom.onnx"); 18 | 19 | Ort::Session session(*ort_env, model.c_str(), session_options); 20 | // It needs to revisited. 21 | return; 22 | 23 | const char *input_names[] = {"X"}; 24 | const char *output_names[] = {"variable"}; 25 | 26 | int64_t vector_1_dim[] = {100, 500}; 27 | std::vector vector_1_value(vector_1_dim[0] * vector_1_dim[1]); 28 | for (size_t i = 0; i < vector_1_value.size(); ++i) { 29 | vector_1_value[i] = 1.0f / static_cast(i + 1); 30 | } 31 | 32 | auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 33 | 34 | Ort::Value input_tensors[] = {Ort::Value::CreateTensor( 35 | memory_info, vector_1_value.data(), vector_1_value.size(), vector_1_dim, 2)}; 36 | 37 | const char *env_p = std::getenv("LONG"); 38 | bool long_test = env_p != nullptr && env_p[0] == '1'; 39 | 40 | Ort::RunOptions run_options; 41 | for (int i = 0; i < (long_test ? 
100000 : 1); ++i) { 42 | if (i > 0 && i % 10000 == 0) 43 | printf("i=%d\n", i); 44 | auto out = session.Run(run_options, input_names, input_tensors, 1, output_names, 1); 45 | ASSERT_EQUAL(out.size(), 1); 46 | } 47 | auto output_tensors = 48 | session.Run(run_options, input_names, input_tensors, 1, output_names, 1); 49 | const auto &vector_filterred = output_tensors.at(0); 50 | auto type_shape_info = vector_filterred.GetTensorTypeAndShapeInfo(); 51 | ASSERT_EQUAL(type_shape_info.GetDimensionsCount(), 2); 52 | const float *floats_output = static_cast(vector_filterred.GetTensorRawData()); 53 | // ASSERT_EQUAL(floats_output[0], 0); 54 | ASSERT_NOTEQUAL(floats_output, nullptr); 55 | #endif 56 | } 57 | 58 | int main(int, char **) { test_inference_tree_ensemble(); } 59 | -------------------------------------------------------------------------------- /_unittests/ut_plotting/test_plotting_benchmark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended.plotting.data import hhistograms_data, vhistograms_data 5 | from onnx_extended.plotting.benchmark import hhistograms, vhistograms 6 | 7 | 8 | class TestCReferenceEvaluator(ExtTestCase): 9 | def test_plotting_hhistograms(self): 10 | import matplotlib.pyplot as plt 11 | 12 | plt.clf() 13 | df = pandas.DataFrame(hhistograms_data()) 14 | ax = hhistograms(df, keys=("input", "name")) 15 | self.assertNotEmpty(ax) 16 | 17 | def test_plotting_hhistograms2(self): 18 | import matplotlib.pyplot as plt 19 | 20 | plt.clf() 21 | df = pandas.DataFrame(hhistograms_data()) 22 | df = df[df.input == "dense"] 23 | df = df.drop("input", axis=1) 24 | ax = hhistograms(df, keys="name") 25 | self.assertNotEmpty(ax) 26 | 27 | def test_plotting_vhistograms(self): 28 | import matplotlib.pyplot as plt 29 | 30 | plt.clf() 31 | df = pandas.DataFrame(vhistograms_data()) 32 | ax = vhistograms(df) 33 | self.assertNotEmpty(ax) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main(verbosity=2) 38 | -------------------------------------------------------------------------------- /_unittests/ut_reference/test_c_op_conv.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended_test_common.h" 2 | #include "cpu/c_op_conv_common.h" 3 | #include "cpu/c_op_conv.h" 4 | 5 | using namespace onnx_c_ops; 6 | 7 | void testAssertTrue() { 8 | ASSERT_THROW(true); 9 | } 10 | 11 | void test_gemm() { 12 | float pa[4] = { 1, 2, 3, 4 }; 13 | float pb[4] = { 10, 20, 30, 40 }; 14 | float pc[4] = { -0.1, -0.2, -0.3, -0.4 }; 15 | float expected[4] = { 69.9, 99.8, 149.7, 219.6 }; 16 | gemm(false, false, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc); 17 | ASSERT_EQUAL_VECTOR(4, expected, pc); 18 | 19 | float pc2[4] = { -0.1, -0.2, -0.3, -0.4 }; 20 | float expected2[4] = { 70.0, 100.0, 150.0, 220.0 }; 21 | gemm(false, false, 2, 2, 2, 1.0f, pa, pb, 0.0f, pc2); 22 | ASSERT_EQUAL_VECTOR(4, expected2, pc2); 23 | 24 | float pc3[4] = { -0.1, -0.2, -0.3, -0.4 }; 25 | float expected3[4] = { 139.9, 199.8, 299.7, 439.6 }; 26 | gemm(false, false, 2, 2, 2, 2.0f, pa, pb, 1.0f, pc3); 27 | ASSERT_EQUAL_VECTOR(4, expected3, pc3); 28 | 29 | float paA[4] = { 1, 2, 3, 4 }; 30 | float pbA[4] = { 1, 0, 0, 1 }; 31 | float pcA[4] = { 0, 0, 0, 0 }; 32 | float expectedA[4] = { 1, 3, 2, 4 }; 33 | gemm(true, false, 2, 2, 2, 1.0f, paA, pbA, 1.0f, pcA); 34 | ASSERT_EQUAL_VECTOR(4, expectedA, pcA); 35 | 36 | float paB[4] = { 1, 0, 0, 1 }; 37 | float 
pbB[4] = { 1, 2, 3, 4 }; 38 | float pcB[4] = { 0, 0, 0, 0 }; 39 | float expectedB[4] = { 1, 2, 3, 4 }; 40 | gemm(true, false, 2, 2, 2, 1.0f, paB, pbB, 1.0f, pcB); 41 | ASSERT_EQUAL_VECTOR(4, expectedB, pcB); 42 | 43 | float paC[4] = { 1, 1, 0, 1 }; 44 | float pbC[4] = { 1, 1, 0, 0 }; 45 | float pcC[4] = { 0, 0, 0, 0 }; 46 | float expectedC[4] = { 10, 10, 10, 10 }; 47 | gemm(true, false, 2, 2, 2, 10.0f, paC, pbC, 1.0f, pcC); 48 | ASSERT_EQUAL_VECTOR(4, expectedC, pcC); 49 | 50 | float pc6[4] = { -0.1, -0.2, -0.3, -0.4 }; 51 | float expected6[4] = { 69.9, 149.8, 99.7, 219.6 }; 52 | gemm(true, true, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc6); 53 | ASSERT_EQUAL_VECTOR(4, expected6, pc6); 54 | 55 | float pc4[4] = { -0.1, -0.2, -0.3, -0.4 }; 56 | float expected4[4] = { 99.9, 139.8, 139.7, 199.6 }; 57 | gemm(true, false, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc4); 58 | ASSERT_ALMOST_VECTOR(4, expected4, pc4, 1e-5f); 59 | 60 | float pc5[4] = { -0.1, -0.2, -0.3, -0.4 }; 61 | float expected5[4] = { 49.9, 109.8, 109.7, 249.6 }; 62 | gemm(false, true, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc5); 63 | ASSERT_EQUAL_VECTOR(4, expected5, pc5); 64 | } 65 | 66 | int main(int, char**) { 67 | testAssertTrue(); 68 | test_gemm(); 69 | } 70 | -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench/model.onnx -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/test_data_set_0/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench/test_data_set_0/input_0.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/test_data_set_0/input_1.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench/test_data_set_0/input_1.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/test_data_set_0/output_0.pb: -------------------------------------------------------------------------------- 1 | BZJ  -------------------------------------------------------------------------------- /_unittests/ut_tools/bench_rf/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench_rf/model.onnx -------------------------------------------------------------------------------- /_unittests/ut_tools/bench_rf/test_data_set_0/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench_rf/test_data_set_0/input_0.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/bench_rf/test_data_set_0/output_0.pb: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench_rf/test_data_set_0/output_0.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/data/debug_4700-CPUep.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/data/debug_4700-CPUep.onnx -------------------------------------------------------------------------------- /_unittests/ut_tools/test_einsum_benchmark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended.tools.einsum.einsum_bench import einsum_benchmark 4 | 5 | 6 | class TestEinsumBenchmark(ExtTestCase): 7 | def test_benchmark1(self): 8 | for rt in ["numpy", "python", "onnxruntime"]: 9 | with self.subTest(rt=rt): 10 | res = list(einsum_benchmark(shape=5, runtime=rt)) 11 | self.assertEqual(len(res), 2) 12 | 13 | def test_benchmark_exc(self): 14 | self.assertRaise( 15 | lambda: list(einsum_benchmark(shape=5, runtime="UNK")), ValueError 16 | ) 17 | self.assertRaise( 18 | lambda: list(einsum_benchmark(shape=5, equation="abc,cd->abD", perm=True)), 19 | AssertionError, 20 | ) 21 | 22 | def test_benchmark2(self): 23 | for rt in ["numpy", "python", "onnxruntime"]: 24 | with self.subTest(rt=rt): 25 | res = list(einsum_benchmark(shape=[5, 6], runtime=rt)) 26 | self.assertEqual(len(res), 4) 27 | 28 | def test_benchmark1_shape(self): 29 | for rt in ["numpy", "python", "onnxruntime"]: 30 | with self.subTest(rt=rt): 31 | res = list(einsum_benchmark(shape=[(5, 5, 5), (5, 5)], runtime=rt)) 32 | self.assertEqual(len(res), 2) 33 | 34 | def test_benchmarkn(self): 35 | for rt in ["numpy"]: 36 | with self.subTest(rt=rt): 37 | res = list(einsum_benchmark(shape=5, perm=True, runtime=rt)) 38 | self.assertEqual(len(res), 48) 39 | 40 | 41 | if __name__ == "__main__": 42 | import logging 43 | 44 | logging.getLogger("skl2onnx").setLevel(logging.ERROR) 45 | logging.getLogger("onnx-extended").setLevel(logging.ERROR) 46 | unittest.main(verbosity=2) 47 | -------------------------------------------------------------------------------- /_unittests/ut_tools/test_einsum_bug.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended.tools.einsum import ( 5 | decompose_einsum_equation, 6 | optimize_decompose_einsum_equation, 7 | ) 8 | from onnx_extended.reference import CReferenceEvaluator 9 | 10 | 11 | class TestEinsumBug(ExtTestCase): 12 | def test_abbba(self): 13 | res = decompose_einsum_equation("ab,b->ba", strategy="numpy", clean=True) 14 | self.assertNotEmpty(res) 15 | 16 | def test__pprint_forward(self): 17 | res = decompose_einsum_equation("ab,b->ba", strategy="numpy", clean=True) 18 | pf = res._pprint_forward() 19 | spl = pf.split("<- id") 20 | self.assertEqual(len(spl), 4) 21 | 22 | def common_test_equation(self, equation, dim1, dim2): 23 | seq = decompose_einsum_equation(equation, clean=True, strategy="numpy") 24 | onx = seq.to_onnx("Y", "X1", "X2") 25 | sequ = equation.replace(",", "_").replace("->", "__") 26 | with open(f"temp_{sequ}_A.onnx", "wb") as f: 27 | f.write(onx.SerializeToString()) 28 | a = numpy.random.rand(*list((2,) * dim1)) 29 | b = 
numpy.random.rand(*list((2,) * dim2)) 30 | oinf = CReferenceEvaluator(onx, verbose=0) 31 | got = oinf.run(None, {"X1": a, "X2": b}) 32 | expected = numpy.einsum(equation, a, b) 33 | self.assertEqualArray(expected, got[0], atol=1e-15) 34 | 35 | res = optimize_decompose_einsum_equation( 36 | equation, 37 | numpy.float64, 38 | optimize=True, 39 | runtime="python", 40 | cache=False, 41 | opset=15, 42 | decompose=True, 43 | strategy="ml", 44 | verbose=None, 45 | ) 46 | new_eq = res.equation_ 47 | new_onx = res.onnx_ 48 | sequ = new_eq.replace(",", "_").replace("->", "__") 49 | with open(f"temp_{sequ}_B.onnx", "wb") as f: 50 | f.write(new_onx.SerializeToString()) 51 | oinf = CReferenceEvaluator(new_onx) 52 | got = oinf.run(None, {"X0": a, "X1": b}) 53 | self.assertEqualArray(expected, got[0], atol=1e-15) 54 | 55 | def test_decompose_einsum_abc_cde_abde(self): 56 | self.common_test_equation("abc,cde->abde", 3, 3) 57 | 58 | def test_decompose_einsum_abcd_cde_abe(self): 59 | self.common_test_equation("abcd,cde->abe", 4, 3) 60 | 61 | 62 | if __name__ == "__main__": 63 | import logging 64 | 65 | logging.getLogger("skl2onnx").setLevel(logging.ERROR) 66 | logging.getLogger("onnx-extended").setLevel(logging.ERROR) 67 | unittest.main(verbosity=2) 68 | -------------------------------------------------------------------------------- /_unittests/ut_tools/test_einsum_ml.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from itertools import permutations 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended.tools.einsum.einsum_ml import ( 5 | predict_transposition_cost, 6 | compute_transposition_features, 7 | _edit_distance, 8 | ) 9 | 10 | 11 | class TestEinsumMl(ExtTestCase): 12 | def test_features(self): 13 | res = compute_transposition_features((3, 5, 7), (0, 1, 2)) 14 | self.assertIsInstance(res, dict) 15 | self.assertEqual(res["edit"], 0) 16 | self.assertEqual(res["rot"], -1) 17 | res = compute_transposition_features((3, 5, 7), (2, 1, 0)) 18 | self.assertEqual(res["edit"], 2) 19 | self.assertEqual(res["rot"], 0) 20 | self.assertEqual(res["rev"], 1) 21 | 22 | def test_cost(self): 23 | res = predict_transposition_cost((3, 5, 7), (0, 1, 2)) 24 | self.assertIsInstance(res, float) 25 | self.assertGreaterEqual(res, 0) 26 | for shape in [(3, 5, 7), (30, 50, 70)]: 27 | for perm in permutations([0, 1, 2]): 28 | p = tuple(perm) 29 | cost = predict_transposition_cost(shape, p) 30 | if p[-1] == 2: 31 | self.assertEqual(cost, 0) 32 | 33 | def test_edit_distance(self): 34 | r = _edit_distance("", "a") 35 | self.assertEqual(r, 1) 36 | r = _edit_distance("a", "") 37 | self.assertEqual(r, 1) 38 | r = _edit_distance("a", "ab") 39 | self.assertEqual(r, 1) 40 | 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /_unittests/ut_tools/test_ort_debug.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from contextlib import redirect_stdout 4 | from io import StringIO 5 | from onnx import NodeProto, TensorProto 6 | from onnx.helper import ( 7 | make_model, 8 | make_node, 9 | make_graph, 10 | make_tensor_value_info, 11 | make_opsetid, 12 | ) 13 | from onnx.checker import check_model 14 | from onnx_extended.ext_test_case import ExtTestCase 15 | from onnx_extended.tools.ort_debug import enumerate_ort_run 16 | 17 | 18 | class TestOrtDebug(ExtTestCase): 19 | def _get_model(self): 
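        # Builds a small test model: z1 = X + Y, z2 = X * z1, Z = Cast(z2, to=INT64).
        # enumerate_ort_run (exercised below) yields, for every node executed by
        # onnxruntime, the names it produces, the computed values, and the NodeProto.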
20 | X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None]) 21 | Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None]) 22 | Z = make_tensor_value_info("Z", TensorProto.INT64, [None, None]) 23 | graph = make_graph( 24 | [ 25 | make_node("Add", ["X", "Y"], ["z1"]), 26 | make_node("Mul", ["X", "z1"], ["z2"]), 27 | make_node("Cast", ["z2"], ["Z"], to=TensorProto.INT64), 28 | ], 29 | "add", 30 | [X, Y], 31 | [Z], 32 | ) 33 | onnx_model = make_model( 34 | graph, opset_imports=[make_opsetid("", 18)], ir_version=8 35 | ) 36 | check_model(onnx_model) 37 | return onnx_model 38 | 39 | def test_enumerate_ort_run(self): 40 | model = self._get_model() 41 | feeds = { 42 | "X": np.arange(4).reshape((2, 2)).astype(np.float32), 43 | "Y": np.arange(4).reshape((2, 2)).astype(np.float32), 44 | } 45 | expected_names = [["z1"], ["z2"], ["Z"]] 46 | for i, (names, outs, node) in enumerate(enumerate_ort_run(model, feeds)): 47 | self.assertIsInstance(node, NodeProto) 48 | self.assertIsInstance(names, list) 49 | self.assertIsInstance(outs, list) 50 | self.assertEqual(len(names), len(outs)) 51 | self.assertEqual(names, expected_names[i]) 52 | 53 | st = StringIO() 54 | with redirect_stdout(st): 55 | for _ in enumerate_ort_run(model, feeds, verbose=2): 56 | pass 57 | std = st.getvalue() 58 | self.assertIn("Add(X, Y) -> z1", std) 59 | self.assertIn("+ z1: float32(2, 2)", std) 60 | self.assertIn("Cast(z2, to=7) -> Z", std) 61 | 62 | st = StringIO() 63 | with redirect_stdout(st): 64 | for _ in enumerate_ort_run(model, feeds, verbose=3): 65 | pass 66 | std = st.getvalue() 67 | self.assertIn("Add(X, Y) -> z1", std) 68 | self.assertIn("+ z1: float32(2, 2)", std) 69 | self.assertIn("[[", std) 70 | 71 | 72 | if __name__ == "__main__": 73 | unittest.main(verbosity=2) 74 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_bench_tree.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from onnx import ModelProto 5 | from onnx_extended.ext_test_case import ExtTestCase, skipif_ci_apple 6 | from onnx_extended.validation.bench_trees import create_decision_tree, bench_trees 7 | from onnx_extended.validation._tree_d14_f100 import tree_d14_f100 8 | from onnx_extended.tools.onnx_io import onnx2string 9 | 10 | 11 | class TestBenchTree(ExtTestCase): 12 | def test_create_decision_tree(self): 13 | tree = create_decision_tree(max_depth=2) 14 | code = onnx2string(tree, as_code=True) 15 | self.assertNotIn("import textwrap", code) 16 | # with open("onnx_extended/validation/_tree_d14_f100.py", "w") as f: 17 | # f.write(code) 18 | 19 | def test_tree14(self): 20 | model = tree_d14_f100() 21 | self.assertIsInstance(model, ModelProto) 22 | 23 | def test_bench_tree(self): 24 | res = bench_trees( 25 | max_depth=2, 26 | n_estimators=10, 27 | n_features=4, 28 | batch_size=100, 29 | number=10, 30 | warmup=2, 31 | verbose=0, 32 | engine_names=["onnxruntime", "CReferenceEvaluator"], 33 | ) 34 | self.assertIsInstance(res, list) 35 | self.assertEqual(len(res), 4) 36 | 37 | def test_bench_tree_verbose(self): 38 | st = StringIO() 39 | with redirect_stdout(st): 40 | res = bench_trees( 41 | max_depth=2, 42 | n_estimators=10, 43 | n_features=4, 44 | batch_size=100, 45 | number=10, 46 | warmup=2, 47 | engine_names=["CReferenceEvaluator"], 48 | verbose=2, 49 | ) 50 | text = st.getvalue() 51 | self.assertIsInstance(res, list) 52 | self.assertEqual(len(res), 2) 
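        # bench_trees returns a flat list of measurement rows; the counts asserted
        # here and in test_bench_tree above (2 rows for one engine, 4 for two)
        # suggest two rows per engine and run.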
53 | self.assertIn("test 'CReferenceEvaluator' duration=", text) 54 | 55 | @skipif_ci_apple("crash") 56 | def test_bench_tree_all_engines(self): 57 | res = bench_trees( 58 | max_depth=2, 59 | n_estimators=10, 60 | n_features=4, 61 | batch_size=100, 62 | number=10, 63 | warmup=2, 64 | repeat=1, 65 | engine_names=["onnxruntime", "onnxruntime-customops"], # , "cython"], 66 | ) 67 | self.assertIsInstance(res, list) 68 | self.assertEqual(len(res), 2) 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest.main(verbosity=2) 73 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cpu_fpemu.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended/validation/cpu/cpu_fpemu.hpp" 2 | #include "onnx_extended_helpers.h" 3 | #include "onnx_extended_test_common.h" 4 | 5 | using namespace cpu_fpemu; 6 | 7 | void test_cast() { 8 | 9 | #if defined(__SSSE3__) 10 | 11 | float f = 1.f; 12 | double d = 1.f; 13 | float ff = __double2float_rn(d); 14 | ASSERT_THROW(f == ff); 15 | unsigned short u = __float2half_rn(f); 16 | float bu = __half2float(u); 17 | ASSERT_THROW(f == bu); 18 | 19 | #endif 20 | 21 | } 22 | 23 | int main(int, char**) { 24 | test_cast(); 25 | } 26 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cpu_fpemu.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended.validation.cpu._validation import has_sse3 4 | 5 | 6 | class TestCpuFpEmu(ExtTestCase): 7 | @unittest.skipIf(not has_sse3(), "SSE3 not available") 8 | def test_cast(self): 9 | from onnx_extended.validation.cpu._validation import ( 10 | double2float_rn, 11 | float2half_rn, 12 | half2float, 13 | ) 14 | 15 | self.assertEqual(double2float_rn(1), 1) 16 | self.assertEqual(half2float(float2half_rn(1)), 1) 17 | 18 | 19 | if __name__ == "__main__": 20 | unittest.main(verbosity=2) 21 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cuda_fpemu.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended import has_cuda 5 | 6 | 7 | class TestCudaFpemu(ExtTestCase): 8 | @unittest.skipIf(not has_cuda(), reason="CUDA not available") 9 | def test_fpemu_cuda_forward(self): 10 | from onnx_extended.validation.cuda.cuda_example_py import ( 11 | fpemu_cuda_forward, 12 | ) 13 | 14 | values = np.array( 15 | [-2, -1, 0, 1, 2, 3, 10, 100, 10000, 20000, 50000, 100000, -100000], 16 | dtype=np.float32, 17 | ) 18 | res = fpemu_cuda_forward(values) 19 | expected = res.copy() 20 | self.assertEqual(res.shape, values.shape) 21 | res = fpemu_cuda_forward(values) 22 | self.assertEqual(res.shape, values.shape) 23 | fpemu_cuda_forward(values, inplace=True) 24 | self.assertEqualArray(expected, values) 25 | 26 | 27 | if __name__ == "__main__": 28 | unittest.main(verbosity=2) 29 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cuda_gemm.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended import has_cuda 4 | 5 | if has_cuda(): 6 | from 
onnx_extended.validation.cuda.cuda_example_py import ( 7 | gemm_benchmark_test, 8 | get_device_prop, 9 | cuda_device_count, 10 | cuda_device_memory, 11 | cuda_devices_memory, 12 | ) 13 | else: 14 | gemm_benchmark_test = None 15 | get_device_prop = None 16 | 17 | 18 | class TestCudaGemm(ExtTestCase): 19 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 20 | def test_get_device_prop(self): 21 | r = get_device_prop() 22 | self.assertIsInstance(r, dict) 23 | self.assertEqual(len(r), 12) 24 | 25 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 26 | def test_cuda_device_count(self): 27 | r = cuda_device_count() 28 | self.assertIsInstance(r, int) 29 | self.assertGreater(r, 0) 30 | 31 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 32 | def test_cuda_device_memory(self): 33 | r = cuda_device_memory(0) 34 | self.assertIsInstance(r, tuple) 35 | self.assertEqual(len(r), 2) 36 | 37 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 38 | def test_cuda_devices_memory(self): 39 | r = cuda_devices_memory() 40 | n = cuda_device_count() 41 | self.assertIsInstance(r, list) 42 | self.assertEqual(len(r), n) 43 | self.assertIsInstance(r[0], tuple) 44 | self.assertEqual(len(r[0]), 2) 45 | 46 | def gemm_test(self, test): 47 | r = gemm_benchmark_test(test) 48 | self.assertIsInstance(r, dict) 49 | self.assertEqual(len(r), 24) 50 | self.assertEqual(r["N"], 10) 51 | 52 | @unittest.skipIf(gemm_benchmark_test is None, reason="CUDA not available") 53 | def test_gemm_test_float32(self): 54 | for i in range(5): 55 | with self.subTest(test=i): 56 | self.gemm_test(i) 57 | 58 | @unittest.skipIf(gemm_benchmark_test is None, reason="CUDA not available") 59 | def test_gemm_test_float8(self): 60 | r = get_device_prop() 61 | if r["major"] < 9: 62 | return 63 | for i in range(5, 15): 64 | if i in {8, 9, 10, 12, 13}: 65 | # still invalid 66 | continue 67 | with self.subTest(test=i): 68 | self.gemm_test(i) 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest.main(verbosity=2) 73 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cuda_monitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended import has_cuda 4 | 5 | if has_cuda(): 6 | from onnx_extended.validation.cuda.cuda_monitor import ( 7 | nvml_device_get_count, 8 | nvml_device_get_memory_info, 9 | nvml_init, 10 | nvml_shutdown, 11 | ) 12 | else: 13 | nvml_init = None 14 | 15 | 16 | class TestCudaMonitor(ExtTestCase): 17 | @unittest.skipIf(nvml_init is None, reason="CUDA not available") 18 | def test_nvml(self): 19 | nvml_init() 20 | r = nvml_device_get_count() 21 | self.assertIsInstance(r, int) 22 | self.assertGreater(r, 0) 23 | info = nvml_device_get_memory_info() 24 | self.assertIsInstance(info, tuple) 25 | self.assertEqual(len(info), 3) 26 | self.assertTrue(info[-1] >= max(info[:-1])) 27 | nvml_shutdown() 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main(verbosity=2) 32 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_fp8.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import unittest 3 | import numpy 4 | from onnx_extended.ext_test_case import ExtTestCase 5 | 6 | try: 7 | from onnx_array_api.validation.f8 import search_float32_into_fe4m3 8 | except 
ImportError: 9 | # onnx-array-api is not recent enough 10 | search_float32_into_fe4m3 = None 11 | 12 | 13 | class TestFloat8(ExtTestCase): 14 | def test_cast_float32_to_e4m3fn(self): 15 | from onnx_extended.validation.cython.fp8 import ( 16 | cast_float32_to_e4m3fn, 17 | cast_e4m3fn_to_float32, 18 | ) 19 | 20 | values = numpy.array([[10, 1, 4, 5, 6, 7]], dtype=numpy.float32) 21 | f8 = cast_float32_to_e4m3fn(values) 22 | back = cast_e4m3fn_to_float32(f8) 23 | f82 = cast_float32_to_e4m3fn(back) 24 | self.assertEqualArray(f8, f82) 25 | 26 | @unittest.skipIf( 27 | search_float32_into_fe4m3 is None, reason="onnx-array-api not recent enough" 28 | ) 29 | def test_cast_float32_to_e4m3fn_more(self): 30 | from onnx_extended.validation.cython.fp8 import cast_float32_to_e4m3fn 31 | 32 | vect_search_float32_into_fe4m3 = numpy.vectorize(search_float32_into_fe4m3) 33 | 34 | values = numpy.array([[10, 1, 4, 5, 6, 7]], dtype=numpy.float32) 35 | expected = vect_search_float32_into_fe4m3(values).astype(numpy.uint8) 36 | f8 = cast_float32_to_e4m3fn(values) 37 | self.assertEqualArray(expected, f8) 38 | 39 | x = numpy.random.randn(4, 4, 4).astype(numpy.float32) 40 | expected = vect_search_float32_into_fe4m3(x).astype(numpy.uint8) 41 | f8 = cast_float32_to_e4m3fn(x) 42 | self.assertEqualArray(expected, f8) 43 | 44 | def test_inf(self): 45 | from onnx_extended.validation.cython.fp8 import cast_float32_to_e4m3fn 46 | 47 | for x, e in [(numpy.float32(numpy.inf), 126), (numpy.float32(-numpy.inf), 254)]: 48 | f8 = cast_float32_to_e4m3fn(x) 49 | self.assertEqual(e, f8) 50 | 51 | def test_nan(self): 52 | from onnx_extended.validation.cython.fp8 import cast_float32_to_e4m3fn 53 | 54 | expected = 127 55 | values = [ 56 | ( 57 | None, 58 | int.from_bytes(struct.pack(" Any: 5 | """ 6 | Picks the first value that is not None.
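    For instance, given the arguments (None, 4, None), the returned value is 4.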
7 | """ 8 | for a in args: 9 | if a is not None: 10 | return a 11 | raise ValueError("All values are None.") 12 | -------------------------------------------------------------------------------- /onnx_extended/cpp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/c_op_allocation.cpp: -------------------------------------------------------------------------------- 1 | #include "common/c_op_allocation.h" 2 | #include 3 | 4 | #if (!(defined(PYTHON_MANYLINUX) && PYTHON_MANYLINUX)) 5 | #include 6 | #endif 7 | 8 | namespace onnx_c_ops { 9 | 10 | #if (defined(PYTHON_MANYLINUX) && PYTHON_MANYLINUX) 11 | 12 | void *AllocatorDefaultAlloc(std::size_t size) { return malloc(size); } 13 | 14 | void AllocatorDefaultFree(void *p) { free(p); } 15 | 16 | #else 17 | 18 | void *AllocatorDefaultAlloc(std::size_t size) { 19 | const std::size_t alignment = 64; 20 | void *p; 21 | #if _MSC_VER 22 | p = _aligned_malloc(size, alignment); 23 | if (p == nullptr) 24 | #if __cplusplus >= 202002L 25 | throw std::bad_alloc(); 26 | #else 27 | abort(); 28 | #endif 29 | #elif defined(_LIBCPP_SGX_CONFIG) 30 | p = memalign(alignment, size); 31 | if (p == nullptr) 32 | #if __cplusplus >= 202002L 33 | throw std::bad_alloc(); 34 | #else 35 | abort(); 36 | #endif 37 | #else 38 | int ret = posix_memalign(&p, alignment, size); 39 | if (ret != 0) 40 | #if __cplusplus >= 202002L 41 | throw std::bad_alloc(); 42 | #else 43 | abort(); 44 | #endif 45 | #endif 46 | return p; 47 | } 48 | 49 | void AllocatorDefaultFree(void *p) { 50 | #if _MSC_VER 51 | _aligned_free(p); 52 | #else 53 | free(p); 54 | #endif 55 | } 56 | 57 | #endif 58 | 59 | } // namespace onnx_c_ops 60 | -------------------------------------------------------------------------------- /onnx_extended/cpp/cpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/cpu/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/cuda/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/common/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/c_op_allocation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace onnx_c_ops { 6 | 
7 | void *AllocatorDefaultAlloc(std::size_t size); 8 | void AllocatorDefaultFree(void *p); 9 | 10 | } // namespace onnx_c_ops 11 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/c_op_common_parameters.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace onnx_c_ops { 8 | 9 | enum class POST_EVAL_TRANSFORM { 10 | NONE = 0, 11 | LOGISTIC = 1, 12 | SOFTMAX = 2, 13 | SOFTMAX_ZERO = 3, 14 | PROBIT = 4 15 | }; 16 | 17 | POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string &value); 18 | 19 | enum NODE_MODE : uint8_t { 20 | LEAF = 1, 21 | BRANCH_LEQ = 2, 22 | BRANCH_LT = 4, 23 | BRANCH_GTE = 6, 24 | BRANCH_GT = 8, 25 | BRANCH_EQ = 10, 26 | BRANCH_NEQ = 12 27 | }; 28 | 29 | NODE_MODE to_NODE_MODE(const std::string &value); 30 | 31 | const char *to_str(NODE_MODE mode); 32 | 33 | enum class AGGREGATE_FUNCTION { AVERAGE, SUM, MIN, MAX }; 34 | 35 | AGGREGATE_FUNCTION to_AGGREGATE_FUNCTION(const std::string &input); 36 | 37 | enum class SVM_TYPE { SVM_LINEAR = 1, SVM_SVC = 2 }; 38 | 39 | SVM_TYPE to_SVM_TYPE(const std::string &value); 40 | 41 | enum KERNEL { LINEAR, POLY, RBF, SIGMOID }; 42 | 43 | KERNEL to_KERNEL(const std::string &value); 44 | 45 | enum StorageOrder { 46 | UNKNOWN = 0, 47 | NHWC = 1, 48 | NCHW = 2, 49 | }; 50 | 51 | StorageOrder to_StorageOrder(const std::string &value); 52 | 53 | enum class AutoPadType { 54 | NOTSET = 0, 55 | VALID = 1, 56 | SAME_UPPER = 2, 57 | SAME_LOWER = 3, 58 | }; 59 | 60 | AutoPadType to_AutoPadType(const std::string &value); 61 | 62 | } // namespace onnx_c_ops 63 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/c_op_status.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace onnx_c_ops { 4 | 5 | class Status { 6 | public: 7 | int code; 8 | inline Status() : code(1) {} 9 | inline Status(int code) : code(code) {} 10 | inline Status &operator=(const Status &other) { 11 | code = other.code; 12 | return *this; 13 | } 14 | inline bool IsOK() const { return code == 1; } 15 | inline int Code() const { return code; } 16 | inline bool operator==(const Status &other) const { return code == other.code; } 17 | inline bool operator!=(const Status &other) const { return !(*this == other); } 18 | inline static Status OK() { return Status(1); } 19 | }; 20 | 21 | } // namespace onnx_c_ops 22 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/simple_span.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace std_ { 6 | 7 | template 8 | class span { 9 | public: 10 | span(T* data, std::size_t size) : data_(data), size_(size) {} 11 | 12 | inline T* data() const { return data_; } 13 | inline std::size_t size() const { return size_; } 14 | inline T& operator[](std::size_t index) const { return data_[index]; } 15 | inline T* begin() const { return data_; } 16 | inline T* end() const { return data_ + size_; } 17 | 18 | private: 19 | T* data_; 20 | std::size_t size_; 21 | }; 22 | 23 | } // namespace std_ 24 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/cpu/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/cpu/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/cuda/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/ortapi_c_api_header.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #if defined(_WIN32) 6 | 7 | // ... 8 | 9 | #elif defined(__MACOSX__) || defined(__APPLE__) 10 | 11 | // .. 12 | 13 | #else 14 | 15 | #define IS_EMPTY(x) IS_EMPTY_HELPER(x) 16 | #define IS_EMPTY_HELPER(x) IS_EMPTY_CHECK(x ## 1, 1) 17 | #define IS_EMPTY_CHECK(a, b, ...) b 18 | 19 | #if IS_EMPTY(ORT_EXPORT) 20 | #undef ORT_EXPORT 21 | #define ORT_EXPORT __attribute__ ((visibility("default"))) 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/ortapi_version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define ORT_API_MANUAL_INIT 4 | #include "onnxruntime_c_api.h" 5 | #include 6 | #undef ORT_API_MANUAL_INIT 7 | 8 | #define ORT_API_VERSION_SUPPORTED 16 9 | -------------------------------------------------------------------------------- /onnx_extended/helper/__init__.py: -------------------------------------------------------------------------------- 1 | from .make_dynamic_quantize_linear import ( 2 | make_dynamic_quantize_linear_function_proto, 3 | make_simple_dynamic_quantize_linear_function_proto, 4 | ) 5 | from .make_reshape_transpose import ( 6 | make_matmul_reshape_transpose_function_proto, 7 | make_matmul_reshape_transpose_back_function_proto, 8 | ) 9 | -------------------------------------------------------------------------------- /onnx_extended/ortcy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/ortcy/wrap/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/ortcy/wrap/ortapi_inline.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "onnx_extended_helpers.h" 4 | #include "ortapi_version.h" 5 | 6 | namespace ortapi { 7 | 8 | inline static const OrtApi *GetOrtApi() { 9 | const OrtApi *api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION_SUPPORTED); 10 | return api_; 11 | } 12 | 13 | inline const char *ort_version() { return OrtGetApiBase()->GetVersionString(); } 14 | 15 | inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, int line) { 16 | if (ort_status) { 17 | std::string message(GetOrtApi()->GetErrorMessage(ort_status)); 18 | OrtErrorCode code = GetOrtApi()->GetErrorCode(ort_status); 19 | throw std::runtime_error(onnx_extended_helpers::MakeString( 20 | "error: onnxruntime(", code, "), ", message, "\n ", filename, ":", line)); 21 | } 22 | } 23 | 24 | #define ThrowOnError(ort_status) _ThrowOnError_(ort_status, __FILE__, __LINE__) 25 | 
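// Typical usage, as a sketch: wrap any OrtApi call returning an OrtStatus*,
// e.g. ThrowOnError(GetOrtApi()->CreateRunOptions(&run_options)); the macro
// captures __FILE__ and __LINE__ so the std::runtime_error it raises points
// at the failing call site.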
26 | } // namespace ortapi 27 | -------------------------------------------------------------------------------- /onnx_extended/ortops/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from typing import Dict, List 4 | 5 | _ort_ext_libs_paths: Dict[str, List[str]] = {} 6 | 7 | 8 | def _get_ort_ext_libs(path: str) -> List[str]: 9 | """ 10 | Returns the list of libraries implementing new simple 11 | :epkg:`onnxruntime` kernels and placed in folder *path*. 12 | """ 13 | global _ort_ext_libs_paths 14 | if path not in _ort_ext_libs_paths: 15 | _ort_ext_libs_paths[path] = [] 16 | if not _ort_ext_libs_paths[path]: 17 | if platform.system() == "Windows": 18 | ext = ".dll" 19 | elif platform.system() == "Darwin": 20 | ext = ".dylib" 21 | else: 22 | ext = ".so" 23 | this = os.path.abspath(path) 24 | files = os.listdir(this) 25 | res = [] 26 | for name in files: 27 | e = os.path.splitext(name)[-1] 28 | if e == ext and "ortops" in name: 29 | res.append(os.path.join(this, name)) 30 | assert res, ( 31 | f"Unable to find any kernel library with ext={ext!r} " 32 | f"in {this!r} among {files}." 33 | ) 34 | _ort_ext_libs_paths[path] = res 35 | return _ort_ext_libs_paths[path] 36 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/ortops/optim/__init__.py -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc: -------------------------------------------------------------------------------- 1 | // Source: https://github.com/microsoft/onnxruntime/tree/main/ 2 | // onnxruntime/test/testdata/custom_op_get_const_input_test_library 3 | 4 | #include <mutex> 5 | #include <vector> 6 | 7 | #include "ort_optim_cpu_lib.h" 8 | #include "ort_sparse.hpp" 9 | #include "ort_svm.hpp" 10 | #include "ort_tfidf_vectorizer.hpp" 11 | #include "ort_tree_ensemble.hpp" 12 | #include "ortapi_version.h" 13 | 14 | static const char *c_OpDomain = "onnx_extended.ortops.optim.cpu"; 15 | 16 | static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { 17 | static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container; 18 | static std::mutex ort_custom_op_domain_mutex; 19 | std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex); 20 | ort_custom_op_domain_container.push_back(std::move(domain)); 21 | } 22 | 23 | OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, 24 | const OrtApiBase *api_base) { 25 | Ort::InitApi(api_base->GetApi(ORT_API_VERSION_SUPPORTED)); 26 | Ort::UnownedSessionOptions session_options(options); 27 | 28 | // Instances that remain available until onnxruntime unloads the library.
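// Each kernel instance below is added to the "onnx_extended.ortops.optim.cpu"
// domain declared above; when a session loads this library through
// RegisterCustomOps, graph nodes that use this domain resolve to these
// implementations.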
29 | static ortops::DenseToSparse c_DenseToSparse; 30 | static ortops::SparseToDense c_SparseToDense; 31 | static ortops::SVMClassifier c_SVMClassifier; 32 | static ortops::SVMRegressor c_SVMRegressor; 33 | static ortops::TreeEnsembleRegressor, float, float> 34 | c_TreeEnsembleRegressor; 35 | static ortops::TreeEnsembleClassifier, float, float> 36 | c_TreeEnsembleClassifier; 37 | static ortops::TreeEnsembleRegressor, float, float> 38 | c_TreeEnsembleRegressorSparse; 39 | static ortops::TreeEnsembleClassifier, float, float> 40 | c_TreeEnsembleClassifierSparse; 41 | static ortops::TfIdfVectorizer c_TfIdfVectorizer; 42 | 43 | try { 44 | Ort::CustomOpDomain domain{c_OpDomain}; 45 | 46 | domain.Add(&c_DenseToSparse); 47 | domain.Add(&c_SparseToDense); 48 | domain.Add(&c_SVMClassifier); 49 | domain.Add(&c_SVMRegressor); 50 | domain.Add(&c_TreeEnsembleClassifier); 51 | domain.Add(&c_TreeEnsembleClassifierSparse); 52 | domain.Add(&c_TreeEnsembleRegressor); 53 | domain.Add(&c_TreeEnsembleRegressorSparse); 54 | domain.Add(&c_TfIdfVectorizer); 55 | 56 | session_options.Add(domain); 57 | AddOrtCustomOpDomainToContainer(std::move(domain)); 58 | } catch (const std::exception &e) { 59 | Ort::Status status{e}; 60 | return status.release(); 61 | } 62 | 63 | return nullptr; 64 | } 65 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ortapi_c_api_header.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, 10 | const OrtApiBase *api_base); 11 | 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_sparse.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | 5 | namespace ortops { 6 | 7 | template struct DenseToSparseKernel { 8 | DenseToSparseKernel(const OrtApi &api, const OrtKernelInfo *info); 9 | void Compute(OrtKernelContext *context); 10 | }; 11 | 12 | template 13 | struct DenseToSparse : Ort::CustomOpBase, DenseToSparseKernel> { 14 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 15 | const char *GetName() const; 16 | const char *GetExecutionProviderType() const; 17 | std::size_t GetInputTypeCount() const; 18 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 19 | std::size_t GetOutputTypeCount() const; 20 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 21 | }; 22 | 23 | template struct SparseToDenseKernel { 24 | SparseToDenseKernel(const OrtApi &api, const OrtKernelInfo *info); 25 | void Compute(OrtKernelContext *context); 26 | }; 27 | 28 | template 29 | struct SparseToDense : Ort::CustomOpBase, SparseToDenseKernel> { 30 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 31 | const char *GetName() const; 32 | const char *GetExecutionProviderType() const; 33 | std::size_t GetInputTypeCount() const; 34 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 35 | std::size_t GetOutputTypeCount() const; 36 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 37 | }; 38 | 39 | } // namespace ortops 40 | -------------------------------------------------------------------------------- 
/onnx_extended/ortops/optim/cpu/ort_svm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cpu/c_op_svm_common_.hpp" 5 | 6 | namespace ortops { 7 | 8 | template struct SVMKernel { 9 | SVMKernel(const OrtApi &api, const OrtKernelInfo *info); 10 | void Compute(OrtKernelContext *context); 11 | 12 | // Attributes 13 | int64_t n_targets_or_classes; 14 | std::unique_ptr> svm_type; 15 | bool is_classifier; 16 | }; 17 | 18 | template struct SVMRegressor : Ort::CustomOpBase, SVMKernel> { 19 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 20 | const char *GetName() const; 21 | const char *GetExecutionProviderType() const; 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | std::size_t GetOutputTypeCount() const; 25 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 26 | }; 27 | 28 | template struct SVMClassifier : Ort::CustomOpBase, SVMKernel> { 29 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 30 | const char *GetName() const; 31 | const char *GetExecutionProviderType() const; 32 | std::size_t GetInputTypeCount() const; 33 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 34 | std::size_t GetOutputTypeCount() const; 35 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 36 | }; 37 | 38 | } // namespace ortops 39 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_tfidf_vectorizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cpu/c_op_tfidf_vectorizer_.hpp" 5 | // #include 6 | 7 | namespace ortops { 8 | 9 | template struct TfIdfVectorizerKernel { 10 | 11 | #if __cplusplus >= 202002L 12 | typedef std::span span_type_tout; 13 | typedef std::span span_type_int64; 14 | #else // std_::span is the pre-C++20 fallback defined in common/simple_span.h 15 | typedef std_::span span_type_tout; 16 | typedef std_::span span_type_int64; 17 | #endif 18 | TfIdfVectorizerKernel(const OrtApi &api, const OrtKernelInfo *info); 19 | void Compute(OrtKernelContext *context); 20 | 21 | std::unique_ptr> tfidf_typed; 22 | }; 23 | 24 | template 25 | struct TfIdfVectorizer 26 | : Ort::CustomOpBase, TfIdfVectorizerKernel> { 27 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 28 | const char *GetName() const; 29 | const char *GetExecutionProviderType() const; 30 | std::size_t GetInputTypeCount() const; 31 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 32 | std::size_t GetOutputTypeCount() const; 33 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 34 | }; 35 | 36 | } // namespace ortops 37 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_tree_ensemble.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cpu/c_op_tree_ensemble_common_.hpp" 5 | #include "cpu/c_op_tree_ensemble_common_classifier_.hpp" 6 | 7 | namespace ortops { 8 | 9 | template struct TreeEnsembleKernel { 10 | TreeEnsembleKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | 13 | // Attributes 14 | int64_t n_targets_or_classes; 15 | std::unique_ptr> 16 | reg_type_type_type; 17 | std::unique_ptr> 18 |
cls_type_type_type; 19 | bool is_classifier; 20 | }; 21 | 22 | template 23 | struct TreeEnsembleRegressor 24 | : Ort::CustomOpBase, 25 | TreeEnsembleKernel> { 26 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 27 | const char *GetName() const; 28 | const char *GetExecutionProviderType() const; 29 | std::size_t GetInputTypeCount() const; 30 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 31 | std::size_t GetOutputTypeCount() const; 32 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 33 | }; 34 | 35 | template 36 | struct TreeEnsembleClassifier : Ort::CustomOpBase, 37 | TreeEnsembleKernel> { 38 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 39 | const char *GetName() const; 40 | const char *GetExecutionProviderType() const; 41 | std::size_t GetInputTypeCount() const; 42 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 43 | std::size_t GetOutputTypeCount() const; 44 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 45 | }; 46 | 47 | } // namespace ortops 48 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/add_or_mul_shared_input.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddOrMulSharedInputKernel { 10 | AddOrMulSharedInputKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct AddOrMulSharedInputOp 16 | : Ort::CustomOpBase, AddOrMulSharedInputKernel> { 17 | typedef Ort::CustomOpBase, AddOrMulSharedInputKernel> 18 | parent_type; 19 | AddOrMulSharedInputOp() : parent_type() {} 20 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 21 | const char *GetName() const; 22 | const char *GetExecutionProviderType() const; 23 | 24 | std::size_t GetInputTypeCount() const; 25 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 26 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 27 | 28 | std::size_t GetOutputTypeCount() const; 29 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 30 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 31 | }; 32 | 33 | } // namespace ortops 34 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/addaddaddmulmulmul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddAddAddMulMulMulKernel { 10 | AddAddAddMulMulMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct AddAddAddMulMulMulOp : Ort::CustomOpBase, 16 | AddAddAddMulMulMulKernel> { 17 | typedef Ort::CustomOpBase, 18 | AddAddAddMulMulMulKernel> 19 | parent_type; 20 | AddAddAddMulMulMulOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | 
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/addaddmulmul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddAddMulMulKernel { 10 | AddAddMulMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct AddAddMulMulOp 16 | : Ort::CustomOpBase, AddAddMulMulKernel> { 17 | typedef Ort::CustomOpBase, AddAddMulMulKernel> 18 | parent_type; 19 | AddAddMulMulOp() : parent_type() {} 20 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 21 | const char *GetName() const; 22 | const char *GetExecutionProviderType() const; 23 | 24 | std::size_t GetInputTypeCount() const; 25 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 26 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 27 | 28 | std::size_t GetOutputTypeCount() const; 29 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 30 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 31 | }; 32 | 33 | } // namespace ortops 34 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/addmul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddMulKernel { 10 | AddMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | private: 13 | // If true, the operator assumes there are 4 dimensions and the two middle ones are switched. 
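// (An interpretation of the flag name: with inputs of shape (d0, d1, d2, d3),
// the fused Add/Mul result would then be laid out as (d0, d2, d1, d3); the
// .cu implementation is authoritative.)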
14 | bool switch_middle_axis_; 15 | }; 16 | 17 | template 18 | struct AddMulOp : Ort::CustomOpBase, AddMulKernel> { 19 | typedef Ort::CustomOpBase, AddMulKernel> parent_type; 20 | AddMulOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/mul_mul_sigmoid.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct MulMulSigmoidKernel { 10 | MulMulSigmoidKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct MulMulSigmoidOp : Ort::CustomOpBase, MulMulSigmoidKernel> { 16 | typedef Ort::CustomOpBase, MulMulSigmoidKernel> parent_type; 17 | MulMulSigmoidOp() : parent_type() {} 18 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 19 | const char *GetName() const; 20 | const char *GetExecutionProviderType() const; 21 | 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 25 | 26 | std::size_t GetOutputTypeCount() const; 27 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 28 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 29 | }; 30 | 31 | } // namespace ortops 32 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/mul_sigmoid.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct MulSigmoidKernel { 10 | MulSigmoidKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct MulSigmoidOp : Ort::CustomOpBase, MulSigmoidKernel> { 16 | typedef Ort::CustomOpBase, MulSigmoidKernel> parent_type; 17 | MulSigmoidOp() : parent_type() {} 18 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 19 | const char *GetName() const; 20 | const char *GetExecutionProviderType() const; 21 | 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 25 | 26 | std::size_t GetOutputTypeCount() const; 27 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 28 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 29 | }; 30 | 31 | } // namespace ortops 32 | 
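The CUDA headers in this folder all follow the same CustomOpBase pattern, and the compiled library is registered with onnxruntime the same way as the CPU one. A minimal usage sketch in Python; the get_ort_ext_libs helper and the "onnx_extended.ortops.optim.cuda" domain name are assumptions inferred from the CPU package above, and NegXplus1 is assumed from its name to compute 1 - X:

import numpy as np
from onnx import TensorProto
from onnx.helper import (
    make_graph,
    make_model,
    make_node,
    make_opsetid,
    make_tensor_value_info,
)
import onnxruntime

# Assumption: the CUDA ops package exposes its compiled libraries
# like the CPU package shown earlier.
from onnx_extended.ortops.optim.cuda import get_ort_ext_libs

X = make_tensor_value_info("X", TensorProto.FLOAT, [None])
Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None])
model = make_model(
    make_graph(
        # Assumption: NegXplus1 computes 1 - X, judging from the header name.
        [make_node("NegXplus1", ["X"], ["Y"],
                   domain="onnx_extended.ortops.optim.cuda")],
        "example",
        [X],
        [Y],
    ),
    opset_imports=[
        make_opsetid("", 18),
        make_opsetid("onnx_extended.ortops.optim.cuda", 1),
    ],
)

so = onnxruntime.SessionOptions()
so.register_custom_ops_library(get_ort_ext_libs()[0])  # calls RegisterCustomOps
sess = onnxruntime.InferenceSession(
    model.SerializeToString(), so, providers=["CUDAExecutionProvider"]
)
print(sess.run(None, {"X": np.array([0.5, 2.0], dtype=np.float32)})[0])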
-------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/negxplus1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct NegXplus1Kernel { 10 | NegXplus1Kernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct NegXplus1Op : Ort::CustomOpBase, NegXplus1Kernel> { 16 | typedef Ort::CustomOpBase, NegXplus1Kernel> parent_type; 17 | NegXplus1Op() : parent_type() {} 18 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 19 | const char *GetName() const; 20 | const char *GetExecutionProviderType() const; 21 | 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 25 | 26 | std::size_t GetOutputTypeCount() const; 27 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 28 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 29 | }; 30 | 31 | } // namespace ortops 32 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/ort_optim_cuda_lib.h: -------------------------------------------------------------------------------- 1 | // Source: https://github.com/microsoft/onnxruntime/tree/main/ 2 | // onnxruntime/test/testdata/custom_op_get_const_input_test_library 3 | #pragma once 4 | 5 | #include "ortapi_c_api_header.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, 12 | const OrtApiBase *api_base); 13 | 14 | #ifdef __cplusplus 15 | } 16 | #endif 17 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/replace_zero.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct ReplaceZeroKernel { 10 | ReplaceZeroKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | 13 | private: 14 | float by_; 15 | }; 16 | 17 | template 18 | struct ReplaceZeroOp : Ort::CustomOpBase, ReplaceZeroKernel> { 19 | typedef Ort::CustomOpBase, ReplaceZeroKernel> parent_type; 20 | ReplaceZeroOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/rotary.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 
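// Rotary position-embedding kernel; RotarySide below presumably selects which
// half of the last dimension gets negated when the halves are swapped (an
// inference from the enum names; the .cu implementation is authoritative).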
#include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | enum class RotarySide : int { 10 | LEFT = 1, 11 | RIGHT = 2, 12 | }; 13 | 14 | template struct RotaryKernel { 15 | RotaryKernel(const OrtApi &api, const OrtKernelInfo *info); 16 | void Compute(OrtKernelContext *context); 17 | 18 | private: 19 | RotarySide rotary_side_; 20 | }; 21 | 22 | template struct RotaryOp : Ort::CustomOpBase, RotaryKernel> { 23 | typedef Ort::CustomOpBase, RotaryKernel> parent_type; 24 | RotaryOp() : parent_type() {} 25 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 26 | const char *GetName() const; 27 | const char *GetExecutionProviderType() const; 28 | 29 | std::size_t GetInputTypeCount() const; 30 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 32 | OrtMemType GetInputMemoryType(std::size_t index) const; 33 | 34 | std::size_t GetOutputTypeCount() const; 35 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 36 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 37 | }; 38 | 39 | } // namespace ortops 40 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/scatter_nd_of_shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include "scatter_nd_of_shape_common.h" 6 | #include 7 | 8 | namespace ortops { 9 | 10 | template struct ScatterNDOfShapeKernel { 11 | ScatterNDOfShapeKernel(const OrtApi &api, const OrtKernelInfo *info); 12 | void Compute(OrtKernelContext *context); 13 | 14 | private: 15 | void ComputeNone(cudaStream_t &stream, const std::vector &input_shape, 16 | const std::vector &indices_shape, T *output_data, 17 | const int64_t *indices_data, const T *updates_data) const; 18 | void ComputeOptimize(cudaStream_t &stream, const std::vector &input_shape, 19 | const std::vector &indices_shape, T *output_data, 20 | const int64_t *indices_data, const T *updates_data) const; 21 | 22 | Reduction reduction_; 23 | Strategy strategy_; 24 | int maxThreadPerBlock_; 25 | }; 26 | 27 | template 28 | struct ScatterNDOfShapeOp 29 | : Ort::CustomOpBase, ScatterNDOfShapeKernel> { 30 | typedef Ort::CustomOpBase, ScatterNDOfShapeKernel> parent_type; 31 | ScatterNDOfShapeOp() : parent_type() {} 32 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 33 | const char *GetName() const; 34 | const char *GetExecutionProviderType() const; 35 | 36 | std::size_t GetInputTypeCount() const; 37 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 38 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 39 | OrtMemType GetInputMemoryType(std::size_t index) const; 40 | 41 | std::size_t GetOutputTypeCount() const; 42 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 43 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 44 | }; 45 | 46 | } // namespace ortops 47 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace ortops { 4 | 5 | enum class Reduction : int { 6 | None = 0, 7 | Add = 1, 8 
| Mul = 2, 9 | Min = 3, 10 | Max = 4, 11 | }; 12 | 13 | enum class Strategy : int { 14 | None = 0, 15 | Optimize = 1, 16 | }; 17 | 18 | struct Shape2 { 19 | int64_t dims[12]; 20 | }; 21 | 22 | } // namespace ortops 23 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_masked.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include "scatter_nd_of_shape_common.h" 6 | #include 7 | 8 | namespace ortops { 9 | 10 | template struct MaskedScatterNDOfShapeKernel { 11 | MaskedScatterNDOfShapeKernel(const OrtApi &api, const OrtKernelInfo *info); 12 | void Compute(OrtKernelContext *context); 13 | 14 | private: 15 | void ComputeOptimize(cudaStream_t &stream, const std::vector &input_shape, 16 | const std::vector &indices_shape, T *output_data, 17 | const int64_t *indices_data, const T *updates_data) const; 18 | 19 | Reduction reduction_; 20 | int maxThreadPerBlock_; 21 | int64_t masked_value_; 22 | }; 23 | 24 | template 25 | struct MaskedScatterNDOfShapeOp 26 | : Ort::CustomOpBase, MaskedScatterNDOfShapeKernel> { 27 | typedef Ort::CustomOpBase, MaskedScatterNDOfShapeKernel> 28 | parent_type; 29 | MaskedScatterNDOfShapeOp() : parent_type() {} 30 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 31 | const char *GetName() const; 32 | const char *GetExecutionProviderType() const; 33 | 34 | std::size_t GetInputTypeCount() const; 35 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 36 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 37 | OrtMemType GetInputMemoryType(std::size_t index) const; 38 | 39 | std::size_t GetOutputTypeCount() const; 40 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 41 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 42 | }; 43 | 44 | } // namespace ortops 45 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/submul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct SubMulKernel { 10 | SubMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | 13 | private: 14 | bool negative_; 15 | }; 16 | 17 | template 18 | struct SubMulOp : Ort::CustomOpBase, SubMulKernel> { 19 | typedef Ort::CustomOpBase, SubMulKernel> parent_type; 20 | SubMulOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/transpose_cast_2d.h: 
--------------------------------------------------------------------------------
/onnx_extended/ortops/optim/cuda/submul.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

template <typename T> struct SubMulKernel {
  SubMulKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);

private:
  bool negative_;
};

template <typename T>
struct SubMulOp : Ort::CustomOpBase<SubMulOp<T>, SubMulKernel<T>> {
  typedef Ort::CustomOpBase<SubMulOp<T>, SubMulKernel<T>> parent_type;
  SubMulOp() : parent_type() {}
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/optim/cuda/transpose_cast_2d.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

struct Transpose2DCastKernel {
  Transpose2DCastKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

struct Transpose2DCastOp
    : Ort::CustomOpBase<Transpose2DCastOp, Transpose2DCastKernel> {
  typedef Ort::CustomOpBase<Transpose2DCastOp, Transpose2DCastKernel> parent_type;
  Transpose2DCastOp(ONNXTensorElementDataType input_type, ONNXTensorElementDataType output_type)
      : parent_type() {
    input_type_ = input_type;
    output_type_ = output_type;
  }
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;

private:
  ONNXTensorElementDataType input_type_;
  ONNXTensorElementDataType output_type_;
};

} // namespace ortops
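`Transpose2DCast` fuses a 2D transpose with a type cast so the intermediate tensor is
never materialized. In numpy terms, for a float32-to-float16 instantiation:

    import numpy as np

    x = np.random.rand(4, 8).astype(np.float32)
    y = x.T.astype(np.float16)  # one fused pass instead of Transpose followed by Cast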
--------------------------------------------------------------------------------
/onnx_extended/ortops/optim/cuda/tri_matrix.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

template <typename T> struct TriMatrixKernel {
  TriMatrixKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

template <typename T>
struct TriMatrixOp : Ort::CustomOpBase<TriMatrixOp<T>, TriMatrixKernel<T>> {
  typedef Ort::CustomOpBase<TriMatrixOp<T>, TriMatrixKernel<T>> parent_type;
  TriMatrixOp() : parent_type() {}
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;
  OrtMemType GetInputMemoryType(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/custom_tree_assembly.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct CustomTreeAssemblyKernel {
  CustomTreeAssemblyKernel(const OrtApi &api, const OrtKernelInfo *info, bool classifier);
  void Compute(OrtKernelContext *context);
  ~CustomTreeAssemblyKernel();

  bool classifier_;
  std::string assembly_name_;
  /* TreebeardSORunner */ void *assembly_runner_;
};

struct CustomTreeAssemblyOp
    : Ort::CustomOpBase<CustomTreeAssemblyOp, CustomTreeAssemblyKernel> {
  typedef Ort::CustomOpBase<CustomTreeAssemblyOp, CustomTreeAssemblyKernel> parent_type;
  CustomTreeAssemblyOp(bool classifier) : parent_type(), classifier_(classifier) {}
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;

private:
  bool classifier_;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/dynamic_quantize_linear.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct DynamicQuantizeLinearKernel {
  DynamicQuantizeLinearKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);

private:
  template <typename T>
  void ComputeInternal(int64_t n_elements, const T *input, uint8_t *output, float &scale,
                       uint8_t &zero_point);

  int64_t to_;
};

struct DynamicQuantizeLinearOp
    : Ort::CustomOpBase<DynamicQuantizeLinearOp, DynamicQuantizeLinearKernel> {
  typedef Ort::CustomOpBase<DynamicQuantizeLinearOp, DynamicQuantizeLinearKernel> parent_type;
  DynamicQuantizeLinearOp(ONNXTensorElementDataType input_type,
                          ONNXTensorElementDataType quant_type)
      : parent_type(), input_type_(input_type), quant_type_(quant_type) {}

  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const noexcept;
  const char *GetName() const noexcept;
  const char *GetExecutionProviderType() const noexcept;

  std::size_t GetInputTypeCount() const noexcept;
  ONNXTensorElementDataType GetInputType(std::size_t index) const noexcept;

  std::size_t GetOutputTypeCount() const noexcept;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;

private:
  ONNXTensorElementDataType input_type_;
  ONNXTensorElementDataType quant_type_;
};

} // namespace ortops
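`DynamicQuantizeLinear` derives the scale and zero point from the input range at run time;
the `to_` attribute extends the choice of quantized type to float 8. A sketch of the
standard uint8 formula the kernel generalizes, following the ONNX operator definition:

    import numpy as np

    def dynamic_quantize_linear_uint8(x):
        # The range must include zero so that zero is exactly representable.
        rmin, rmax = min(float(x.min()), 0.0), max(float(x.max()), 0.0)
        scale = (rmax - rmin) / 255.0 or 1.0
        zero_point = np.uint8(np.clip(round(-rmin / scale), 0, 255))
        y = np.clip(np.rint(x / scale) + zero_point, 0, 255).astype(np.uint8)
        return y, np.float32(scale), zero_point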
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/my_kernel.cc:
--------------------------------------------------------------------------------
#include "my_kernel.h"

namespace ortops {

MyCustomKernel::MyCustomKernel(const OrtApi & /* api */, const OrtKernelInfo * /* info */) {}

void MyCustomKernel::Compute(OrtKernelContext *context) {
  Ort::KernelContext ctx(context);
  Ort::ConstValue input_X = ctx.GetInput(0);
  Ort::ConstValue input_Y = ctx.GetInput(1);
  const float *X = input_X.GetTensorData<float>();
  const float *Y = input_Y.GetTensorData<float>();

  // Setup output, which is assumed to have the same dimensions as the inputs.
  std::vector<int64_t> dimensions = input_X.GetTensorTypeAndShapeInfo().GetShape();

  Ort::UnownedValue output = ctx.GetOutput(0, dimensions);
  float *out = output.GetTensorMutableData<float>();

  const std::size_t size = output.GetTensorTypeAndShapeInfo().GetElementCount();

  // Do computation
  for (std::size_t i = 0; i < size; i++) {
    out[i] = X[i] + Y[i];
  }
}

void *MyCustomOp::CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const {
  return std::make_unique<MyCustomKernel>(api, info).release();
}

const char *MyCustomOp::GetName() const { return "MyCustomOp"; }

const char *MyCustomOp::GetExecutionProviderType() const { return "CPUExecutionProvider"; }

size_t MyCustomOp::GetInputTypeCount() const { return 2; }

ONNXTensorElementDataType MyCustomOp::GetInputType(std::size_t /* index */) const {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}

size_t MyCustomOp::GetOutputTypeCount() const { return 1; }

ONNXTensorElementDataType MyCustomOp::GetOutputType(std::size_t /* index */) const {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/my_kernel.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct MyCustomKernel {
  MyCustomKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

struct MyCustomOp : Ort::CustomOpBase<MyCustomOp, MyCustomKernel> {
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;
  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct MyCustomKernelWithAttributes {
  MyCustomKernelWithAttributes(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);

private:
  std::string att_string;
  float att_float;
  int64_t att_int64;
  std::vector<double> att_tensor_double;
};

struct MyCustomOpWithAttributes
    : Ort::CustomOpBase<MyCustomOpWithAttributes, MyCustomKernelWithAttributes> {
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;
  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
};

} // namespace ortops
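`MyCustomOp` above is the smallest possible kernel (an element-wise float addition),
which makes it a convenient smoke test for the whole registration chain. A sketch of
driving it from Python; the model-building details are illustrative:

    import numpy as np
    from onnx import TensorProto, helper as oh
    import onnxruntime
    from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs

    node = oh.make_node("MyCustomOp", ["X", "Y"], ["Z"],
                        domain="onnx_extended.ortops.tutorial.cpu")
    graph = oh.make_graph(
        [node], "g",
        [oh.make_tensor_value_info(n, TensorProto.FLOAT, [None]) for n in ("X", "Y")],
        [oh.make_tensor_value_info("Z", TensorProto.FLOAT, [None])])
    model = oh.make_model(graph, opset_imports=[
        oh.make_opsetid("", 18),
        oh.make_opsetid("onnx_extended.ortops.tutorial.cpu", 1)])

    opts = onnxruntime.SessionOptions()
    opts.register_custom_ops_library(get_ort_ext_libs()[0])
    sess = onnxruntime.InferenceSession(
        model.SerializeToString(), opts, providers=["CPUExecutionProvider"])
    x = np.arange(4, dtype=np.float32)
    print(sess.run(None, {"X": x, "Y": x})[0])  # x + x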
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library

#include <mutex>
#include <vector>

#include "custom_gemm.h"
#include "custom_tree_assembly.h"
#include "dynamic_quantize_linear.h"
#include "my_kernel.h"
#include "my_kernel_attr.h"
#include "ort_tutorial_cpu_lib.h"
#include "ortapi_version.h"

static const char *c_OpDomain = "onnx_extended.ortops.tutorial.cpu";

static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) {
  static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
  static std::mutex ort_custom_op_domain_mutex;
  std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
  ort_custom_op_domain_container.push_back(std::move(domain));
}

OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                          const OrtApiBase *api_base) {
  Ort::InitApi(api_base->GetApi(ORT_API_VERSION_SUPPORTED));
  Ort::UnownedSessionOptions session_options(options);

  // These instances remain available until onnxruntime unloads the library.
  static ortops::MyCustomOp c_CustomOp;
  static ortops::MyCustomOpWithAttributes c_CustomOpAttr;
  static ortops::CustomGemmOp c_CustomGemmFloat(
      "CustomGemmFloat", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
  static ortops::CustomGemmOp c_CustomGemmFloat16(
      "CustomGemmFloat16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, false);
  static ortops::CustomTreeAssemblyOp c_CustomTreeAssembly(false);

#if ORT_API_VERSION_SUPPORTED >= 16
  static ortops::DynamicQuantizeLinearOp c_dql(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
                                               ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN);

  static ortops::CustomGemmOp c_CustomGemmFloat8E4M3FN(
      "CustomGemmFloat8E4M3FN", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
#endif

  try {
    Ort::CustomOpDomain domain{c_OpDomain};

    domain.Add(&c_CustomOp);
    domain.Add(&c_CustomOpAttr);
    domain.Add(&c_CustomGemmFloat);
    domain.Add(&c_CustomGemmFloat16);
    domain.Add(&c_CustomTreeAssembly);
#if ORT_API_VERSION_SUPPORTED >= 16
    domain.Add(&c_dql);
    domain.Add(&c_CustomGemmFloat8E4M3FN);
#endif

    session_options.Add(domain);
    AddOrtCustomOpDomainToContainer(std::move(domain));
  } catch (const std::exception &e) {
    Ort::Status status{e};
    return status.release();
  }

  return nullptr;
}
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library
#pragma once

#include "ortapi_c_api_header.h"

#ifdef __cplusplus
extern "C" {
#endif

ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                                     const OrtApiBase *api_base);

#ifdef __cplusplus
}
#endif
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/__init__.py:
--------------------------------------------------------------------------------
import os
import textwrap
from typing import List
from ... import _get_ort_ext_libs


def get_ort_ext_libs() -> List[str]:
    """
    Returns the list of libraries implementing simple new
    :epkg:`onnxruntime` kernels for the
    :epkg:`CUDAExecutionProvider`.
    """
    libs = _get_ort_ext_libs(os.path.dirname(__file__))
    return [lib for lib in libs if "cuda_cuda" not in lib]


def documentation() -> List[str]:
    """
    Returns a list of rst strings documenting every kernel
    implemented in this subfolder.
    """
    return list(
        map(
            textwrap.dedent,
            [
                """
                onnx_extended.ortops.tutorial.cuda.CustomGemm
                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

                It calls the CUDA library for Gemm :math:`\\alpha A B + \\beta C`.

                **Provider**

                CUDAExecutionProvider

                **Inputs**

                * A (T): tensor of type T
                * B (T): tensor of type T
                * C (T): tensor of type T
                * D (T): tensor of type T
                * E (T): tensor of type T

                **Outputs**

                * Z (T): :math:`\\alpha A B + \\beta C`

                **Constraints**

                * T: float, float16, bfloat16
                """
            ],
        )
    )
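The `documentation` helper is how the package documentation pulls kernel descriptions out
of each binary's companion module; printing it is a quick way to check what a build
exposes:

    from onnx_extended.ortops.tutorial.cuda import documentation, get_ort_ext_libs

    print("\n".join(documentation()))  # rst description of CustomGemm
    print(get_ort_ext_libs())          # compiled libraries found next to this module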
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/matx_matmul.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

struct MatXMatMulKernel {
  MatXMatMulKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

struct MatXMatMulOp : Ort::CustomOpBase<MatXMatMulOp, MatXMatMulKernel> {
  typedef Ort::CustomOpBase<MatXMatMulOp, MatXMatMulKernel> parent_type;
  MatXMatMulOp(const char *op_name, ONNXTensorElementDataType dtype) : parent_type() {
    op_name_ = op_name;
    dtype_ = dtype;
  }
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;

private:
  const char *op_name_;
  ONNXTensorElementDataType dtype_;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library

#include <mutex>
#include <vector>

#include "custom_gemm.h"
#include "matx_matmul.h"
#include "ort_tutorial_cuda_lib.h"
#include "ortapi_version.h"

static const char *c_OpDomain = "onnx_extended.ortops.tutorial.cuda";

static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) {
  static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
  static std::mutex ort_custom_op_domain_mutex;
  std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
  ort_custom_op_domain_container.push_back(std::move(domain));
}

OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                          const OrtApiBase *api_base) {
  Ort::InitApi(api_base->GetApi(ORT_API_VERSION_SUPPORTED));
  Ort::UnownedSessionOptions session_options(options);

  // These instances remain available until onnxruntime unloads the library.
  static ortops::CustomGemmOp c_CustomGemmFloat(
      "CustomGemmFloat", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
  static ortops::CustomGemmOp c_CustomGemmFloat16(
      "CustomGemmFloat16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, false);

  static ortops::MatXMatMulOp c_MaxMatMulFloat("MaXMatMulFloat",
                                               ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);

#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080
  static ortops::CustomGemmOp c_CustomGemmFloat8E4M3FN(
      "CustomGemmFloat8E4M3FN", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
  static ortops::CustomGemmOp c_CustomGemmFloat8E4M3FNTime(
      "CustomGemmFloat8E4M3FNTime", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
#endif

  try {
    Ort::CustomOpDomain domain{c_OpDomain};

    domain.Add(&c_CustomGemmFloat);
    domain.Add(&c_CustomGemmFloat16);
    domain.Add(&c_MaxMatMulFloat);
#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080
    domain.Add(&c_CustomGemmFloat8E4M3FN);
    domain.Add(&c_CustomGemmFloat8E4M3FNTime);
#endif

    session_options.Add(domain);
    AddOrtCustomOpDomainToContainer(std::move(domain));
  } catch (const std::exception &e) {
    Ort::Status status{e};
    return status.release();
  }

  return nullptr;
}
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library
#pragma once

#include "ortapi_c_api_header.h"

#ifdef __cplusplus
extern "C" {
#endif

ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                                     const OrtApiBase *api_base);

#ifdef __cplusplus
}
#endif
--------------------------------------------------------------------------------
/onnx_extended/plotting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/plotting/__init__.py
--------------------------------------------------------------------------------
/onnx_extended/reference/__init__.py:
--------------------------------------------------------------------------------
from typing import Union
import numpy as np
from onnx import SparseTensorProto, TensorProto
from onnx.reference.op_run import to_array_extended as onnx_to_array_extended
from .c_reference_evaluator import CReferenceEvaluator, from_array_extended


def to_array_extended(
    tensor: Union[SparseTensorProto, TensorProto],
) -> Union[np.ndarray, "scipy.sparse.coo_matrix"]:  # noqa: F821
    """
    Overwrites function `onnx.reference.op_run.to_array_extended`
    to support sparse tensors.
    """
    if isinstance(tensor, TensorProto):
        return onnx_to_array_extended(tensor)
    if isinstance(tensor, SparseTensorProto):
        import scipy.sparse as sp

        shape = tuple(d for d in tensor.dims)
        indices = onnx_to_array_extended(tensor.indices)
        values = onnx_to_array_extended(tensor.values)
        if len(indices.shape) == 1:
            t = sp.csr_matrix(
                (values, indices, np.array([0, len(indices)], dtype=np.int64)),
                shape=(1, np.prod(shape)),
            )
            return t.reshape(shape)
        if len(indices.shape) == 2:
            t = sp.coo_matrix((values, (indices[:, 0], indices[:, 1])), shape=shape)
            return t
        raise RuntimeError(f"Unexpected indices shape: {indices.shape}.")
    raise TypeError(f"Unexpected type {type(tensor)}.")
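A quick round-trip for the sparse branch of `to_array_extended`; the 1-D indices form
stores linear positions into the flattened tensor (a sketch):

    import numpy as np
    from onnx import TensorProto, helper
    from onnx_extended.reference import to_array_extended

    values = helper.make_tensor("values", TensorProto.FLOAT, [3], [1.0, 2.0, 3.0])
    indices = helper.make_tensor("indices", TensorProto.INT64, [3], [1, 4, 8])
    sparse = helper.make_sparse_tensor(values, indices, [3, 3])
    mat = to_array_extended(sparse)  # scipy sparse matrix with shape (3, 3)
    print(mat.todense())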
16 | """ 17 | if classlabels_int64s_string is not None and len(classlabels_int64s_string) > 0: 18 | new_label = [] 19 | no_array = False 20 | for i in label: 21 | if i >= len(classlabels_int64s_string): 22 | new_label.append(None) 23 | no_array = True 24 | else: 25 | new_label.append(classlabels_int64s_string[i]) 26 | if no_array: 27 | return new_label, scores 28 | return numpy.array(new_label), scores 29 | return label, scores 30 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/c_op_conv.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import numpy as np 4 | 5 | from onnx import NodeProto 6 | from onnx.reference.op_run import OpRun 7 | from .cpu.c_op_conv_ import ConvDouble, ConvFloat 8 | 9 | 10 | class Conv(OpRun): 11 | def __init__( 12 | self, onnx_node: NodeProto, run_params: Dict[str, Any], schema: Any = None 13 | ): 14 | OpRun.__init__(self, onnx_node, run_params, schema) 15 | self.cache_: Dict[type, Any] = {} 16 | 17 | def _run( 18 | self, 19 | X, 20 | W, 21 | B=None, 22 | auto_pad=None, 23 | dilations=None, 24 | group=None, 25 | kernel_shape=None, 26 | pads=None, 27 | strides=None, 28 | ): 29 | if X.dtype not in self.cache_: 30 | if X.dtype == np.float32: 31 | rt = ConvFloat() 32 | elif X.dtype == np.float64: 33 | rt = ConvDouble() 34 | else: 35 | raise TypeError( 36 | f"No C implementation C for operator 'Conv' and dtype={X.dtype}." 37 | ) 38 | self.cache_[X.dtype] = rt 39 | 40 | rt.init( 41 | auto_pad, 42 | np.array(dilations or [], dtype=np.int64), 43 | group, 44 | np.array(kernel_shape or [], dtype=np.int64), 45 | np.array(pads or [], dtype=np.int64), 46 | np.array(strides or [], dtype=np.int64), 47 | ) 48 | 49 | rt = self.cache_[X.dtype] 50 | 51 | assert X is not None, f"X cannot be None for operator {type(self)}." 52 | assert ( 53 | min(X.shape) != 0 54 | ), f"Unable to run operator Conv on an empty matrix. X.shape={X.shape!r}." 55 | assert ( 56 | B is None or min(B.shape) != 0 57 | ), f"Unable to run operator Conv on an empty matrix. B.shape={B.shape!r}." 58 | cv = rt.compute(X, W, B) 59 | return (cv,) 60 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/c_op_svm_regressor.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | import numpy as np 3 | from onnx import NodeProto 4 | from onnx.reference.op_run import OpRun 5 | from .cpu.c_op_svm_py_ import ( 6 | RuntimeSVMRegressorFloat, 7 | RuntimeSVMRegressorDouble, 8 | ) 9 | 10 | 11 | class SVMRegressor(OpRun): 12 | op_domain = "ai.onnx.ml" 13 | 14 | def __init__( 15 | self, onnx_node: NodeProto, run_params: Dict[str, Any], schema: Any = None 16 | ): 17 | OpRun.__init__(self, onnx_node, run_params, schema=schema) 18 | self.rt_ = None 19 | 20 | def _run( 21 | self, 22 | x, 23 | coefficients=None, 24 | kernel_params=None, 25 | kernel_type=None, 26 | n_supports=None, 27 | one_class=None, 28 | post_transform=None, 29 | rho=None, 30 | support_vectors=None, 31 | ): 32 | """ 33 | This is a C++ implementation coming from 34 | :epkg:`onnxruntime`. 35 | `svm_regressor.cc 36 | `_. 37 | See class :class:`RuntimeSVMRegressor 38 | `. 
39 | """ 40 | if self.rt_ is None: 41 | if x.dtype == np.float32: 42 | self.rt_ = RuntimeSVMRegressorFloat() 43 | elif x.dtype == np.float64: 44 | self.rt_ = RuntimeSVMRegressorDouble() 45 | else: 46 | raise NotImplementedError(f"Not implemented for dtype={x.dtype}.") 47 | self.rt_.init( 48 | coefficients, 49 | kernel_params, 50 | kernel_type, 51 | n_supports, 52 | one_class, 53 | post_transform, 54 | rho, 55 | support_vectors, 56 | ) 57 | pred = self.rt_.compute(x) 58 | if pred.shape[0] != x.shape[0]: 59 | pred = pred.reshape(x.shape[0], pred.shape[0] // x.shape[0]) 60 | if len(pred.shape) == 1: 61 | pred = pred.reshape((-1, 1)) 62 | return (pred,) 63 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/cpu/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/cpu/c_op_conv_.cpp: -------------------------------------------------------------------------------- 1 | #include "c_op_conv_pybind11.h" 2 | 3 | using namespace onnx_c_ops; 4 | 5 | PYBIND11_MODULE(c_op_conv_, m) { 6 | m.doc() = 7 | #if defined(__APPLE__) 8 | "C++ Reference Implementation for operator Conv." 9 | #else 10 | R"pbdoc(C++ Reference Implementation for operator Conv.)pbdoc" 11 | #endif 12 | ; 13 | 14 | py::class_ clf( 15 | m, "ConvFloat", 16 | R"pbdoc(Implements float runtime for operator Conv. The code is inspired from 17 | `conv.cc `_ 18 | in :epkg:`onnxruntime`. Supports float only.)pbdoc"); 19 | 20 | clf.def(py::init<>()); 21 | clf.def("init", &ConvFloat::init, "Initializes the runtime with the ONNX attributes."); 22 | clf.def("compute", &ConvFloat::compute, "Computes the output for operator Conv."); 23 | 24 | py::class_ cld( 25 | m, "ConvDouble", 26 | R"pbdoc(Implements float runtime for operator Conv. The code is inspired from 27 | `conv.cc `_ 28 | in :epkg:`onnxruntime`. 
Supports double only.)pbdoc"); 29 | 30 | cld.def(py::init<>()); 31 | cld.def("init", &ConvDouble::init, "Initializes the runtime with the ONNX attributes."); 32 | cld.def("compute", &ConvDouble::compute, "Computes the output for operator Conv."); 33 | } 34 | -------------------------------------------------------------------------------- /onnx_extended/reference/other_ops/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/reference/other_ops/op_scatternd_of_shape.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from onnx.reference.op_run import OpRun 3 | from onnx.reference.ops.op_scatternd import _scatter_nd_impl 4 | 5 | 6 | class ScatterNDOfShape(OpRun): 7 | op_domain = "onnx_extended.ortops.optim.cuda" 8 | 9 | def _run(self, shape, indices, updates, reduction=None, strategy=None): 10 | data = np.zeros(shape, dtype=updates.dtype) 11 | y = _scatter_nd_impl(data, indices, updates, reduction=reduction) 12 | return (y,) 13 | -------------------------------------------------------------------------------- /onnx_extended/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .onnx_io import save_model, load_external, load_model 2 | from .onnx_nodes import enumerate_model_tensors 3 | -------------------------------------------------------------------------------- /onnx_extended/tools/einsum/__init__.py: -------------------------------------------------------------------------------- 1 | from .einsum_bench import einsum_benchmark 2 | from .einsum_fct import einsum, optimize_decompose_einsum_equation 3 | from .einsum_impl import decompose_einsum_equation, apply_einsum_sequence 4 | from .einsum_impl_classes import EinsumSubOp, GraphEinsumSubOp 5 | from .einsum_impl_ext import ( 6 | numpy_extended_dot, 7 | numpy_extended_dot_python, 8 | numpy_extended_dot_matrix, 9 | numpy_diagonal, 10 | ) 11 | -------------------------------------------------------------------------------- /onnx_extended/tools/einsum/einsum_config.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | import numpy 3 | import onnx 4 | 5 | DEFAULT_OPSET = min(18, onnx.defs.onnx_opset_version()) 6 | DEFAULT_IR_VERSION = 8 7 | 8 | 9 | def guess_proto_dtype(dtype: Any) -> int: 10 | """ 11 | Returns the corresponding proto type for a numpy dtype. 
12 | """ 13 | if dtype == numpy.float32: 14 | return onnx.TensorProto.FLOAT 15 | if dtype == numpy.float64: 16 | return onnx.TensorProto.DOUBLE 17 | if dtype == numpy.int32: 18 | return onnx.TensorProto.INT32 19 | if dtype == numpy.int64: 20 | return onnx.TensorProto.INT64 21 | raise ValueError(f"Unexpected value for dtype {dtype!r}.") 22 | -------------------------------------------------------------------------------- /onnx_extended/tools/graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .errors import QuantizationError 2 | from .onnx_graph_struct import Graph 3 | from .onnx_graph_transformer import cast_constant, quantize_float8, QuantizeOptions 4 | -------------------------------------------------------------------------------- /onnx_extended/tools/graph/errors.py: -------------------------------------------------------------------------------- 1 | class QuantizationError(RuntimeError): 2 | """ 3 | Raised when a model or a node cannot be quantized. 4 | """ 5 | -------------------------------------------------------------------------------- /onnx_extended/validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/validation/__init__.py -------------------------------------------------------------------------------- /onnx_extended/validation/cpu/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/validation/cpu/cpu_fpemu.hpp: -------------------------------------------------------------------------------- 1 | // This file includes some pieces taken from 2 | // https://github.com/IntelLabs/FP8-Emulation-Toolkit/blob/main/mpemu/pytquant/cuda/fpemu_kernels.cu 3 | // with the following license. 4 | // 5 | /*----------------------------------------------------------------------------* 6 | * Copyright (c) 2023, Intel Corporation - All rights reserved. 
--------------------------------------------------------------------------------
/onnx_extended/tools/graph/__init__.py:
--------------------------------------------------------------------------------
from .errors import QuantizationError
from .onnx_graph_struct import Graph
from .onnx_graph_transformer import cast_constant, quantize_float8, QuantizeOptions
--------------------------------------------------------------------------------
/onnx_extended/tools/graph/errors.py:
--------------------------------------------------------------------------------
class QuantizationError(RuntimeError):
    """
    Raised when a model or a node cannot be quantized.
    """
--------------------------------------------------------------------------------
/onnx_extended/validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/validation/__init__.py
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/cpu_fpemu.hpp:
--------------------------------------------------------------------------------
// This file includes some pieces taken from
// https://github.com/IntelLabs/FP8-Emulation-Toolkit/blob/main/mpemu/pytquant/cuda/fpemu_kernels.cu
// with the following license.
//
/*----------------------------------------------------------------------------*
 * Copyright (c) 2023, Intel Corporation - All rights reserved.
 * This file is part of FP8-Emulation-Toolkit
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *----------------------------------------------------------------------------*
 * Naveen Mellempudi (Intel Corporation)
 *----------------------------------------------------------------------------*/

#pragma once

#if defined(__SSSE3__)

#include <immintrin.h>

#endif

namespace cpu_fpemu {

#if defined(__SSSE3__)

inline float __double2float_rn(double inval) {
  float out[4] = {0};
  __m128 vout = _mm_cvtpd_ps(_mm_set1_pd(inval));

  _mm_store_ps(&out[0], vout);
  return out[0];
}

#ifdef _WIN32

inline unsigned short __float2half_rn(float inval) {
  __m128i m = _mm_cvtps_ph(_mm_set_ss(inval), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  return _mm_extract_epi16(m, 0);
}

inline float __half2float(unsigned short h_val) {
  __m128i m = _mm_cvtsi32_si128(h_val);
  return _mm_cvtss_f32(_mm_cvtph_ps(m));
}

#else

inline unsigned short __float2half_rn(float inval) {
  return _cvtss_sh(inval, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}

inline float __half2float(unsigned short h_val) { return _cvtsh_ss(h_val); }

#endif

#endif

} // namespace cpu_fpemu
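These intrinsics reproduce CUDA's `__float2half_rn`/`__half2float` on the CPU. numpy's
IEEE half type applies the same round-to-nearest-even rule, which gives a quick
cross-check:

    import numpy as np

    f = np.float32(3.14159)
    h = np.float16(f)     # round-to-nearest-even, like _cvtss_sh
    print(np.float32(h))  # 3.140625, the nearest representable half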
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/murmur_hash3.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>

namespace validation {
namespace sklearn {

void MurmurHash3_x86_32(const void *key, int len, uint32_t seed, void *out);

void MurmurHash3_x86_128(const void *key, int len, uint32_t seed, void *out);

void MurmurHash3_x64_128(const void *key, int len, uint32_t seed, void *out);

} // namespace sklearn
} // namespace validation
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/speed_metrics.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstddef>
#include <cstdint>
#include <vector>

namespace validation {

#if defined(_WIN32)

inline bool _isnan_(float x) { return _isnanf(x); }
inline bool _isnan_(double x) { return _isnan(x); }

#else

// See
// https://stackoverflow.com/questions/2249110/how-do-i-make-a-portable-isnan-isinf-function
inline bool _isnan_(double x) {
  union {
    uint64_t u;
    double f;
  } ieee754;
  ieee754.f = x;
  return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + ((unsigned)ieee754.u != 0) > 0x7ff00000;
}

inline bool _isnan_(float x) {
  uint32_t *pv = reinterpret_cast<uint32_t *>(&x);
  uint32_t b = *pv;
  return (b & 0x7fc00000) == 0x7fc00000;
}

#endif

typedef struct ElementTime {
  int64_t trial;
  int64_t row;
  double time;
  inline ElementTime() {}
  inline ElementTime(int64_t n, int64_t r, double t) {
    trial = n;
    row = r;
    time = t;
  }
} ElementTime;

double benchmark_cache(int64_t arr_size, bool verbose);
std::vector<ElementTime> benchmark_cache_tree(int64_t n_rows, int64_t n_features,
                                              int64_t n_trees, int64_t tree_size,
                                              int64_t max_depth, int64_t search_step = 64);

} // namespace validation
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/vector_sparse.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/sparse_tensor.h"

#include <cstddef>
#include <cstdint>
#include <vector>

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>

#define py_array_float py::array_t<float>
#define py_array_uint32 py::array_t<uint32_t>

namespace py = pybind11;

namespace validation {

py::tuple sparse_struct_indices_values(const py_array_float &v);

py_array_float sparse_struct_to_dense(const py_array_float &v);

py_array_float dense_to_sparse_struct(const py_array_float &v);

py::list sparse_struct_to_maps(const py_array_float &v);

py::tuple sparse_struct_to_csr(const py_array_float &v);

std::vector<std::tuple<double, double>> evaluate_sparse(const float *v, int64_t n_rows,
                                                        int64_t n_cols, int random,
                                                        int ntimes, int repeat,
                                                        int test);

} // namespace validation
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/__init__.py:
--------------------------------------------------------------------------------
def cuda_version() -> str:
    """
    Returns the CUDA version it was compiled with.
    If CUDA was not available, it returns `"0.0"`.
    """
    try:
        from .cuda_example_py import cuda_version as cv
    except ImportError:
        # No CUDA
        return "0.0"
    v = cv()
    major = v // 1000
    minor = (v % 1000) // 10
    return f"{major}.{minor}"
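The integer returned by the binding follows the CUDA convention
`major * 1000 + minor * 10`, hence the decoding above; for example:

    v = 11080  # what cuda_example_py.cuda_version() returns for CUDA 11.8
    print(f"{v // 1000}.{(v % 1000) // 10}")  # -> "11.8"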
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_fpemu.cuh:
--------------------------------------------------------------------------------
#pragma once

#include <cuda_runtime.h>

namespace cuda_fpemu {

enum FpemuMode {
  E4M3_RNE = 1,
};

void fpemu_cuda_forward(const int size, const float *input, float *output, FpemuMode mode,
                        bool inplace, float scale, bool block_norm, int block_size,
                        int cuda_device);

} // namespace cuda_fpemu
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_gemm.cuh:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>
#include <string>
#include <unordered_map>

namespace cuda_example {

struct BenchmarkGemm {
  int64_t N;
  double workspace_new;
  double workspace_free;
  double stream_create;
  double stream_destroy;
  double setup;
  double clean;
  double gemm;
  double gemm_in;
  double gemm_sync;
  double total;
  BenchmarkGemm();
  void zero();
  void to_map(std::unordered_map<std::string, double> &bench);
};

std::unordered_map<std::string, double> gemm_benchmark_test(int test, int N, int m, int n,
                                                            int k, int lda, int ldb, int ldd);

} // namespace cuda_example
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_nvtx.cuh:
--------------------------------------------------------------------------------
#pragma once

#if defined(ENABLE_NVTX)
#include <nvtx3/nvtx3.hpp>
#define NVTX_SCOPE(msg) nvtx3::scoped_range r{msg};
#else
#define NVTX_SCOPE(msg)
#endif
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_tensor.cuh:
--------------------------------------------------------------------------------
#pragma once
#include "cuda_nvtx.cuh"
#include "cuda_utils.h"
#include <cstddef>
#include <cstdint>
#include <cublas_v2.h>
#include <cuda_runtime.h>

namespace cuda_example {

typedef enum TensorDevice { CPU = 0, CUDA = 1 } TensorDevice;

bool is_fp8_dtype(cudaDataType_t dtype);

int32_t type_size(cudaDataType_t element_type);

inline cudaDataType_t get_cuda_dtype(cudaDataType_t dtype) { return dtype; }

struct TensorData {
  TensorDevice device;
  cudaDataType_t dtype;
  std::size_t size;
  void *dptr;
  inline TensorData() {
    device = TensorDevice::CPU;
    size = 0;
    dptr = nullptr;
    dtype = CUDA_R_32F;
  }
  void allocate(cudaDataType_t dtype, std::size_t size, TensorDevice device);
  void free();
  void copy_from_cpu(void *ptr);
};

class Tensor {
public:
  const char *name;
  TensorData data;
  TensorData scale;
  TensorData amax;
  TensorData scale_inv;

public:
  inline Tensor(const char *name) : data(), scale(), amax(), scale_inv() { this->name = name; }
  Tensor(const char *name, std::size_t size, cudaDataType_t dtype = CUDA_R_32F,
         TensorDevice device = TensorDevice::CUDA,
         TensorDevice scale_device = TensorDevice::CUDA);
  ~Tensor();
  void rnd();
};

} // namespace cuda_example
") + std::string(__VA_ARGS__)); \ 16 | } \ 17 | } while (false) 18 | 19 | #define NVTE_CHECK_CUDA(ans) \ 20 | { \ 21 | auto status = ans; \ 22 | NVTE_CHECK(status == cudaSuccess, \ 23 | "CUDA Error: " + std::string(cudaGetErrorString(status))); \ 24 | } 25 | 26 | #define NVTE_CHECK_CUBLAS(ans) \ 27 | { \ 28 | auto status = ans; \ 29 | NVTE_CHECK(status == CUBLAS_STATUS_SUCCESS, \ 30 | "CUBLAS Error: " + std::string(cublasGetStatusString(status))); \ 31 | } 32 | 33 | #define checkCudaErrors(val) _check_cuda((val), #val, __FILE__, __LINE__) 34 | 35 | template 36 | void _check_cuda(T err, const char *const func, const char *const file, const int line) { 37 | if (err != cudaSuccess) { 38 | throw std::runtime_error(onnx_extended_helpers::MakeString( 39 | "CUDA error at: ", file, ":", line, "\n", cudaGetErrorString(err), " ", func, "\n")); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /onnx_extended/validation/cython/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/validation/cython/fp8.pyx: -------------------------------------------------------------------------------- 1 | import numpy 2 | cimport numpy as cnumpy 3 | cimport cython 4 | from libcpp cimport bool 5 | from cython.cimports.libc.stdint import uint8_t, int64_t 6 | 7 | # numpy.import_array() 8 | 9 | 10 | cdef extern from "cpu/cast_fp8.h": 11 | void float_to_e4m3fn(int64_t n, const float* src, uint8_t* dst, bool saturate) nogil 12 | void e4m3fn_to_float(int64_t n, const uint8_t* src, float* dst) nogil 13 | 14 | 15 | @cython.boundscheck(False) 16 | @cython.wraparound(False) 17 | @cython.nonecheck(False) 18 | def cast_float32_to_e4m3fn(m, bool saturate = True): 19 | """ 20 | Converts an array from float to float 8 e4m3fn. 21 | 22 | :param m: any array 23 | :param saturate: saturate the conversion 24 | :return: casted array 25 | """ 26 | cdef cnumpy.ndarray cm = numpy.ascontiguousarray(m) 27 | cdef cnumpy.ndarray res = numpy.empty(m.shape, dtype=numpy.uint8) 28 | cdef int64_t n = m.size 29 | cdef const float* src = cm.data 30 | cdef uint8_t* dst = res.data 31 | with nogil: 32 | float_to_e4m3fn(n, src, dst, saturate) 33 | return res 34 | 35 | 36 | @cython.boundscheck(False) 37 | @cython.wraparound(False) 38 | @cython.nonecheck(False) 39 | def cast_e4m3fn_to_float32(m): 40 | """ 41 | Converts an array from float 8 e4m3fn to float. 
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
black
clang-format
cmakelang
coverage
cython>=3.0.10
cython-lint
flake8
furo; sys_platform == 'linux'
google-re2
isort
lightgbm
matplotlib
ml-dtypes
onnx-array-api
onnxmltools
onnxruntime>=1.21.0
openpyxl
opt_einsum
packaging
pandas
Pillow
psutil
pytest
pytest-cov
pytest-subtests
rstcheck[sphinx,toml]
ruff
scikit-learn>=1.5
skl2onnx>=1.14.1
sphinx>=8; sys_platform == 'linux'
sphinx-gallery; sys_platform == 'linux'
sphinx-issues; sys_platform == 'linux'
git+https://github.com/sdpython/sphinx-runpython.git
toml; python_version < '3.11'
tomli
tqdm
wheel
xgboost
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy>=2.0
onnx>=1.17.0
scipy>=1.13.1
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[project]
requires-python = ">=3.9"

[options]
packages = find:

[options.packages.find]
include = onnx_extended*
--------------------------------------------------------------------------------