├── .clang-format ├── .github └── workflows │ ├── black-ruff.yml │ ├── check-release.yml │ ├── check-urls.yml │ ├── clang.yml │ ├── cmakelint.yml │ ├── documentation.yml │ ├── mypy.yml │ ├── rstcheck.yml │ ├── wheels-linux.yml │ ├── wheels-mac.yml │ └── wheels-windows.yml ├── .gitignore ├── CHANGELOGS.rst ├── LICENSE.txt ├── MANIFEST.in ├── README.rst ├── _cmake ├── CMakeLists.txt ├── clang_format.sh ├── constants.cmake ├── externals │ ├── CPM.cmake │ ├── FindCudaExtension.cmake │ ├── FindCython.cmake │ ├── FindLocalEigen.cmake │ ├── FindLocalMatX.cmake │ ├── FindLocalPyBind11.cmake │ ├── FindMyPython.cmake │ └── FindOrt.cmake ├── finalize.cmake ├── intrin.sh ├── load_externals.cmake ├── targets │ ├── _validation.cmake │ ├── _validation_cuda_example_py.cmake │ ├── _validation_cuda_monitor.cmake │ ├── c_op_conv_.cmake │ ├── c_op_svm_py_.cmake │ ├── c_op_tfidf_vectorizer_py_.cmake │ ├── c_op_tree_ensemble_py_.cmake │ ├── common.cmake │ ├── common_kernels.cmake │ ├── fp8_cy.cmake │ ├── ortinf.cmake │ ├── ortops_optim_cpu.cmake │ ├── ortops_optim_cuda.cmake │ ├── ortops_tutorial_cpu.cmake │ └── ortops_tutorial_cuda.cmake └── test_constants.h.in ├── _doc ├── _static │ ├── logo.png │ ├── profile.png │ ├── vector_sum6.png │ └── vector_sum6_results.png ├── api │ ├── check.rst │ ├── ext_test_case.rst │ ├── helper.rst │ ├── index.rst │ ├── memory_peak.rst │ ├── ortcy.rst │ ├── ortops.rst │ ├── ortops_optim_cpu.rst │ ├── ortops_optim_cuda.rst │ ├── ortops_tutorial_cpu.rst │ ├── ortops_tutorial_cuda.rst │ ├── plotting.rst │ ├── reference.rst │ ├── tools.rst │ ├── tools_einsum.rst │ ├── tools_graph.rst │ ├── tools_graph_transformer.rst │ ├── tools_inline.rst │ ├── tools_io.rst │ ├── tools_nodes.rst │ ├── tools_other.rst │ ├── tools_stats.rst │ ├── validation.rst │ ├── validation_cpu.rst │ ├── validation_cuda.rst │ ├── validation_sparse.rst │ └── validation_trees.rst ├── benchmarks.rst ├── command_lines.rst ├── conf.py ├── examples │ ├── README.txt │ ├── plot_bench_cpu.py │ ├── plot_bench_cypy_ort.py │ ├── plot_bench_gemm_f8.py │ ├── plot_bench_gemm_ort.py │ ├── plot_bench_sparse_access.py │ ├── plot_op_conv_denorm.py │ ├── plot_op_conv_py_vs_c.py │ ├── plot_op_einsum.py │ ├── plot_op_gemm2_cuda.py │ ├── plot_op_mul_cuda.py │ ├── plot_op_scatternd_cuda.py │ ├── plot_op_scatternd_mask_cuda.py │ ├── plot_op_tfidfvectorizer_sparse.py │ ├── plot_op_transpose_2d_cast_cuda.py │ ├── plot_op_tree_ensemble_implementations.py │ ├── plot_op_tree_ensemble_optim.py │ ├── plot_op_tree_ensemble_sparse.py │ └── plot_profile_gemm_ort.py ├── index.rst ├── license.rst ├── tech │ ├── 2023-09-05-glibc.rst │ ├── gemm.rst │ ├── index.rst │ ├── install_cuda_wsl.rst │ └── usefulcmd.rst └── tutorial │ ├── build.rst │ ├── build_cuda.rst │ ├── build_cython.rst │ ├── build_ortext.rst │ ├── build_pybind11.rst │ ├── custom_ops.rst │ ├── cython_binding.rst │ ├── external_data.rst │ ├── images │ └── plot_optim_tree_ensemble.png │ ├── index.rst │ ├── many_tools.rst │ ├── old_version.rst │ ├── onnx_manipulations.rst │ ├── ops.rst │ ├── ort_debug.rst │ ├── parallelization.rst │ ├── profiling.rst │ ├── quantize.rst │ ├── readings.rst │ ├── reference_evaluator.rst │ ├── statistics.rst │ └── trees.rst ├── _unittests ├── onnx_extended_test_common.h ├── ut_helper │ └── test_make_helper.py ├── ut_ortcy │ ├── data │ │ └── add.onnx │ ├── test_inference.cpp │ └── test_ortcy.py ├── ut_ortops │ ├── data │ │ ├── plot_op_tree_ensemble_implementations_custom.onnx │ │ └── plot_op_tree_ensemble_implementations_sparse.onnx │ ├── test_inference_tree.cpp │ 
├── test_optim_cuda.py │ ├── test_optim_py.py │ ├── test_optim_sparse.py │ ├── test_optim_svm.py │ ├── test_optim_tfidf_vectorizer.py │ ├── test_optim_tfidf_vectorizer_sparse.py │ ├── test_optim_tree_ensemble.py │ ├── test_optim_tree_ensemble_sparse.py │ ├── test_optim_tree_ensemble_sparse_xgboost.py │ ├── test_tutorial_cpu.py │ ├── test_tutorial_cpu_tree.py │ ├── test_tutorial_gemm_cpu.py │ └── test_tutorial_gemm_cuda.py ├── ut_plotting │ └── test_plotting_benchmark.py ├── ut_reference │ ├── test_backend_c_reference_evaluator.py │ ├── test_c_op_conv.cpp │ ├── test_c_reference_evaluator.py │ ├── test_c_reference_evaluator_save.py │ ├── test_c_svm.py │ ├── test_c_tfidf_vectorizer.py │ ├── test_c_tree_ensemble.py │ └── test_sparse_tensor.py ├── ut_tools │ ├── bench │ │ ├── model.onnx │ │ └── test_data_set_0 │ │ │ ├── input_0.pb │ │ │ ├── input_1.pb │ │ │ └── output_0.pb │ ├── bench_rf │ │ ├── model.onnx │ │ └── test_data_set_0 │ │ │ ├── input_0.pb │ │ │ └── output_0.pb │ ├── data │ │ └── debug_4700-CPUep.onnx │ ├── test_einsum.py │ ├── test_einsum_benchmark.py │ ├── test_einsum_blas_lapack.py │ ├── test_einsum_bug.py │ ├── test_einsum_einsum.py │ ├── test_einsum_generic_dot.py │ ├── test_einsum_ml.py │ ├── test_einsum_onnx_micro_runtime.py │ ├── test_js_profile.py │ ├── test_onnx_inline.py │ ├── test_onnx_tools.py │ ├── test_onnx_tools_graph.py │ ├── test_onnx_tools_quantize_fp8.py │ ├── test_optim_onnx_unused.py │ ├── test_ort_debug.py │ ├── test_run_onnx.py │ ├── test_simple.py │ └── test_stats_nodes.py ├── ut_validation │ ├── test_bench_tree.py │ ├── test_cpu_fpemu.cpp │ ├── test_cpu_fpemu.py │ ├── test_cuda_fpemu.py │ ├── test_cuda_gemm.py │ ├── test_cuda_monitor.py │ ├── test_fp8.py │ ├── test_hash.py │ ├── test_sparse_struct.py │ └── test_speed_metrics.py └── ut_xrun_doc │ ├── test_args.py │ ├── test_command_lines1.py │ ├── test_command_lines2.py │ ├── test_documentation_examples.py │ ├── test_memory_peak.py │ └── test_version.py ├── azure-pipelines.yml ├── clean_build.sh ├── clean_onnx.sh ├── onnx_extended ├── __init__.py ├── __main__.py ├── _command_lines.py ├── _command_lines_parser.py ├── _common.py ├── args.py ├── cpp │ ├── __init__.py │ ├── c_op_allocation.cpp │ ├── c_op_common_parameters.cpp │ ├── cpu │ │ └── __init__.py │ ├── cuda │ │ └── __init__.py │ ├── include │ │ ├── __init__.py │ │ ├── common │ │ │ ├── __init__.py │ │ │ ├── c_op_allocation.h │ │ │ ├── c_op_common_parallel.hpp │ │ │ ├── c_op_common_parameters.h │ │ │ ├── c_op_helpers.h │ │ │ ├── c_op_math.h │ │ │ ├── c_op_status.h │ │ │ ├── common_kernels.h │ │ │ ├── simple_span.h │ │ │ └── sparse_tensor.h │ │ ├── cpu │ │ │ ├── __init__.py │ │ │ ├── c_op_conv.h │ │ │ ├── c_op_conv_common.h │ │ │ ├── c_op_svm_common_.hpp │ │ │ ├── c_op_tfidf_vectorizer_.hpp │ │ │ ├── c_op_tree_ensemble_common_.hpp │ │ │ ├── c_op_tree_ensemble_common_agg_.hpp │ │ │ ├── c_op_tree_ensemble_common_classifier_.hpp │ │ │ └── cast_fp8.h │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ └── common_kernels_cuda.h │ │ ├── onnx_extended_helpers.h │ │ ├── ortapi_c_api_header.h │ │ └── ortapi_version.h │ └── onnx_extended_helpers.cpp ├── ext_test_case.py ├── helper │ ├── __init__.py │ ├── make_dynamic_quantize_linear.py │ └── make_reshape_transpose.py ├── memory_peak.py ├── ortcy │ ├── __init__.py │ └── wrap │ │ ├── __init__.py │ │ ├── ortapi.cpp │ │ ├── ortapi.h │ │ ├── ortapi_inline.h │ │ └── ortinf.pyx ├── ortops │ ├── __init__.py │ ├── optim │ │ ├── __init__.py │ │ ├── cpu │ │ │ ├── __init__.py │ │ │ ├── ort_optim_cpu_lib.cc │ │ │ ├── ort_optim_cpu_lib.h │ │ 
│ ├── ort_sparse.h │ │ │ ├── ort_sparse.hpp │ │ │ ├── ort_svm.h │ │ │ ├── ort_svm.hpp │ │ │ ├── ort_tfidf_vectorizer.h │ │ │ ├── ort_tfidf_vectorizer.hpp │ │ │ ├── ort_tree_ensemble.h │ │ │ └── ort_tree_ensemble.hpp │ │ ├── cuda │ │ │ ├── __init__.py │ │ │ ├── add_or_mul_shared_input.cu │ │ │ ├── add_or_mul_shared_input.h │ │ │ ├── addaddaddmulmulmul.cu │ │ │ ├── addaddaddmulmulmul.h │ │ │ ├── addaddmulmul.cu │ │ │ ├── addaddmulmul.h │ │ │ ├── addmul.cu │ │ │ ├── addmul.h │ │ │ ├── mul_mul_sigmoid.cu │ │ │ ├── mul_mul_sigmoid.h │ │ │ ├── mul_sigmoid.cu │ │ │ ├── mul_sigmoid.h │ │ │ ├── negxplus1.cu │ │ │ ├── negxplus1.h │ │ │ ├── ort_optim_cuda_lib.cc │ │ │ ├── ort_optim_cuda_lib.h │ │ │ ├── replace_zero.cu │ │ │ ├── replace_zero.h │ │ │ ├── rotary.cu │ │ │ ├── rotary.h │ │ │ ├── scatter_nd_of_shape.cu │ │ │ ├── scatter_nd_of_shape.h │ │ │ ├── scatter_nd_of_shape_common.h │ │ │ ├── scatter_nd_of_shape_masked.cu │ │ │ ├── scatter_nd_of_shape_masked.h │ │ │ ├── submul.cu │ │ │ ├── submul.h │ │ │ ├── transpose_cast_2d.cu │ │ │ ├── transpose_cast_2d.h │ │ │ ├── tri_matrix.cu │ │ │ └── tri_matrix.h │ │ └── optimize.py │ └── tutorial │ │ ├── __init__.py │ │ ├── cpu │ │ ├── __init__.py │ │ ├── custom_gemm.cc │ │ ├── custom_gemm.h │ │ ├── custom_tree_assembly.cc │ │ ├── custom_tree_assembly.h │ │ ├── dynamic_quantize_linear.cc │ │ ├── dynamic_quantize_linear.h │ │ ├── my_kernel.cc │ │ ├── my_kernel.h │ │ ├── my_kernel_attr.cc │ │ ├── my_kernel_attr.h │ │ ├── ort_tutorial_cpu_lib.cc │ │ └── ort_tutorial_cpu_lib.h │ │ └── cuda │ │ ├── __init__.py │ │ ├── custom_gemm.cu │ │ ├── custom_gemm.h │ │ ├── matx_matmul.cu │ │ ├── matx_matmul.h │ │ ├── ort_tutorial_cuda_lib.cc │ │ └── ort_tutorial_cuda_lib.h ├── plotting │ ├── __init__.py │ ├── benchmark.py │ └── data.py ├── reference │ ├── __init__.py │ ├── c_custom_ops │ │ ├── __init__.py │ │ └── custom_op_tree_ensemble_regressor.py │ ├── c_ops │ │ ├── __init__.py │ │ ├── _op_classifier_common.py │ │ ├── c_op_conv.py │ │ ├── c_op_svm_classifier.py │ │ ├── c_op_svm_regressor.py │ │ ├── c_op_tfidf_vectorizer.py │ │ ├── c_op_tree_ensemble_classifier.py │ │ ├── c_op_tree_ensemble_regressor.py │ │ └── cpu │ │ │ ├── __init__.py │ │ │ ├── c_op_conv_.cpp │ │ │ ├── c_op_conv_pybind11.h │ │ │ ├── c_op_svm_py_.cpp │ │ │ ├── c_op_tfidf_vectorizer_py_.cpp │ │ │ ├── c_op_tree_ensemble_py_.cpp │ │ │ ├── c_op_tree_ensemble_py_.hpp │ │ │ └── c_op_tree_ensemble_py_classifier_.hpp │ ├── c_reference_backend.py │ ├── c_reference_evaluator.py │ └── other_ops │ │ ├── __init__.py │ │ ├── op_scatternd_of_shape.py │ │ └── op_tokenizer.py ├── tools │ ├── __init__.py │ ├── einsum │ │ ├── __init__.py │ │ ├── blas_lapack.py │ │ ├── einsum_bench.py │ │ ├── einsum_config.py │ │ ├── einsum_fct.py │ │ ├── einsum_impl.py │ │ ├── einsum_impl_classes.py │ │ ├── einsum_impl_ext.py │ │ └── einsum_ml.py │ ├── graph │ │ ├── __init__.py │ │ ├── errors.py │ │ ├── onnx_custom_ops.py │ │ ├── onnx_graph_struct.py │ │ └── onnx_graph_transformer.py │ ├── js_profile.py │ ├── onnx_inline.py │ ├── onnx_io.py │ ├── onnx_nodes.py │ ├── ort_debug.py │ ├── run_onnx.py │ ├── run_onnx_main.py │ └── stats_nodes.py └── validation │ ├── __init__.py │ ├── _tree_d14_f100.py │ ├── bench_trees.py │ ├── cpu │ ├── __init__.py │ ├── _validation.cpp │ ├── cpu_fpemu.hpp │ ├── murmur_hash3.cpp │ ├── murmur_hash3.h │ ├── speed_metrics.cpp │ ├── speed_metrics.h │ ├── vector_sparse.cpp │ └── vector_sparse.h │ ├── cuda │ ├── __init__.py │ ├── cuda_example_py.cpp │ ├── cuda_fpemu.cu │ ├── cuda_fpemu.cuh │ ├── cuda_gemm.cu │ ├── 
cuda_gemm.cuh │ ├── cuda_monitor.cpp │ ├── cuda_nvtx.cuh │ ├── cuda_tensor.cu │ ├── cuda_tensor.cuh │ └── cuda_utils.h │ └── cython │ ├── __init__.py │ └── fp8.pyx ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg └── setup.py /.clang-format: -------------------------------------------------------------------------------- 1 | ColumnLimit: 96 2 | -------------------------------------------------------------------------------- /.github/workflows/black-ruff.yml: -------------------------------------------------------------------------------- 1 | name: Black + Ruff Format Checker 2 | on: [push, pull_request] 3 | jobs: 4 | black-format-check: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: psf/black@stable 9 | with: 10 | options: "--diff --check" 11 | src: "." 12 | ruff-format-check: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v3 16 | - uses: chartboost/ruff-action@v1 17 | -------------------------------------------------------------------------------- /.github/workflows/check-urls.yml: -------------------------------------------------------------------------------- 1 | name: Check URLs 2 | 3 | on: 4 | pull_request: 5 | branches: [main] 6 | schedule: 7 | # ┌───────────── minute (0 - 59) 8 | # │ ┌───────────── hour (0 - 23) 9 | # │ │ ┌───────────── day of the month (1 - 31) 10 | # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) 11 | # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) 12 | # │ │ │ │ │ 13 | # │ │ │ │ │ 14 | # │ │ │ │ │ 15 | # * * * * * 16 | - cron: '30 1 * * 0' 17 | 18 | jobs: 19 | check-urls: 20 | runs-on: ubuntu-latest 21 | 22 | steps: 23 | - uses: actions/checkout@v3 24 | 25 | - name: urls-checker-code 26 | uses: urlstechie/urlchecker-action@master 27 | with: 28 | subfolder: onnx_extended 29 | file_types: .md,.py,.rst,.ipynb 30 | print_all: false 31 | timeout: 2 32 | # retry_count : 2 33 | exclude_urls: https://github.com/microsoft/onnxruntime/blob/ 34 | exclude_patterns: https://github.com/microsoft/onnxruntime/blob/ 35 | # force_pass : true 36 | 37 | - name: urls-checker-docs 38 | uses: urlstechie/urlchecker-action@master 39 | with: 40 | subfolder: _doc 41 | file_types: .md,.py,.rst,.ipynb 42 | print_all: false 43 | timeout: 2 44 | # retry_count : 2 45 | exclude_urls: https://github.com/Kitware/CMake/releases/download/v${cmake_version}/cmake-$,https://developer.download.nvidia.com/compute/cuda/$ 46 | exclude_patterns: https://www.data.gouv.fr/fr/datasets/r/e3d83ab3-dc52-4c99-abaf-8a38050cc68c,https://dev.azure.com/ 47 | # force_pass : true 48 | -------------------------------------------------------------------------------- /.github/workflows/clang.yml: -------------------------------------------------------------------------------- 1 | name: Clang Format Checker 2 | on: [push] 3 | jobs: 4 | clang-format-checking: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v2 8 | - uses: RafikFarhad/clang-format-github-action@v3 9 | with: 10 | sources: "src/**/*.h,src/**/*.c,test/**/*.c" 11 | -------------------------------------------------------------------------------- /.github/workflows/cmakelint.yml: -------------------------------------------------------------------------------- 1 | name: Cmake Format Checker 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - name: Checkout repository 11 | uses: actions/checkout@v2 12 | 13 | - name: Format CMake files 14 | id: cmake-format 15 | uses: 
PuneetMatharu/cmake-format-lint-action@v1.0.0 16 | with: 17 | args: --check 18 | 19 | - name: Commit changes 20 | uses: stefanzweifel/git-auto-commit-action@v4 21 | with: 22 | commit_user_name: cmake-format-bot 23 | commit_message: 'Automated commit of cmake-format changes.' 24 | -------------------------------------------------------------------------------- /.github/workflows/mypy.yml: -------------------------------------------------------------------------------- 1 | name: Type annotation with mypy 2 | on: [push, pull_request] 3 | jobs: 4 | mypy: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - uses: actions/checkout@v3 8 | - uses: actions/setup-python@v4 9 | with: 10 | python-version: '3.11' 11 | - name: Install mypy 12 | run: pip install mypy 13 | - name: Run mypy 14 | run: mypy 15 | -------------------------------------------------------------------------------- /.github/workflows/rstcheck.yml: -------------------------------------------------------------------------------- 1 | name: RST Check 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build_wheels: 7 | name: rstcheck ${{ matrix.os }} 8 | runs-on: ${{ matrix.os }} 9 | strategy: 10 | matrix: 11 | os: [ubuntu-latest] 12 | 13 | steps: 14 | - uses: actions/checkout@v3 15 | 16 | # Used to host cibuildwheel 17 | - uses: actions/setup-python@v4 18 | with: 19 | python-version: '3.11' 20 | 21 | - name: Install requirements 22 | run: python -m pip install -r requirements.txt 23 | 24 | - name: Install rstcheck 25 | run: python -m pip install sphinx tomli rstcheck[toml,sphinx] 26 | 27 | - name: rstcheck 28 | run: rstcheck -r _doc onnx_extended 29 | -------------------------------------------------------------------------------- /.github/workflows/wheels-linux.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheel Linux 2 | 3 | on: 4 | push: 5 | # branches: 6 | # - main 7 | # - 'releases/**' 8 | pull_request: 9 | # types: 10 | # - closed 11 | # branches: 12 | # - main 13 | #on: 14 | # push: 15 | # branches: 16 | # - main 17 | # - 'releases/**' 18 | 19 | jobs: 20 | build_wheels: 21 | name: Build wheels on ${{ matrix.os }} 22 | runs-on: ${{ matrix.os }} 23 | strategy: 24 | matrix: 25 | os: [ubuntu-latest] 26 | 27 | steps: 28 | - uses: actions/checkout@v4 29 | 30 | # Used to host cibuildwheel 31 | - uses: actions/setup-python@v4 32 | with: 33 | python-version: '3.11' 34 | 35 | - name: Install cibuildwheel 36 | run: python -m pip install cibuildwheel 37 | 38 | - name: python version 39 | run: python -V 40 | 41 | - name: Build wheels 42 | run: python -m cibuildwheel --output-dir wheelhouse 43 | 44 | - uses: actions/upload-artifact@v4 45 | with: 46 | path: ./wheelhouse/*.whl 47 | -------------------------------------------------------------------------------- /.github/workflows/wheels-mac.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheel MacOS 2 | 3 | on: 4 | push: 5 | # branches: 6 | # - main 7 | # - 'releases/**' 8 | pull_request: 9 | # types: 10 | # - closed 11 | # branches: 12 | # - main 13 | 14 | jobs: 15 | build_wheels: 16 | name: Build wheels on ${{ matrix.os }} 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [macOS-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | # Used to host cibuildwheel 26 | - uses: actions/setup-python@v4 27 | with: 28 | python-version: '3.11' 29 | 30 | - name: Install cibuildwheel 31 | run: python -m pip install cibuildwheel 32 | 33 | - name: python version 34 | run: 
python -V 35 | 36 | - name: Build wheels 37 | run: python -m cibuildwheel --output-dir wheelhouse 38 | continue-on-error: true 39 | 40 | - uses: actions/upload-artifact@v4 41 | with: 42 | path: ./wheelhouse/*.whl 43 | -------------------------------------------------------------------------------- /.github/workflows/wheels-windows.yml: -------------------------------------------------------------------------------- 1 | name: Build Wheel Windows 2 | 3 | on: 4 | push: 5 | # branches: 6 | # - main 7 | # - 'releases/**' 8 | pull_request: 9 | # types: 10 | # - closed 11 | # branches: 12 | # - main 13 | 14 | jobs: 15 | build_wheels: 16 | name: Build wheels on ${{ matrix.os }} 17 | runs-on: ${{ matrix.os }} 18 | strategy: 19 | matrix: 20 | os: [windows-latest] 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | # Used to host cibuildwheel 26 | - uses: actions/setup-python@v4 27 | with: 28 | python-version: '3.11' 29 | 30 | - name: Install cibuildwheel 31 | run: python -m pip install cibuildwheel 32 | 33 | - name: python version 34 | run: python -V 35 | 36 | - name: Build wheels 37 | run: python -m cibuildwheel 38 | 39 | - uses: actions/upload-artifact@v4 40 | with: 41 | path: ./wheelhouse/*.whl 42 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.pyd 3 | *.dylib 4 | *.so 5 | *.so.* 6 | *.dll 7 | *.vcxproj* 8 | *.tcl 9 | *.sln 10 | *.cmake 11 | *.whl 12 | *.def 13 | *.ll 14 | *.pdb 15 | *.s 16 | /*.png 17 | /*.onnx 18 | .build_path.txt 19 | .hypothesis/* 20 | coverage.html/* 21 | _cache/* 22 | _deps/* 23 | .vs/* 24 | *.dir/* 25 | Release/* 26 | Testing/* 27 | plot_*.csv 28 | plot_*.xlsx 29 | *.data 30 | test_ort_version* 31 | x64/* 32 | CMakeFiles/* 33 | dist/* 34 | build/* 35 | .eggs/* 36 | *egg-info/* 37 | .coverage 38 | CMakeCache.txt 39 | onnxruntime_*.json 40 | _doc/LICENSE.rst 41 | _doc/LICENSE.txt 42 | _doc/CHANGELOGS.rst 43 | _doc/examples/_cache/* 44 | _doc/sg_execution_times.rst 45 | _doc/auto_examples/* 46 | _doc/examples/*.xlsx 47 | _doc/examples/plot*.csv 48 | _doc/examples/plot*.onnx 49 | _doc/examples/plot_*.png 50 | _doc/examples/plot_*.csv 51 | _doc/examples/plot_*.onnx 52 | _doc/examples/plot_*.xlsx 53 | _doc/_static/require.js 54 | _doc/_static/viz.js 55 | _unittests/ut__main/*.png 56 | _unittests/test_constants.h 57 | onnx_extended/_config.py 58 | onnx_extended/validation/cython/*.c 59 | onnx_extended/validation/cython/*.cpp 60 | onnx_extended/validation/cython/vector_function_cy.c* 61 | onnx_extended/ortcy/wrap/ortinf.c* 62 | onnx_extended/ortcy/wrap/*.lib 63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2023-2024, Xavier Dupré 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in 11 | all copies or substantial portions of the Software. 
12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 19 | THE SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include onnx_extended *.c *.cpp *.h *.pyx *.pxd *.pxi *.py 2 | recursive-include _cmake *.cmake *.in *.txt *.sh *.in 3 | include pyproject.toml 4 | include MANIFEST.in 5 | include setup.cfg 6 | prune _doc 7 | prune _unittests 8 | exclude *.yml 9 | exclude *.git* 10 | # cython files to exclude 11 | exclude onnx_extended/ortcy/ortinf.cpp 12 | exclude onnx_extended/validation/cython/fp8.cpp 13 | -------------------------------------------------------------------------------- /_cmake/clang_format.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | clear 3 | echo "--ruff--" 4 | ruff check . 5 | echo "--cython-lint--" 6 | cython-lint . 7 | echo "--clang-format--" 8 | find onnx_extended -type f \( -name "*.h" -o -name "*.hpp" -o -name "*.cuh" -o -name "*.cpp" -o -name "*.cc" -o -name "*.cu" \) | while read f; do 9 | echo "clang-format -i $f"; 10 | clang-format -i $f; 11 | done 12 | echo "--cmake-lint--" 13 | find _cmake -type f \( -name "*.cmake" -o -name "*.txt" \) | while read f; do 14 | echo "cmake-lint $f --line-width=88 --disabled-codes C0103 C0113"; 15 | cmake-lint $f --line-width=88 --disabled-codes C0103 C0113; 16 | done 17 | -------------------------------------------------------------------------------- /_cmake/externals/FindLocalEigen.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # initialization 3 | # 4 | # function eigen_add_dependency 5 | # output variables LOCAL_EIGEN_FOUND, LOCAL_EIGEN_TARGET 6 | 7 | if(NOT LOCAL_EIGEN_VERSION) 8 | set(LOCAL_EIGEN_VERSION "3.4.0") 9 | endif() 10 | string(SUBSTRING "${LOCAL_EIGEN_VERSION}" 0 3 SHORT_EIGEN_VERSION) 11 | set(LOCAL_EIGEN_ROOT https://gitlab.com/libeigen/eigen/-/archive/) 12 | set(LOCAL_EIGEN_NAME "eigen-${LOCAL_EIGEN_VERSION}.zip") 13 | set(LOCAL_EIGEN_URL "${LOCAL_EIGEN_ROOT}${LOCAL_EIGEN_VERSION}/${LOCAL_EIGEN_NAME}") 14 | set(LOCAL_EIGEN_DEST "${CMAKE_CURRENT_BINARY_DIR}/eigen-download/${LOCAL_EIGEN_NAME}") 15 | set(LOCAL_EIGEN_DEST_DIR "${CMAKE_CURRENT_BINARY_DIR}/eigen-bin/") 16 | 17 | FetchContent_Declare(eigen URL ${LOCAL_EIGEN_URL}) 18 | 19 | # This instruction add all the available targets in eigen 20 | # including unit tests. 21 | # FetchContent_makeAvailable(eigen) 22 | 23 | FetchContent_Populate(eigen) 24 | 25 | list(APPEND CMAKE_MODULE_PATH "${eigen_SOURCE_DIR}/cmake") 26 | # find_package(Eigen3) 27 | 28 | set(LOCAL_EIGEN_SOURCE "${eigen_SOURCE_DIR}") 29 | 30 | # find_package(Eigen3 ${SHORT_EIGEN_VERSION} REQUIRED NO_MODULE) 31 | set(LOCAL_EIGEN_TARGET Eigen3::Eigen) 32 | set(LOCAL_EIGEN_VERSION ${Eigen3_VERSION}) 33 | set(EIGEN_INCLUDE_DIRS "${eigen_SOURCE_DIR}") 34 | 35 | # 36 | # !eigen_add_dependency: add a dependency to eigen. 
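# A hedged usage sketch (`my_target` is a hypothetical target name):
#   eigen_add_dependency(my_target)
# As the function body below shows, this only adds ${EIGEN_INCLUDE_DIRS}
# to the private include directories of the given target.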
37 | # 38 | # 39 | # \arg:name target name 40 | # 41 | function(eigen_add_dependency name) 42 | target_include_directories(${name} PRIVATE ${EIGEN_INCLUDE_DIRS}) 43 | endfunction() 44 | 45 | include(FindPackageHandleStandardArgs) 46 | find_package_handle_standard_args( 47 | LocalEigen 48 | VERSION_VAR LOCAL_EIGEN_VERSION 49 | REQUIRED_VARS LOCAL_EIGEN_TARGET LOCAL_EIGEN_URL LOCAL_EIGEN_SOURCE 50 | EIGEN_INCLUDE_DIRS) 51 | -------------------------------------------------------------------------------- /_cmake/externals/FindLocalMatX.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # initialization 3 | # 4 | # defines matx matx_SOURCE_DIR matx_BINARY_DIR 5 | 6 | # 7 | # matx 8 | # 9 | 10 | set(matx_TAG "v0.8.0") 11 | 12 | include(FetchContent) 13 | FetchContent_Declare( 14 | matx 15 | GIT_REPOSITORY https://github.com/NVIDIA/matx 16 | GIT_TAG ${matx_TAG}) 17 | 18 | FetchContent_MakeAvailable(matx) 19 | FetchContent_GetProperties(matx) 20 | 21 | set(matx_VERSION ${matx_TAG}) 22 | set(MATX_INCLUDE_DIR "${matx_SOURCE_DIR}/include") 23 | message(STATUS "matx_BINARY_DIR=${matx_BINARY_DIR}") 24 | message(STATUS "matx_SOURCE_DIR=${matx_SOURCE_DIR}") 25 | message(STATUS "MATX_INCLUDE_DIR=${MATX_INCLUDE_DIR}") 26 | message(STATUS "matx_VERSION=${matx_VERSION}") 27 | 28 | include(FindPackageHandleStandardArgs) 29 | find_package_handle_standard_args( 30 | LocalMatX 31 | VERSION_VAR matx_VERSION 32 | REQUIRED_VARS matx_SOURCE_DIR matx_BINARY_DIR) 33 | -------------------------------------------------------------------------------- /_cmake/finalize.cmake: -------------------------------------------------------------------------------- 1 | 2 | if(CUDA_AVAILABLE) 3 | set(config_content_cuda 4 | "HAS_CUDA = 1\nCUDA_VERSION = '${CUDA_VERSION}'" 5 | "\nCUDA_VERSION_INT = ${CUDA_VERSION_INT}") 6 | else() 7 | set(config_content_cuda "HAS_CUDA = 0") 8 | endif() 9 | 10 | set(config_content_comma 11 | "${config_content_cuda}" 12 | "\nORT_VERSION = '${ORT_VERSION}'" 13 | "\nORT_VERSION_INT = ${ORT_VERSION_INT}" 14 | "\nCXX_FLAGS = '${CMAKE_CXX_FLAGS}'" 15 | "\nCMAKE_CXX_STANDARD_REQUIRED = '${CMAKE_CXX_STANDARD_REQUIRED}'" 16 | "\nCMAKE_CXX_EXTENSIONS = '${CMAKE_CXX_EXTENSIONS}'" 17 | "\nCMAKE_CXX_STANDARD = ${CMAKE_CXX_STANDARD}\n") 18 | 19 | string(REPLACE ";" "" config_content "${config_content_comma}") 20 | -------------------------------------------------------------------------------- /_cmake/intrin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | get_instruction () 4 | { 5 | [ -z "$1" ] && exit 6 | func_name="$1 " 7 | 8 | header_file=`grep --include=\*intrin.h -Rl "$func_name" /usr/lib/gcc | head -n1` 9 | [ -z "$header_file" ] && exit 10 | >&2 echo "find in: $header_file" 11 | 12 | target_directive=`grep "#pragma GCC target(\|$func_name" $header_file | grep -B 1 "$func_name" | head -n1` 13 | echo $target_directive | grep -o '"[^,]*[,"]' | sed 's/"//g' | sed 's/,//g' 14 | } 15 | 16 | instruction=`get_instruction $1` 17 | if [ -z "$instruction" ]; then 18 | echo "Error: function not found: $1" 19 | else 20 | echo "add this option to gcc: -m$instruction" 21 | fi 22 | -------------------------------------------------------------------------------- /_cmake/targets/_validation.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cpu._validation 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.validation.cpu._validation") 5 | 
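# This file follows a pattern reused by the other files in _cmake/targets:
# a static helper library (lib_validation_cpp) is built first, then linked
# into both the pybind11 module (_validation) and a standalone C++ test
# executable registered with add_test.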
6 | add_library(lib_validation_cpp STATIC 7 | ../onnx_extended/validation/cpu/murmur_hash3.cpp 8 | ../onnx_extended/validation/cpu/speed_metrics.cpp) 9 | target_compile_definitions(lib_validation_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 10 | target_include_directories(lib_validation_cpp PRIVATE "${ROOT_INCLUDE_PATH}") 11 | set_property(TARGET lib_validation_cpp PROPERTY POSITION_INDEPENDENT_CODE ON) 12 | 13 | local_pybind11_add_module( 14 | _validation OpenMP::OpenMP_CXX 15 | ../onnx_extended/validation/cpu/_validation.cpp 16 | ../onnx_extended/validation/cpu/vector_sparse.cpp) 17 | message(STATUS " LINK _validation <- lib_validation_cpp") 18 | target_include_directories(_validation PRIVATE "${ROOT_INCLUDE_PATH}") 19 | target_link_libraries(_validation PRIVATE lib_validation_cpp common) 20 | 21 | add_executable( 22 | test_validation_cpp 23 | ../_unittests/ut_validation/test_cpu_fpemu.cpp) 24 | target_compile_definitions(test_validation_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 25 | target_include_directories( 26 | test_validation_cpp 27 | PRIVATE 28 | "${ROOT_PROJECT_PATH}" 29 | "${ROOT_INCLUDE_PATH}" 30 | "${ROOT_UNITTEST_PATH}") 31 | message(STATUS " LINK test_validation_cpp <- lib_validation_cpp") 32 | target_link_libraries( 33 | test_validation_cpp 34 | PRIVATE 35 | lib_validation_cpp 36 | common) 37 | add_test(NAME test_validation_cpp COMMAND test_validation_cpp) 38 | -------------------------------------------------------------------------------- /_cmake/targets/_validation_cuda_example_py.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cuda.cuda_example_py 3 | # 4 | if(CUDA_AVAILABLE) 5 | 6 | message(STATUS "+ PYBIND11 CUDA onnx_extended.validation.cuda.cuda_example_py") 7 | 8 | cuda_pybind11_add_module( 9 | cuda_example_py 10 | ../onnx_extended/validation/cuda/cuda_example_py.cpp 11 | ../onnx_extended/validation/cuda/cuda_fpemu.cu 12 | ../onnx_extended/validation/cuda/cuda_tensor.cu 13 | ../onnx_extended/validation/cuda/cuda_gemm.cu) 14 | 15 | target_include_directories(cuda_example_py PRIVATE ${ROOT_INCLUDE_PATH}) 16 | target_link_libraries(cuda_example_py PRIVATE common) 17 | 18 | endif() 19 | -------------------------------------------------------------------------------- /_cmake/targets/_validation_cuda_monitor.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cuda.cuda_monitor 3 | # 4 | if(CUDA_AVAILABLE) 5 | 6 | message(STATUS "+ PYBIND11 CUDA onnx_extended.validation.cuda.cuda_monitor") 7 | 8 | cuda_pybind11_add_module( 9 | cuda_monitor 10 | ../onnx_extended/validation/cuda/cuda_monitor.cpp) 11 | 12 | target_include_directories(cuda_monitor PRIVATE ${ROOT_INCLUDE_PATH}) 13 | target_link_libraries(cuda_monitor PRIVATE common CUDA::nvml) 14 | 15 | endif() 16 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_conv_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_conv_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_conv_") 5 | 6 | local_pybind11_add_module( 7 | c_op_conv_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_conv_.cpp) 9 | eigen_add_dependency(c_op_conv_) 10 | 11 | target_link_libraries(c_op_conv_ PRIVATE common_kernels common) 12 | target_include_directories(c_op_conv_ PRIVATE 
${ROOT_INCLUDE_PATH}) 13 | 14 | add_executable(test_c_op_conv_cpp ../_unittests/ut_reference/test_c_op_conv.cpp) 15 | target_compile_definitions(test_c_op_conv_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 16 | target_link_libraries(test_c_op_conv_cpp PRIVATE common_kernels common) 17 | target_include_directories( 18 | test_c_op_conv_cpp 19 | PRIVATE 20 | ${ROOT_INCLUDE_PATH} 21 | ${ROOT_UNITTEST_PATH}) 22 | 23 | eigen_add_dependency(test_c_op_conv_cpp) 24 | 25 | add_test(NAME test_c_op_conv_cpp COMMAND test_c_op_conv_cpp) 26 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_svm_py_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_svm_py_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_svm_py_") 5 | 6 | local_pybind11_add_module( 7 | c_op_svm_py_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_svm_py_.cpp) 9 | 10 | target_link_libraries(c_op_svm_py_ PRIVATE common_kernels common) 11 | 12 | target_include_directories(c_op_svm_py_ PRIVATE ${ROOT_INCLUDE_PATH}) 13 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_tfidf_vectorizer_py_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_tfidf_vectorizer_py_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_tfidf_vectorizer_py_") 5 | 6 | local_pybind11_add_module( 7 | c_op_tfidf_vectorizer_py_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_tfidf_vectorizer_py_.cpp) 9 | 10 | target_link_libraries(c_op_tfidf_vectorizer_py_ PRIVATE common_kernels common) 11 | 12 | target_include_directories(c_op_tfidf_vectorizer_py_ PRIVATE ${ROOT_INCLUDE_PATH}) 13 | 14 | -------------------------------------------------------------------------------- /_cmake/targets/c_op_tree_ensemble_py_.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.reference.c_ops.cpu.c_op_tree_ensemble_py_ 3 | # 4 | message(STATUS "+ PYBIND11 onnx_extended.reference.c_ops.cpu.c_op_tree_ensemble_py_") 5 | 6 | local_pybind11_add_module( 7 | c_op_tree_ensemble_py_ OpenMP::OpenMP_CXX 8 | ../onnx_extended/reference/c_ops/cpu/c_op_tree_ensemble_py_.cpp) 9 | 10 | target_link_libraries(c_op_tree_ensemble_py_ PRIVATE common_kernels common) 11 | 12 | target_include_directories(c_op_tree_ensemble_py_ PRIVATE ${ROOT_INCLUDE_PATH}) 13 | 14 | -------------------------------------------------------------------------------- /_cmake/targets/common.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: common C++ libraries 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.common") 5 | add_library(common STATIC ../onnx_extended/cpp/onnx_extended_helpers.cpp) 6 | target_compile_definitions(common PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 7 | target_include_directories(common PRIVATE "${ROOT_INCLUDE_PATH}") 8 | -------------------------------------------------------------------------------- /_cmake/targets/common_kernels.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: common C++ libraries 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.common_kernels") 5 | add_library( 6 | common_kernels 7 | STATIC 8 | ../onnx_extended/cpp/c_op_allocation.cpp 
9 | ../onnx_extended/cpp/c_op_common_parameters.cpp) 10 | target_compile_definitions(common_kernels PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 11 | target_include_directories(common_kernels PRIVATE "${ROOT_INCLUDE_PATH}") 12 | -------------------------------------------------------------------------------- /_cmake/targets/fp8_cy.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.validation.cython.fp8 3 | # 4 | message(STATUS "+ CYTHON onnx_extended.validation.cython.fp8") 5 | 6 | cython_add_module( 7 | fp8 8 | ../onnx_extended/validation/cython/fp8.pyx 9 | OpenMP::OpenMP_CXX) 10 | 11 | target_include_directories(fp8 PRIVATE ${ROOT_INCLUDE_PATH}) 12 | -------------------------------------------------------------------------------- /_cmake/targets/ortinf.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortcy.wrap.ortapi 3 | # 4 | message(STATUS "+ CYTHON onnx_extended.ortcy.wrap.ortapi") 5 | 6 | add_library(lib_ortapi STATIC ../onnx_extended/ortcy/wrap/ortapi.cpp) 7 | target_compile_definitions(lib_ortapi PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 8 | target_include_directories( 9 | lib_ortapi PUBLIC 10 | ${ONNXRUNTIME_INCLUDE_DIR} 11 | ${ROOT_INCLUDE_PATH}) 12 | target_link_libraries(lib_ortapi PRIVATE common) 13 | 14 | set(ORTAPI_INCLUDE_DIR "${ROOT_PROJECT_PATH}/onnx_extended/ortcy/wrap") 15 | 16 | cython_add_module( 17 | ortinf 18 | ../onnx_extended/ortcy/wrap/ortinf.pyx 19 | OpenMP::OpenMP_CXX) 20 | 21 | message(STATUS " LINK ortinf <- lib_ortapi onnxruntime ${ORTAPI_INCLUDE_DIR}") 22 | 23 | ort_add_dependency( 24 | ortinf 25 | onnx_extended/ortcy/wrap) 26 | 27 | # If ONNXRUNTIME_LIB_DIR is used, then it seems a local installation 28 | # does not find the binaries anymore if they are removed. 
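# ort_add_dependency above (defined in _cmake/externals/FindOrt.cmake) is
# presumably what copies the onnxruntime binaries next to the built module
# in onnx_extended/ortcy/wrap so they can be located at runtime.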
29 | target_link_directories(ortinf PRIVATE ${ORTAPI_INCLUDE_DIR}) 30 | 31 | target_link_libraries( 32 | ortinf 33 | PRIVATE 34 | lib_ortapi 35 | onnxruntime 36 | common_kernels) 37 | target_include_directories(ortinf PRIVATE ${ROOT_INCLUDE_PATH}) 38 | 39 | add_executable(test_ortcy_inference_cpp ../_unittests/ut_ortcy/test_inference.cpp) 40 | target_compile_definitions(test_ortcy_inference_cpp PRIVATE PYTHON_MANYLINUX=${PYTHON_MANYLINUX}) 41 | target_include_directories( 42 | test_ortcy_inference_cpp 43 | PRIVATE 44 | ${ROOT_UNITTEST_PATH} 45 | ${ROOT_PROJECT_PATH} 46 | ${ROOT_INCLUDE_PATH} 47 | ${ORT_DIR}/include) 48 | message(STATUS " LINK test_ortcy_inference_cpp <- lib_ortapi onnxruntime") 49 | target_link_directories(test_ortcy_inference_cpp PRIVATE ${ONNXRUNTIME_LIB_DIR}) 50 | target_link_libraries( 51 | test_ortcy_inference_cpp 52 | PRIVATE 53 | lib_ortapi 54 | onnxruntime 55 | common_kernels) 56 | ort_add_dependency(test_ortcy_inference_cpp "") 57 | add_test(NAME test_ortcy_inference_cpp COMMAND test_ortcy_inference_cpp) 58 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_optim_cpu.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortops.optim.cpu 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.ortops.optim.cpu") 5 | 6 | ort_add_custom_op( 7 | ortops_optim_cpu 8 | "CPU" 9 | onnx_extended/ortops/optim/cpu 10 | ../onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc) 11 | 12 | target_include_directories(ortops_optim_cpu PRIVATE ${ROOT_INCLUDE_PATH}) 13 | 14 | target_include_directories( 15 | ortops_optim_cpu 16 | PRIVATE 17 | "${ROOT_INCLUDE_PATH}" 18 | "${ORTAPI_INCLUDE_DIR}" 19 | "${ORTOPS_INCLUDE_DIR}") 20 | 21 | target_link_libraries( 22 | ortops_optim_cpu 23 | PRIVATE 24 | OpenMP::OpenMP_CXX 25 | common_kernels 26 | common) 27 | 28 | add_executable(test_optops_inference_cpp ../_unittests/ut_ortops/test_inference_tree.cpp) 29 | target_compile_definitions( 30 | test_optops_inference_cpp 31 | PRIVATE 32 | PYTHON_MANYLINUX=${PYTHON_MANYLINUX} 33 | TESTED_CUSTOM_OPS_DLL="$<TARGET_FILE:ortops_optim_cpu>") 34 | target_include_directories( 35 | test_optops_inference_cpp 36 | PRIVATE 37 | ${ROOT_UNITTEST_PATH} 38 | ${ROOT_PROJECT_PATH} 39 | ${ROOT_INCLUDE_PATH} 40 | ${ORT_DIR}/include) 41 | message(STATUS " LINK test_optops_inference_cpp <- lib_ortapi onnxruntime") 42 | target_link_directories(test_optops_inference_cpp PRIVATE ${ONNXRUNTIME_LIB_DIR}) 43 | target_link_libraries( 44 | test_optops_inference_cpp 45 | PRIVATE 46 | lib_ortapi 47 | onnxruntime 48 | common_kernels) 49 | ort_add_dependency(test_optops_inference_cpp "") 50 | add_test(NAME test_optops_inference_cpp COMMAND test_optops_inference_cpp) 51 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_optim_cuda.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortops.optim.cuda 3 | # 4 | 5 | if(CUDA_AVAILABLE) 6 | 7 | message(STATUS "+ KERNEL onnx_extended.ortops.optim.cuda") 8 | 9 | ort_add_custom_op( 10 | ortops_optim_cuda 11 | CUDA 12 | onnx_extended/ortops/optim/cuda 13 | ../onnx_extended/cpp/onnx_extended_helpers.cpp 14 | ../onnx_extended/ortops/optim/cuda/addaddmulmul.cu 15 | ../onnx_extended/ortops/optim/cuda/addaddaddmulmulmul.cu 16 | ../onnx_extended/ortops/optim/cuda/addmul.cu 17 | ../onnx_extended/ortops/optim/cuda/add_or_mul_shared_input.cu 18 | 
../onnx_extended/ortops/optim/cuda/mul_sigmoid.cu 19 | ../onnx_extended/ortops/optim/cuda/mul_mul_sigmoid.cu 20 | ../onnx_extended/ortops/optim/cuda/negxplus1.cu 21 | ../onnx_extended/ortops/optim/cuda/replace_zero.cu 22 | ../onnx_extended/ortops/optim/cuda/rotary.cu 23 | ../onnx_extended/ortops/optim/cuda/scatter_nd_of_shape.cu 24 | ../onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_masked.cu 25 | ../onnx_extended/ortops/optim/cuda/submul.cu 26 | ../onnx_extended/ortops/optim/cuda/transpose_cast_2d.cu 27 | ../onnx_extended/ortops/optim/cuda/tri_matrix.cu 28 | ../onnx_extended/ortops/optim/cuda/ort_optim_cuda_lib.cc) 29 | 30 | # needed to include onnx_extended_helpers.h 31 | target_include_directories( 32 | ortops_optim_cuda 33 | PRIVATE 34 | "${ROOT_INCLUDE_PATH}" 35 | "${ORTAPI_INCLUDE_DIR}" 36 | "${ORTOPS_INCLUDE_DIR}") 37 | 38 | endif() 39 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_tutorial_cpu.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # module: onnx_extended.ortops.tutorial.cpu 3 | # 4 | message(STATUS "+ KERNEL onnx_extended.ortops.tutorial.cpu") 5 | 6 | ort_add_custom_op( 7 | ortops_tutorial_cpu 8 | "CPU" 9 | onnx_extended/ortops/tutorial/cpu 10 | ../onnx_extended/ortops/tutorial/cpu/custom_gemm.cc 11 | ../onnx_extended/ortops/tutorial/cpu/custom_tree_assembly.cc 12 | ../onnx_extended/ortops/tutorial/cpu/dynamic_quantize_linear.cc 13 | ../onnx_extended/ortops/tutorial/cpu/my_kernel.cc 14 | ../onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc 15 | ../onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc) 16 | 17 | # needed to include onnx_extended_helpers.h 18 | target_include_directories( 19 | ortops_tutorial_cpu 20 | PRIVATE 21 | "${ROOT_INCLUDE_PATH}" 22 | "${ORTAPI_INCLUDE_DIR}" 23 | "${ORTOPS_INCLUDE_DIR}") 24 | 25 | eigen_add_dependency(ortops_tutorial_cpu) 26 | 27 | target_link_libraries( 28 | ortops_tutorial_cpu 29 | PRIVATE 30 | OpenMP::OpenMP_CXX 31 | common_kernels 32 | common) 33 | -------------------------------------------------------------------------------- /_cmake/targets/ortops_tutorial_cuda.cmake: -------------------------------------------------------------------------------- 1 | # 2 | # custom ops: onnx_extended.ortops.tutorial.cuda 3 | # 4 | 5 | if(CUDA_AVAILABLE) 6 | 7 | message(STATUS "+ KERNEL onnx_extended.ortops.tutorial.cuda") 8 | 9 | ort_add_custom_op( 10 | ortops_tutorial_cuda 11 | CUDA 12 | onnx_extended/ortops/tutorial/cuda 13 | ../onnx_extended/cpp/onnx_extended_helpers.cpp 14 | ../onnx_extended/ortops/tutorial/cuda/custom_gemm.cu 15 | ../onnx_extended/ortops/tutorial/cuda/matx_matmul.cu 16 | ../onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc) 17 | 18 | # needed to include onnx_extended_helpers.h 19 | target_include_directories( 20 | ortops_tutorial_cuda 21 | PRIVATE 22 | "${ROOT_INCLUDE_PATH}" 23 | "${ORTAPI_INCLUDE_DIR}" 24 | "${ORTOPS_INCLUDE_DIR}" 25 | "${MATX_INCLUDE_DIR}") 26 | 27 | target_link_libraries(ortops_tutorial_cuda PRIVATE matx::matx) 28 | 29 | endif() 30 | -------------------------------------------------------------------------------- /_cmake/test_constants.h.in: -------------------------------------------------------------------------------- 1 | #define TEST_FOLDER "${TEST_FOLDER}" 2 | -------------------------------------------------------------------------------- /_doc/_static/logo.png: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/logo.png -------------------------------------------------------------------------------- /_doc/_static/profile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/profile.png -------------------------------------------------------------------------------- /_doc/_static/vector_sum6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/vector_sum6.png -------------------------------------------------------------------------------- /_doc/_static/vector_sum6_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/_static/vector_sum6_results.png -------------------------------------------------------------------------------- /_doc/api/check.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | onnx_extended.__init__.py 4 | ========================= 5 | 6 | check_installation 7 | ================== 8 | 9 | .. autofunction:: onnx_extended.check_installation 10 | 11 | compiled_with_cuda 12 | ================== 13 | 14 | .. autofunction:: onnx_extended.compiled_with_cuda 15 | 16 | cuda_version 17 | ============ 18 | 19 | .. autofunction:: onnx_extended.cuda_version 20 | 21 | cuda_version_int 22 | ================ 23 | 24 | .. autofunction:: onnx_extended.cuda_version_int 25 | 26 | get_cxx_flags 27 | ============= 28 | 29 | .. autofunction:: onnx_extended.get_cxx_flags 30 | 31 | get_stdcpp 32 | ========== 33 | 34 | .. autofunction:: onnx_extended.get_stdcpp 35 | 36 | has_cuda 37 | ======== 38 | 39 | .. autofunction:: onnx_extended.has_cuda 40 | -------------------------------------------------------------------------------- /_doc/api/ext_test_case.rst: -------------------------------------------------------------------------------- 1 | 2 | =========================== 3 | onnx_extended.ext_test_case 4 | =========================== 5 | 6 | Various helpers to help develop the package. 7 | 8 | ExtTestCase 9 | =========== 10 | 11 | .. autoclass:: onnx_extended.ext_test_case.ExtTestCase 12 | :members: 13 | 14 | ignore_warnings 15 | =============== 16 | 17 | .. autofunction:: onnx_extended.ext_test_case.ignore_warnings 18 | 19 | measure_time 20 | ============ 21 | 22 | .. autofunction:: onnx_extended.ext_test_case.measure_time 23 | -------------------------------------------------------------------------------- /_doc/api/helper.rst: -------------------------------------------------------------------------------- 1 | 2 | ==================== 3 | onnx_extended.helper 4 | ==================== 5 | 6 | make_dynamic_quantize_linear_function_proto 7 | =========================================== 8 | 9 | .. autofunction:: onnx_extended.helper.make_dynamic_quantize_linear_function_proto 10 | 11 | make_simple_dynamic_quantize_linear_function_proto 12 | ================================================== 13 | 14 | .. 
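The ``measure_time`` function documented in ``ext_test_case`` above can be
used as follows. This is a minimal sketch; the keyword arguments ``repeat``
and ``number`` are assumptions about the signature:

.. code-block:: python

    import numpy
    from onnx_extended.ext_test_case import measure_time

    m = numpy.random.rand(64, 64)
    # runs the callable several times and returns a dictionary of
    # statistics (average, deviation, ...); repeat/number are assumed kwargs
    stat = measure_time(lambda: m @ m, repeat=10, number=50)
    print(stat)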
autofunction:: onnx_extended.helper.make_simple_dynamic_quantize_linear_function_proto 15 | -------------------------------------------------------------------------------- /_doc/api/index.rst: -------------------------------------------------------------------------------- 1 | 2 | === 3 | API 4 | === 5 | 6 | .. toctree:: 7 | :maxdepth: 1 8 | 9 | check 10 | ext_test_case 11 | memory_peak 12 | helper 13 | ortcy 14 | ortops 15 | plotting 16 | reference 17 | validation 18 | tools 19 | -------------------------------------------------------------------------------- /_doc/api/memory_peak.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | onnx_extended.memory_peak 4 | ========================= 5 | 6 | get_memory_rss 7 | ============== 8 | 9 | .. autofunction:: onnx_extended.memory_peak.get_memory_rss 10 | 11 | MemorySpy 12 | ========= 13 | 14 | .. autoclass:: onnx_extended.memory_peak.MemorySpy 15 | :members: 16 | 17 | start_spying_on 18 | =============== 19 | 20 | .. autofunction:: onnx_extended.memory_peak.start_spying_on 21 | -------------------------------------------------------------------------------- /_doc/api/ortcy.rst: -------------------------------------------------------------------------------- 1 | 2 | =================== 3 | onnx_extended.ortcy 4 | =================== 5 | 6 | It supports any onnxruntime C API greater than version: 7 | 8 | .. runpython:: 9 | :showcode: 10 | 11 | from onnx_extended.ortcy.wrap.ortinf import get_ort_c_api_supported_version 12 | 13 | print(get_ort_c_api_supported_version()) 14 | 15 | get_ort_c_api_supported_version 16 | +++++++++++++++++++++++++++++++ 17 | 18 | .. autofunction:: onnx_extended.ortcy.wrap.ortinf.get_ort_c_api_supported_version 19 | 20 | ort_get_available_providers 21 | =========================== 22 | 23 | .. autofunction:: onnx_extended.ortcy.wrap.ortinf.ort_get_available_providers 24 | 25 | OrtSession 26 | ========== 27 | 28 | .. autoclass:: onnx_extended.ortcy.wrap.ortinf.OrtSession 29 | :members: 30 | -------------------------------------------------------------------------------- /_doc/api/ortops.rst: -------------------------------------------------------------------------------- 1 | 2 | ==================== 3 | onnx_extended.ortops 4 | ==================== 5 | 6 | It supports any onnxruntime C API greater than version: 7 | 8 | .. runpython:: 9 | :showcode: 10 | 11 | from onnx_extended.ortcy.wrap.ortinf import get_ort_c_api_supported_version 12 | 13 | print(get_ort_c_api_supported_version()) 14 | 15 | .. toctree:: 16 | :maxdepth: 2 17 | 18 | ortops_tutorial_cpu 19 | ortops_tutorial_cuda 20 | ortops_optim_cpu 21 | ortops_optim_cuda 22 | -------------------------------------------------------------------------------- /_doc/api/ortops_optim_cpu.rst: -------------------------------------------------------------------------------- 1 | 2 | ============================== 3 | onnx_extended.ortops.optim.cpu 4 | ============================== 5 | 6 | change_onnx_operator_domain 7 | =========================== 8 | 9 | .. autofunction:: onnx_extended.ortops.optim.optimize.change_onnx_operator_domain 10 | 11 | get_ort_ext_libs 12 | ================ 13 | 14 | .. autofunction:: onnx_extended.ortops.optim.cpu.get_ort_ext_libs 15 | 16 | **List of implemented kernels** 17 | 18 | .. 
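A minimal sketch showing how the library returned by ``get_ort_ext_libs``
can be registered with onnxruntime ("model.onnx" is a placeholder for a
model using one of the kernels listed just below):

.. code-block:: python

    from onnxruntime import InferenceSession, SessionOptions
    from onnx_extended.ortops.optim.cpu import get_ort_ext_libs

    opts = SessionOptions()
    # get_ort_ext_libs() returns the paths of the compiled shared libraries
    opts.register_custom_ops_library(get_ort_ext_libs()[0])
    sess = InferenceSession("model.onnx", opts, providers=["CPUExecutionProvider"])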
runpython:: 19 | :showcode: 20 | :rst: 21 | 22 | from onnx_extended.ortops.optim.cpu import documentation 23 | print("\n".join(documentation())) 24 | 25 | optimize_model 26 | ============== 27 | 28 | .. autofunction:: onnx_extended.ortops.optim.optimize.optimize_model 29 | -------------------------------------------------------------------------------- /_doc/api/ortops_optim_cuda.rst: -------------------------------------------------------------------------------- 1 | 2 | =============================== 3 | onnx_extended.ortops.optim.cuda 4 | =============================== 5 | 6 | get_ort_ext_libs 7 | ================ 8 | 9 | .. autofunction:: onnx_extended.ortops.optim.cuda.get_ort_ext_libs 10 | 11 | **List of implemented kernels** 12 | 13 | .. runpython:: 14 | :showcode: 15 | :rst: 16 | 17 | from onnx_extended.ortops.optim.cuda import documentation 18 | print("\n".join(documentation())) 19 | -------------------------------------------------------------------------------- /_doc/api/ortops_tutorial_cpu.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================= 3 | onnx_extended.ortops.tutorial.cpu 4 | ================================= 5 | 6 | get_ort_ext_libs 7 | ================ 8 | 9 | .. autofunction:: onnx_extended.ortops.tutorial.cpu.get_ort_ext_libs 10 | 11 | **List of implemented kernels** 12 | 13 | .. runpython:: 14 | :showcode: 15 | :rst: 16 | 17 | from onnx_extended.ortops.tutorial.cpu import documentation 18 | print("\n".join(documentation())) 19 | -------------------------------------------------------------------------------- /_doc/api/ortops_tutorial_cuda.rst: -------------------------------------------------------------------------------- 1 | 2 | ================================== 3 | onnx_extended.ortops.tutorial.cuda 4 | ================================== 5 | 6 | get_ort_ext_libs 7 | ================ 8 | 9 | .. autofunction:: onnx_extended.ortops.tutorial.cuda.get_ort_ext_libs 10 | 11 | **List of implemented kernels** 12 | 13 | .. runpython:: 14 | :showcode: 15 | :rst: 16 | 17 | from onnx_extended.ortops.tutorial.cuda import documentation 18 | print("\n".join(documentation())) 19 | -------------------------------------------------------------------------------- /_doc/api/plotting.rst: -------------------------------------------------------------------------------- 1 | 2 | ====================== 3 | onnx_extended.plotting 4 | ====================== 5 | 6 | onnx_extended.plotting.benchmark.hhistograms 7 | ============================================ 8 | 9 | .. autofunction:: onnx_extended.plotting.benchmark.hhistograms 10 | 11 | onnx_extended.plotting.benchmark.vhistograms 12 | ============================================ 13 | 14 | .. autofunction:: onnx_extended.plotting.benchmark.vhistograms 15 | -------------------------------------------------------------------------------- /_doc/api/reference.rst: -------------------------------------------------------------------------------- 1 | 2 | ======================= 3 | onnx_extended.reference 4 | ======================= 5 | 6 | CReferenceEvaluator 7 | =================== 8 | 9 | .. autoclass:: onnx_extended.reference.CReferenceEvaluator 10 | :members: input_names, output_names, opsets, run 11 | 12 | Backend 13 | ======= 14 | 15 | .. autofunction:: onnx_extended.reference.c_reference_backend.create_reference_backend 16 | 17 | .. autoclass:: onnx_extended.reference.c_reference_backend.CReferenceEvaluatorBackend 18 | :members: 19 | 20 | .. 
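``CReferenceEvaluator`` above mirrors the API of
``onnx.reference.ReferenceEvaluator``. A minimal sketch, assuming ``onx`` is
a ModelProto with a single float input named "X":

.. code-block:: python

    import numpy
    from onnx_extended.reference import CReferenceEvaluator

    ref = CReferenceEvaluator(onx)
    x = numpy.random.rand(2, 2).astype(numpy.float32)
    # run returns the list of computed outputs
    got = ref.run(None, {"X": x})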
autoclass:: onnx_extended.reference.c_reference_backend.CReferenceEvaluatorBackendRep 21 | :members: 22 | 23 | .. autoclass:: onnx_extended.reference.c_reference_backend.Runner 24 | :members: 25 | 26 | Tools 27 | ===== 28 | 29 | .. autofunction:: onnx_extended.reference.from_array_extended 30 | 31 | .. autofunction:: onnx_extended.reference.to_array_extended 32 | 33 | Operators 34 | ========= 35 | 36 | ai.onnx 37 | +++++++ 38 | 39 | .. autoclass:: onnx_extended.reference.c_ops.c_op_conv.Conv 40 | 41 | ai.onnx.ml 42 | ++++++++++ 43 | 44 | .. autoclass:: onnx_extended.reference.c_ops.c_op_svm_classifier.SVMClassifier 45 | 46 | .. autoclass:: onnx_extended.reference.c_ops.c_op_svm_regressor.SVMRegressor 47 | 48 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tfidf_vectorizer.TfIdfVectorizer 49 | 50 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier.TreeEnsembleClassifier_1 51 | 52 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_classifier.TreeEnsembleClassifier_3 53 | 54 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor.TreeEnsembleRegressor_1 55 | 56 | .. autoclass:: onnx_extended.reference.c_ops.c_op_tree_ensemble_regressor.TreeEnsembleRegressor_3 57 | -------------------------------------------------------------------------------- /_doc/api/tools.rst: -------------------------------------------------------------------------------- 1 | 2 | ===== 3 | tools 4 | ===== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | tools_io 10 | tools_einsum 11 | tools_graph 12 | tools_graph_transformer 13 | tools_inline 14 | tools_nodes 15 | tools_stats 16 | tools_other 17 | -------------------------------------------------------------------------------- /_doc/api/tools_einsum.rst: -------------------------------------------------------------------------------- 1 | ========================== 2 | onnx_extended.tools.einsum 3 | ========================== 4 | 5 | Decomposition of Einsum into simple operations. 6 | 7 | analyse_einsum_equation 8 | ======================= 9 | 10 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl.analyse_einsum_equation 11 | 12 | apply_einsum_sequence 13 | ===================== 14 | 15 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl.apply_einsum_sequence 16 | 17 | CachedEinsum 18 | ============ 19 | 20 | .. autoclass:: onnx_extended.tools.einsum.einsum_fct.CachedEinsum 21 | :members: 22 | 23 | compute_transposition_features 24 | ============================== 25 | 26 | .. autofunction:: onnx_extended.tools.einsum.einsum_ml.compute_transposition_features 27 | 28 | decompose_einsum_equation 29 | ========================= 30 | 31 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl.decompose_einsum_equation 32 | 33 | einsum 34 | ====== 35 | 36 | .. autofunction:: onnx_extended.tools.einsum.einsum_fct.einsum 37 | 38 | einsum_benchmark 39 | ================ 40 | 41 | .. autofunction:: onnx_extended.tools.einsum.einsum_bench.einsum_benchmark 42 | 43 | numpy_extended_dot 44 | ================== 45 | 46 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl_ext.numpy_extended_dot 47 | 48 | numpy_extended_dot_matrix 49 | ========================= 50 | 51 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl_ext.numpy_extended_dot_matrix 52 | 53 | numpy_extended_dot_python 54 | ========================= 55 | 56 | .. autofunction:: onnx_extended.tools.einsum.einsum_impl_ext.numpy_extended_dot_python 57 | 58 | EinsumSubOp 59 | =========== 60 | 61 | .. 
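A minimal sketch of the decomposition functions documented above, assuming
they accept the equation and the inputs in this way:

.. code-block:: python

    import numpy
    from onnx_extended.tools.einsum.einsum_impl import (
        apply_einsum_sequence,
        decompose_einsum_equation,
    )

    m1 = numpy.arange(6).reshape((2, 3)).astype(numpy.float32)
    m2 = numpy.arange(12).reshape((3, 4)).astype(numpy.float32)
    # builds a sequence of simple operations equivalent to the einsum equation
    seq = decompose_einsum_equation("ij,jk->ik")
    # applies it to concrete matrices
    res = apply_einsum_sequence(seq, m1, m2)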
autoclass:: onnx_extended.tools.einsum.einsum_impl_classes.EinsumSubOp 62 | :members: 63 | 64 | GraphEinsumSubOp 65 | ================ 66 | 67 | .. autoclass:: onnx_extended.tools.einsum.einsum_impl_classes.GraphEinsumSubOp 68 | :members: 69 | 70 | OnnxMicroRuntime 71 | ================ 72 | 73 | .. autoclass:: onnx_extended.tools.einsum.einsum_fct.OnnxMicroRuntime 74 | :members: 75 | 76 | optimize_decompose_einsum_equation 77 | ================================== 78 | 79 | .. autofunction:: onnx_extended.tools.einsum.einsum_fct.optimize_decompose_einsum_equation 80 | 81 | predict_transposition_cost 82 | ========================== 83 | 84 | .. autofunction:: onnx_extended.tools.einsum.einsum_ml.predict_transposition_cost 85 | 86 | -------------------------------------------------------------------------------- /_doc/api/tools_graph.rst: -------------------------------------------------------------------------------- 1 | 2 | ========================= 3 | onnx_extended.tools.graph 4 | ========================= 5 | 6 | NodeKind 7 | ======== 8 | 9 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.NodeKind 10 | 11 | Node 12 | ==== 13 | 14 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.Node 15 | :members: 16 | 17 | NodeWithSubGraph 18 | ================ 19 | 20 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.NodeWithSubGraph 21 | :members: 22 | 23 | NodeSet 24 | ======= 25 | 26 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_struct.NodeSet 27 | :members: 28 | 29 | Graph 30 | ===== 31 | 32 | .. autoclass:: onnx_extended.tools.graph.Graph 33 | :members: 34 | -------------------------------------------------------------------------------- /_doc/api/tools_graph_transformer.rst: -------------------------------------------------------------------------------- 1 | ================================================ 2 | onnx_extended.tools.graph.onnx_graph_transformer 3 | ================================================ 4 | 5 | cast_constant 6 | ============= 7 | 8 | .. autofunction:: onnx_extended.tools.graph.cast_constant 9 | 10 | QuantizeOptions 11 | =============== 12 | 13 | .. autoclass:: onnx_extended.tools.graph.QuantizeOptions 14 | :members: 15 | 16 | quantize_float8 17 | =============== 18 | 19 | .. autofunction:: onnx_extended.tools.graph.quantize_float8 20 | 21 | TransformResults 22 | ================ 23 | 24 | .. autoclass:: onnx_extended.tools.graph.onnx_graph_transformer.TransformResults 25 | :members: 26 | 27 | QuantizationError 28 | ================= 29 | 30 | .. autoclass:: onnx_extended.tools.graph.QuantizationError 31 | -------------------------------------------------------------------------------- /_doc/api/tools_inline.rst: -------------------------------------------------------------------------------- 1 | 2 | =============================== 3 | onnx_extended.tools.onnx_inline 4 | =============================== 5 | 6 | onnx_inline_function 7 | ==================== 8 | 9 | .. autofunction:: onnx_extended.tools.onnx_inline.onnx_inline_function 10 | -------------------------------------------------------------------------------- /_doc/api/tools_io.rst: -------------------------------------------------------------------------------- 1 | 2 | =========================== 3 | onnx_extended.tools.onnx_io 4 | =========================== 5 | 6 | enumerate_model_tensors 7 | ======================= 8 | 9 | .. autofunction:: onnx_extended.tools.enumerate_model_tensors 10 | 11 | load_external 12 | ============= 13 | 14 | .. 
autofunction:: onnx_extended.tools.load_external 15 | 16 | load_model 17 | ========== 18 | 19 | .. autofunction:: onnx_extended.tools.load_model 20 | 21 | onnx2string 22 | =========== 23 | 24 | .. autofunction:: onnx_extended.tools.onnx_io.onnx2string 25 | 26 | save_model 27 | ========== 28 | 29 | .. autofunction:: onnx_extended.tools.save_model 30 | 31 | string2onnx 32 | =========== 33 | 34 | .. autofunction:: onnx_extended.tools.onnx_io.string2onnx 35 | 36 | -------------------------------------------------------------------------------- /_doc/api/tools_nodes.rst: -------------------------------------------------------------------------------- 1 | 2 | ============================== 3 | onnx_extended.tools.onnx_nodes 4 | ============================== 5 | 6 | convert_onnx_model 7 | ================== 8 | 9 | .. autofunction:: onnx_extended.tools.onnx_nodes.convert_onnx_model 10 | 11 | enumerate_onnx_node_types 12 | ========================= 13 | 14 | .. autofunction:: onnx_extended.tools.onnx_nodes.enumerate_onnx_node_types 15 | 16 | get_hidden_inputs 17 | ================= 18 | 19 | .. autofunction:: onnx_extended.tools.onnx_nodes.get_hidden_inputs 20 | 21 | multiply_tree 22 | ============= 23 | 24 | .. autofunction:: onnx_extended.tools.onnx_nodes.multiply_tree 25 | 26 | onnx_merge_models 27 | ================= 28 | 29 | .. autofunction:: onnx_extended.tools.onnx_nodes.onnx_merge_models 30 | 31 | onnx_remove_node_unused 32 | ======================= 33 | 34 | .. autofunction:: onnx_extended.tools.onnx_nodes.onnx_remove_node_unused 35 | 36 | select_model_inputs_outputs 37 | =========================== 38 | 39 | .. autofunction:: onnx_extended.tools.onnx_nodes.select_model_inputs_outputs 40 | -------------------------------------------------------------------------------- /_doc/api/tools_other.rst: -------------------------------------------------------------------------------- 1 | 2 | =================== 3 | onnx_extended.tools 4 | =================== 5 | 6 | onnx_extended.tools.ort_debug 7 | ============================= 8 | 9 | enumerate_ort_run 10 | +++++++++++++++++ 11 | 12 | .. autofunction:: onnx_extended.tools.ort_debug.enumerate_ort_run 13 | 14 | onnx_extended.tools.js_profile 15 | ============================== 16 | 17 | js_profile_to_dataframe 18 | +++++++++++++++++++++++ 19 | 20 | .. autofunction:: onnx_extended.tools.js_profile.js_profile_to_dataframe 21 | 22 | plot_ort_profile 23 | ++++++++++++++++ 24 | 25 | .. autofunction:: onnx_extended.tools.js_profile.plot_ort_profile 26 | 27 | plot_ort_profile_timeline 28 | +++++++++++++++++++++++++ 29 | 30 | .. autofunction:: onnx_extended.tools.js_profile.plot_ort_profile_timeline 31 | 32 | onnx_extended.tools.run_onnx 33 | ============================ 34 | 35 | save_for_benchmark_or_test 36 | ++++++++++++++++++++++++++ 37 | 38 | .. autofunction:: onnx_extended.tools.run_onnx.save_for_benchmark_or_test 39 | 40 | bench_virtual 41 | +++++++++++++ 42 | 43 | .. autofunction:: onnx_extended.tools.run_onnx.bench_virtual 44 | 45 | TestRun 46 | +++++++ 47 | 48 | .. autoclass:: onnx_extended.tools.run_onnx.TestRun 49 | :members: 50 | -------------------------------------------------------------------------------- /_doc/api/tools_stats.rst: -------------------------------------------------------------------------------- 1 | 2 | =============================== 3 | onnx_extended.tools.stats_nodes 4 | =============================== 5 | 6 | enumerate_nodes 7 | =============== 8 | 9 | .. 
autofunction:: onnx_extended.tools.stats_nodes.enumerate_nodes 10 | 11 | enumerate_stats_nodes 12 | ===================== 13 | 14 | .. autofunction:: onnx_extended.tools.stats_nodes.enumerate_stats_nodes 15 | 16 | HistStatistics 17 | ============== 18 | 19 | .. autoclass:: onnx_extended.tools.stats_nodes.HistStatistics 20 | :members: 21 | 22 | HistTreeStatistics 23 | ================== 24 | 25 | .. autoclass:: onnx_extended.tools.stats_nodes.HistTreeStatistics 26 | :members: 27 | 28 | NodeStatistics 29 | ============== 30 | 31 | .. autoclass:: onnx_extended.tools.stats_nodes.NodeStatistics 32 | :members: 33 | 34 | stats_tree_ensemble 35 | =================== 36 | 37 | .. autofunction:: onnx_extended.tools.stats_nodes.stats_tree_ensemble 38 | 39 | TreeStatistics 40 | ============== 41 | 42 | .. autoclass:: onnx_extended.tools.stats_nodes.TreeStatistics 43 | :members: 44 | -------------------------------------------------------------------------------- /_doc/api/validation.rst: -------------------------------------------------------------------------------- 1 | 2 | ========== 3 | validation 4 | ========== 5 | 6 | .. toctree:: 7 | :maxdepth: 2 8 | 9 | validation_cpu 10 | validation_cuda 11 | validation_sparse 12 | validation_trees 13 | -------------------------------------------------------------------------------- /_doc/api/validation_cpu.rst: -------------------------------------------------------------------------------- 1 | 2 | ============== 3 | validation.cpu 4 | ============== 5 | 6 | 7 | C API 8 | ===== 9 | 10 | _validation 11 | +++++++++++ 12 | 13 | .. autoclass:: onnx_extended.validation.cpu._validation.ElementTime 14 | 15 | .. autofunction:: onnx_extended.validation.cpu._validation.benchmark_cache 16 | 17 | .. autofunction:: onnx_extended.validation.cpu._validation.benchmark_cache_tree 18 | 19 | .. autofunction:: onnx_extended.validation.cpu._validation.double2float_rn 20 | 21 | .. autofunction:: onnx_extended.validation.cpu._validation.murmurhash3_bytes_s32 22 | 23 | .. autofunction:: onnx_extended.validation.cpu._validation.float2half_rn 24 | 25 | .. autofunction:: onnx_extended.validation.cpu._validation.half2float 26 | 27 | .. autofunction:: onnx_extended.validation.cpu._validation.has_sse3 28 | -------------------------------------------------------------------------------- /_doc/api/validation_cuda.rst: -------------------------------------------------------------------------------- 1 | 2 | =============== 3 | validation.cuda 4 | =============== 5 | 6 | C API 7 | ===== 8 | 9 | cuda_example_py 10 | +++++++++++++++ 11 | 12 | .. runpython:: 13 | :rst: 14 | 15 | from onnx_extended import has_cuda 16 | 17 | if not has_cuda(): 18 | print( 19 | "The documentation was not compiled with CUDA enabled " 20 | "and cannot expose the CUDA functions." 21 | ) 22 | 23 | names = [ 24 | "cuda_device_count", 25 | "cuda_device_memory", 26 | "cuda_devices_memory", 27 | "cuda_version", 28 | "gemm_benchmark_test", 29 | "FpemuMode", 30 | "fpemu_cuda_forward", 31 | ] 32 | names.sort() 33 | classes = {"FpemuMode"} 34 | noindex = {"gemm_benchmark_test"} 35 | 36 | prefix = "onnx_extended.validation.cuda.cuda_example_py." 37 | if has_cuda(): 38 | fct_template = f".. autofunction:: {prefix}%s" 39 | fct_template_no = f".. autofunction:: {prefix}%s\n :noindex:" 40 | cls_template = f".. 
autoclass:: {prefix}%s\n    :members:"
41 |     else:
42 |         fct_template = f"Unable to document function `{prefix}%s`"
43 |         fct_template_no = fct_template
44 |         cls_template = f"Unable to document class `{prefix}%s`"
45 | 
46 |     for name in names:
47 |         tpl = cls_template if name in classes else (
48 |             fct_template_no if name in noindex else fct_template
49 |         )
50 |         print(tpl % name)
51 |         print()
52 | 
53 | cuda_monitor
54 | ++++++++++++
55 | 
56 | .. runpython::
57 |     :rst:
58 | 
59 |     from onnx_extended import has_cuda
60 | 
61 |     if not has_cuda():
62 |         print(
63 |             "The documentation was not compiled with CUDA enabled "
64 |             "and cannot expose the CUDA functions."
65 |         )
66 | 
67 |     names = [
68 |         "cuda_version",
69 |         "nvml_device_get_count",
70 |         "nvml_device_get_memory_info",
71 |         "nvml_init",
72 |         "nvml_shutdown",
73 |     ]
74 |     names.sort()
75 |     noindex = set()
76 |     classes = set()
77 | 
78 |     prefix = "onnx_extended.validation.cuda.cuda_monitor."
79 |     if has_cuda():
80 |         fct_template = f".. autofunction:: {prefix}%s"
81 |         fct_template_no = f".. autofunction:: {prefix}%s\n    :noindex:"
82 |         cls_template = f".. autoclass:: {prefix}%s\n    :members:"
83 |     else:
84 |         fct_template = f"Unable to document function `{prefix}%s`"
85 |         fct_template_no = fct_template
86 |         cls_template = f"Unable to document class `{prefix}%s`"
87 | 
88 |     for name in names:
89 |         tpl = cls_template if name in classes else (
90 |             fct_template_no if name in noindex else fct_template
91 |         )
92 |         print(tpl % name)
93 |         print()
94 | 
--------------------------------------------------------------------------------
/_doc/api/validation_sparse.rst:
--------------------------------------------------------------------------------
1 | 
2 | =================
3 | validation.sparse
4 | =================
5 | 
6 | Design
7 | ======
8 | 
9 | The sparse format defined here is a structure storing indices and values (float)
10 | in a single float array. The beginning of the structure
11 | stores the shape (1D to 5D), the element type and the number of stored
12 | elements. The functions below are used to convert
13 | from dense to sparse and back.
14 | 
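A minimal round trip looks like the following sketch. It assumes
``dense_to_sparse_struct`` takes a 2D float32 array and returns the flat
float array described above, and that ``sparse_struct_to_dense`` converts
it back; check the signatures below for the exact arguments.

.. code-block:: python

    import numpy as np
    from onnx_extended.validation.cpu._validation import (
        dense_to_sparse_struct,
        sparse_struct_to_dense,
    )

    # a mostly empty dense matrix
    dense = np.zeros((4, 8), dtype=np.float32)
    dense[0, 1] = 1.0
    dense[3, 5] = -2.5

    sparse = dense_to_sparse_struct(dense)  # flat float32 buffer
    back = sparse_struct_to_dense(sparse)   # dense array again
    assert np.allclose(dense, back)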
15 | Functions
16 | =========
17 | 
18 | onnx_extended.validation.cpu._validation.dense_to_sparse_struct
19 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
20 | 
21 | .. autofunction:: onnx_extended.validation.cpu._validation.dense_to_sparse_struct
22 | 
23 | onnx_extended.validation.cpu._validation.evaluate_sparse
24 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
25 | 
26 | .. autofunction:: onnx_extended.validation.cpu._validation.evaluate_sparse
27 | 
28 | onnx_extended.validation.cpu._validation.sparse_struct_indices_values
29 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
30 | 
31 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_indices_values
32 | 
33 | onnx_extended.validation.cpu._validation.sparse_struct_to_dense
34 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
35 | 
36 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_to_dense
37 | 
38 | onnx_extended.validation.cpu._validation.sparse_struct_to_csr
39 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
40 | 
41 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_to_csr
42 | 
43 | onnx_extended.validation.cpu._validation.sparse_struct_to_maps
44 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
45 | 
46 | .. autofunction:: onnx_extended.validation.cpu._validation.sparse_struct_to_maps
47 | 
--------------------------------------------------------------------------------
/_doc/api/validation_trees.rst:
--------------------------------------------------------------------------------
1 | 
2 | ======================
3 | validation.bench_trees
4 | ======================
5 | 
6 | onnx_extended.validation.bench_trees.bench_trees
7 | ================================================
8 | 
9 | .. autofunction:: onnx_extended.validation.bench_trees.bench_trees
10 | 
11 | onnx_extended.validation.bench_trees.create_decision_tree
12 | =========================================================
13 | 
14 | .. autofunction:: onnx_extended.validation.bench_trees.create_decision_tree
15 | 
16 | onnx_extended.validation.bench_trees.create_engine
17 | ==================================================
18 | 
19 | .. autofunction:: onnx_extended.validation.bench_trees.create_engine
20 | 
21 | onnx_extended.validation.bench_trees.Engine
22 | ===========================================
23 | 
24 | .. autoclass:: onnx_extended.validation.bench_trees.Engine
25 |     :members:
26 | 
27 | onnx_extended.validation.bench_trees.EngineCython
28 | =================================================
29 | 
30 | .. autoclass:: onnx_extended.validation.bench_trees.EngineCython
31 |     :members:
--------------------------------------------------------------------------------
/_doc/examples/README.txt:
--------------------------------------------------------------------------------
1 | .. _l-example-gallery:
2 | 
3 | Examples Gallery
4 | ================
5 | 
6 | 
7 | 
8 | 
--------------------------------------------------------------------------------
/_doc/license.rst:
--------------------------------------------------------------------------------
1 | LICENSE
2 | =======
3 | 
4 | .. literalinclude:: LICENSE.txt
5 |     :language: none
--------------------------------------------------------------------------------
/_doc/tech/2023-09-05-glibc.rst:
--------------------------------------------------------------------------------
1 | 2023-09-05 - version GLIBCXX_3.4.30 not found
2 | =============================================
3 | 
4 | Some weird issue occurred when importing :epkg:`onnxruntime` after importing :epkg:`pandas`.
5 | 
6 | ::
7 | 
8 |     Python 3.11.4 (main, Jul 5 2023, 13:45:01) [GCC 11.2.0] on linux
9 |     Type "help", "copyright", "credits" or "license" for more information.
10 |     >>> import pandas
11 |     >>> import onnxruntime
12 |     Traceback (most recent call last):
13 |       File "<stdin>", line 1, in <module>
14 |       File ".../github/onnxruntime/build/linux_cuda/Release/onnxruntime/__init__.py", line 56, in <module>
15 |         raise import_capi_exception
16 |       File ".../github/onnxruntime/build/linux_cuda/Release/onnxruntime/__init__.py", line 23, in <module>
17 |         from onnxruntime.capi._pybind_state import ExecutionMode  # noqa: F401
18 |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
19 |       File ".../github/onnxruntime/build/linux_cuda/Release/onnxruntime/capi/_pybind_state.py", line 32, in <module>
20 |         from .onnxruntime_pybind11_state import *  # noqa
21 |         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
22 |     ImportError: .../miniconda3/lib/python3.11/site-packages/numexpr/../../../libstdc++.so.6: version `GLIBCXX_3.4.30' not found (required by onnxruntime/build/linux_cuda/Release/onnxruntime/capi/onnxruntime_pybind11_state.so)
23 |     >>>
24 | 
25 | But the reverse works:
26 | 
27 | ::
28 | 
29 |     import onnxruntime
30 |     import pandas
31 | 
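Importing :epkg:`onnxruntime` before :epkg:`pandas` (or anything pulling in
numexpr) is the simplest fix. Another possible workaround, assuming the
issue comes from the older ``libstdc++.so.6`` shipped with the conda
environment, is to preload the system one (the path may differ):

::

    LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libstdc++.so.6 python script.py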
--------------------------------------------------------------------------------
/_doc/tech/gemm.rst:
--------------------------------------------------------------------------------
1 | Gemm and storage order
2 | ======================
3 | 
4 | **Gemm**
5 | means general matrix multiplication. It is a common routine in linear algebra.
6 | 
7 | 
8 | .. math::
9 | 
10 |     Gemm(A, B, C, tA, tB, \alpha, \beta) = \alpha A^{tA} B^{tB} + \beta C
11 | 
12 | Where :math:`A^{tA}` means *A* if *tA* is 0 and :math:`A'` if *tA* is 1.
13 | The coefficients of a matrix are stored in memory in a one dimension
14 | array *T*: :math:`A(i,j) = T[i * C + j]` where *C* is the number of columns
15 | of matrix A. In that case, the storage is said to be *row major*. In case
16 | :math:`A(i,j) = T[j * R + i]` where *R* is the number of rows,
17 | the storage is *column major*.
18 | 
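For example, with a :math:`2 \times 3` matrix (:math:`R = 2` rows,
:math:`C = 3` columns), the coefficient :math:`A(0, 2)` is stored at
:math:`T[0 * 3 + 2] = T[2]` in row major order but at
:math:`T[2 * 2 + 0] = T[4]` in column major order.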
19 | We define a matrix *A* by :math:`(I, J, M, R)`: it has *I*
20 | rows, *J* columns, the memory buffer is *M* and the matrix order is
21 | *R*. In that case, we can express the transpose of this matrix by:
22 | If :math:`A=(I,J,M,R)`, then :math:`A' = (J,I,M,C)`.
23 | 
24 | Let's use that notation for :math:`A=(I,J,M_A,R)`, :math:`B=(J,K,M_B,R)`
25 | and :math:`C=(I,K,M_C,R)`. We note :math:`D = A^{tA} B^{tB} = (I, K, M_D, R)`.
26 | 
27 | .. math::
28 | 
29 |     \begin{array}{rcl}
30 |     \alpha A^{tA} B^{tB} + \beta C &=& \alpha (I,J,M_A,R)^{tA} (J,K,M_B,R)^{tB} + \beta (I,K,M_C,R) \\
31 |     &=& \left( \alpha (I,J,M_A,R)^{tA} (J,K,M_B,R)^{tB} + \beta (I,K,M_C,R) \right)'' \\
32 |     &=& \left( \alpha (J,K,M_B,R)^{1-tB} (I,J,M_A,R)^{1-tA} + \beta (I,K,M_C,R)' \right)' \\
33 |     &=& \left( \alpha (K,J,M_B,C)^{tB} (J,I,M_A,C)^{tA} + \beta (K,I,M_C,C) \right)' (*)\\
34 |     &=& \left( (K,I,M_D,C) + \beta (K,I,M_C,C) \right)' \\
35 |     &=& (I,K,M_D,R) + \beta (I,K,M_C,R)
36 |     \end{array}
37 | 
38 | This trick can be used to run the computation of matrices using
39 | a column major algorithm instead of a row major algorithm
40 | by using line `(*)` as a replacement.
41 | 
42 | .. math::
43 | 
44 |     \begin{array}{rcl}
45 |     &&\alpha (I,J,M_A,R)^{tA} (J,K,M_B,R)^{tB} + \beta (I,K,M_C,R) \\
46 |     &=& \left( \alpha (K,J,M_B,C)^{tB} (J,I,M_A,C)^{tA} + \beta (K,I,M_C,C) \right)'\\
47 |     &=& \alpha (J,I,M_A,C)^{1-tA}(K,J,M_B,C)^{1-tB} + \beta (K,I,M_C,C)'
48 |     \end{array}
--------------------------------------------------------------------------------
/_doc/tech/index.rst:
--------------------------------------------------------------------------------
1 | Technical Details
2 | =================
3 | 
4 | .. toctree::
5 |     :maxdepth: 1
6 |     :caption: Maths
7 | 
8 |     install_cuda_wsl
9 |     usefulcmd
10 |     gemm
11 | 
12 | .. toctree::
13 |     :maxdepth: 1
14 |     :caption: Issues
15 | 
16 |     2023-09-05-glibc
--------------------------------------------------------------------------------
/_doc/tech/usefulcmd.rst:
--------------------------------------------------------------------------------
1 | Useful commands on Linux
2 | ========================
3 | 
4 | Git
5 | +++
6 | 
7 | * clone: `git clone <repository_url>`
8 | * create a new branch: `git checkout -b <branch_name>`
9 | * add a remote repository: `git remote add <name> <url>`
10 | * merge modifications: `git pull <remote> <branch>`
11 | * add modified files: `git add <file>`
12 | * commit added files: `git commit -m "commit message"`
13 | * push modifications to the remote repository: `git push`
14 | * remove all current modifications: `git reset --hard`
15 | * show modified files: `git status`
16 | 
17 | Retrieve information about the CPU
18 | ++++++++++++++++++++++++++++++++++
19 | 
20 | ::
21 | 
22 |     cat /proc/cpuinfo
23 |     lscpu
24 | 
25 | Retrieve information about the GPU
26 | ++++++++++++++++++++++++++++++++++
27 | 
28 | ::
29 | 
30 |     nvidia-smi
31 | 
32 | Dependencies of a shared library
33 | ++++++++++++++++++++++++++++++++
34 | 
35 | ::
36 | 
37 |     ldd <shared_library>
--------------------------------------------------------------------------------
/_doc/tutorial/build.rst:
--------------------------------------------------------------------------------
1 | 
2 | Build from source
3 | =================
4 | 
5 | The package relies on :epkg:`cmake` to build the C++ extensions,
6 | whether they are wrapped with :epkg:`pybind11` or :epkg:`cython`.
7 | Both options are available and can be linked with :epkg:`openmp`,
8 | :epkg:`eigen`, :epkg:`onnxruntime`, :epkg:`CUDA`.
9 | *cmake* is called from ``setup.py``
10 | with two instructions:
11 | 
12 | * ``python setup.py build_ext --inplace``, the legacy way
13 | * ``pip install -e .``, the new way
14 | 
15 | By default, *cmake* builds with CUDA if it is available. It can be disabled:
16 | 
17 | * ``python setup.py build_ext -v --inplace --with-cuda=0``, the legacy way
18 | * ``pip install -e . -v --config-settings="--with-cuda=0"``, the new way (not fully working yet)
19 | * ``pip install -e . -v --global-option "--with-cuda=0"``, the deprecated way
20 | * ``USE_CUDA=0 pip install -e . -v``, the workaround way
21 | 
22 | In case there are multiple versions of CUDA installed, option `cuda-version`
23 | can be specified:
24 | 
25 | ::
26 | 
27 |     python setup.py build_ext --inplace --cuda-version=12.6
28 | 
29 | A development version of :epkg:`onnxruntime` can be used if it was already built,
30 | with ``--ort-version=<build_folder>``. Example:
31 | 
32 | ::
33 | 
34 |     python setup.py build_ext --inplace --cuda-version=12.6 --ort-version=/home/github/onnxruntime/build/linux_cuda/Release
35 | 
36 | ``--cuda-link=SHARED`` helps reduce the binary size.
37 | 
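Both options can be combined; for instance (assuming CUDA 12.6 is installed):

::

    python setup.py build_ext --inplace --cuda-version=12.6 --cuda-link=SHARED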
38 | .. toctree::
39 |     :maxdepth: 1
40 | 
41 |     build_cython
42 |     build_pybind11
43 |     build_cuda
44 |     build_ortext
45 |     readings
--------------------------------------------------------------------------------
/_doc/tutorial/build_cuda.rst:
--------------------------------------------------------------------------------
1 | Build with CUDA
2 | ===============
3 | 
4 | The build may include pybind11 extensions built with CUDA.
5 | The setup is more complex as CUDA is not always available.
6 | The profiler may be enabled as well.
7 | 
8 | cmake
9 | +++++
10 | 
11 | The first step is to load the extension ``FindCudaExtension.cmake``
12 | with `find_package(CudaExtension)`. This file exposes the function
13 | `cuda_pybind11_add_module(name pybindfile)`, called for
14 | every extension to build and used as follows:
15 | 
16 | ::
17 | 
18 |     if(CUDA_AVAILABLE)
19 | 
20 |       cuda_pybind11_add_module(
21 |         cuda_example_py                                           # name
22 |         ../onnx_extended/validation/cuda/cuda_example_py.cpp      # pybind11 file
23 |         ../onnx_extended/validation/cuda/cuda_example.cu          # CUDA code
24 |         ../onnx_extended/validation/cuda/cuda_example_reduce.cu)  # CUDA code
25 | 
26 |     endif()
27 | 
28 | The function accepts many source files whether they have extension c, cpp, cc, cu.
29 | Other link dependencies can be added as well
30 | by adding an instruction like `target_link_libraries(name PRIVATE lib_name)`.
31 | These projects define the constant `CUDA_VERSION`. For example, version 11.8 becomes
32 | `11080`.
33 | 
34 | setup.py
35 | ++++++++
36 | 
37 | ``setup.py``
38 | defines a custom command to call cmake. Another line must be added
39 | to register the extension in the setup.
40 | 
41 | ::
42 | 
43 |     if platform.system() == "Windows":
44 |         ext = "pyd"
45 |     elif platform.system() == "Darwin":
46 |         ext = "dylib"
47 |     else:
48 |         ext = "so"
49 | 
50 |     if find_cuda():
51 | 
52 |         setup(
53 |             ...
54 |             ext_modules = [
55 |                 ...
56 |                 CMakeExtension(
57 |                     "onnx_extended.validation.cuda.cuda_example_py",
58 |                     f"onnx_extended/validation/cuda/cuda_example_py.{ext}",
59 |                 ),
60 |             ]
61 |         )
62 | 
63 | Function `find_cuda()` executes :epkg:`nvidia-smi` to check
64 | the installation of CUDA.
65 | 
66 | Possible errors
67 | +++++++++++++++
68 | 
69 | CMAKE_CUDA_COMPILER_VERSION=11.5.119 < 12.1, nvcc is not setup properly
70 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
71 | 
72 | On Linux, the following error may happen:
73 | 
74 | ::
75 | 
76 |     CMake Error at externals/FindCudaExtension.cmake:60 (message):
77 |       CMAKE_CUDA_COMPILER_VERSION=11.5.119 < 12.1, nvcc is not setup properly.
78 |       Try 'whereis nvcc' and chack the version.
79 |     Call Stack (most recent call first):
80 |       load_externals.cmake:9 (find_package)
81 |       CMakeLists.txt:19 (include)
82 | 
83 | It can be fixed by adding ``--cuda-nvcc=<path_to_nvcc>``. An example:
84 | ``--cuda-nvcc=/usr/local/cuda-12.1/bin/nvcc``.
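The suggestion in the error message is the quickest way to find the value to pass:

::

    whereis nvcc
    /usr/local/cuda-12.1/bin/nvcc --version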
--------------------------------------------------------------------------------
/_doc/tutorial/build_cython.rst:
--------------------------------------------------------------------------------
1 | Build with cython
2 | =================
3 | 
4 | Any :epkg:`cython` extension is built by cmake.
5 | It first calls cython to convert a pyx file into a C++ file
6 | before it is compiled and linked. Using cmake + cython
7 | instead of cython alone makes it easier to link with static
8 | libraries and write unit tests in C++.
9 | 
10 | cmake
11 | +++++
12 | 
13 | The first step is to load the extension ``FindCython.cmake``
14 | with `find_package(Cython REQUIRED)`. This file exposes the function
15 | `cython_add_module(name pyx_file omp_lib)`, called for
16 | every extension to build and used as follows:
17 | 
18 | ::
19 | 
20 |     cython_add_module(
21 |       vector_function_cy                                          # name
22 |       ../onnx_extended/validation/cython/vector_function_cy.pyx   # pyx_file
23 |       OpenMP::OpenMP_CXX                                          # link with this target
24 |       ../onnx_extended/validation/cpu/vector_function.cpp)        # source files
25 | 
26 | The function accepts many source files. Other link dependencies can be added as well
27 | by adding an instruction like `target_link_libraries(name PRIVATE lib_name)`.
28 | This function *cythonizes* the *pyx_file* into a cpp file before building
29 | the dynamic library.
30 | 
31 | setup.py
32 | ++++++++
33 | 
34 | ``setup.py``
35 | defines a custom command to call cmake. Another line must be added
36 | to register the extension in the setup.
37 | 
38 | ::
39 | 
40 |     if platform.system() == "Windows":
41 |         ext = "pyd"
42 |     elif platform.system() == "Darwin":
43 |         ext = "dylib"
44 |     else:
45 |         ext = "so"
46 | 
47 |     setup(
48 |         ...
49 |         ext_modules = [
50 |             ...
51 |             CMakeExtension(
52 |                 "onnx_extended.validation.cython.vector_function_cy",
53 |                 f"onnx_extended/validation/cython/vector_function_cy.{ext}",
54 |             ),
55 |         ]
56 |     )
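Once built in place, a quick sanity check is to import the generated module
(a sketch; the module only exists after a successful build):

.. code-block:: python

    import onnx_extended.validation.cython.vector_function_cy as vfc

    # should point to the compiled extension (.so, .pyd, .dylib), not a .py file
    print(vfc.__file__)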
--------------------------------------------------------------------------------
/_doc/tutorial/build_ortext.rst:
--------------------------------------------------------------------------------
1 | Build with onnxruntime
2 | ======================
3 | 
4 | This package includes a wrapper for :epkg:`onnxruntime` based on
5 | :epkg:`cython`. The standard one relies on :epkg:`pybind11`.
6 | For that purpose, it includes the onnxruntime binaries released
7 | on github (see :epkg:`onnxruntime releases`).
8 | 
9 | build onnxruntime
10 | +++++++++++++++++
11 | 
12 | ::
13 | 
14 |     clear&&CUDA_VERSION=12.6 CUDACXX=/usr/local/cuda-12.6/bin/nvcc python ./tools/ci_build/build.py \
15 |         --config Release --build_wheel --build_dir ./build/linux_cuda \
16 |         --build_shared_lib --use_cuda --cuda_home /usr/local/cuda-12.6/ \
17 |         --cudnn_home /usr/local/cuda-12.6/ --cuda_version=12.6 --enable_training --enable_training_ops \
18 |         --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=61" \
19 |         --parallel --skip_tests
20 | 
21 |     clear&&CUDA_VERSION=12.1 CUDACXX=/usr/local/cuda-12.1/bin/nvcc python ./tools/ci_build/build.py \
22 |         --config Release --build_wheel --build_dir ./build/linux_cuda \
23 |         --build_shared_lib --use_cuda --cuda_home /usr/local/cuda-12.1/ \
24 |         --cudnn_home /usr/local/cuda-12.1/ --cuda_version=12.1 --enable_training --enable_training_ops \
25 |         --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=70;72" \
26 |         --parallel --skip_tests
27 | 
28 | cmake
29 | +++++
30 | 
31 | The first step is to load the extension ``FindOrt.cmake``
32 | with `find_package(Ort REQUIRED)`. This file exposes two functions.
33 | The first one, `ort_add_dependency(name folder_copy)`, copies the binaries
34 | into folder *folder_copy* and links target *name* with onnxruntime.
35 | 
36 | The second function, `ort_add_custom_op(name folder "CPU")`, creates a library with
37 | several custom kernels for onnxruntime and links it with onnxruntime.
38 | *name* is the project name, *folder* its location.
39 | 
40 | ::
41 | 
42 |     ort_add_custom_op(
43 |       ortops_tutorial_cpu                                            # name
44 |       "CPU"
45 |       ../onnx_extended/ortops/tutorial/cpu                           # folder
46 |       ../onnx_extended/ortops/tutorial/cpu/my_kernel.cc              # source file
47 |       ../onnx_extended/ortops/tutorial/cpu/my_kernel_attr.cc         # source file
48 |       ../onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc)  # source file
49 | 
50 | Every new kernel can be added by adding a new source file. A line must be added
51 | in file `ort_tutorial_cpu_lib.cc` to register the kernel. That file also defines
52 | the domain the kernel belongs to.
53 | These projects define the constant `ORT_VERSION`. For example, version 1.15 becomes
54 | `1150`.
--------------------------------------------------------------------------------
/_doc/tutorial/build_pybind11.rst:
--------------------------------------------------------------------------------
1 | Build with pybind11
2 | ===================
3 | 
4 | Any :epkg:`pybind11` extension is built by cmake.
5 | Using cmake + pybind11 instead of pybind11 alone
6 | makes it easier to link with static
7 | libraries and write unit tests in C++.
8 | 
9 | cmake
10 | +++++
11 | 
12 | The first step is to load the extension ``FindLocalPyBind11.cmake``
13 | with ``find_package(LocalPyBind11 REQUIRED)``.
14 | This extension fetches the content of pybind11 and builds it with
15 | `FetchContent_Populate(pybind11)`. The version is registered there.
16 | It must be done once.
17 | It defines a function `local_pybind11_add_module(name omp_lib)` called for
18 | every extension to build and used as follows:
19 | 
20 | ::
21 | 
22 |     local_pybind11_add_module(
23 |       _validation                                        # name
24 |       OpenMP::OpenMP_CXX                                 # link with this library
25 |       ../onnx_extended/validation/cpu/_validation.cpp    # source file
26 |       ../onnx_extended/validation/cpu/vector_sum.cpp)    # source file
27 | 
28 | Additional libraries can be added with `target_link_libraries(name PRIVATE lib_name)`.
29 | 
30 | setup.py
31 | ++++++++
32 | 
33 | ``setup.py``
34 | defines a custom command to call cmake. Another line must be added
35 | to register the extension in the setup.
36 | 
37 | ::
38 | 
39 |     if platform.system() == "Windows":
40 |         ext = "pyd"
41 |     elif platform.system() == "Darwin":
42 |         ext = "dylib"
43 |     else:
44 |         ext = "so"
45 | 
46 |     setup(
47 |         ...
48 |         ext_modules = [
49 |             ...
50 |             CMakeExtension(
51 |                 "onnx_extended.validation.cpu._validation",
52 |                 f"onnx_extended/validation/cpu/_validation.{ext}",
53 |             ),
54 |         ]
55 |     )
--------------------------------------------------------------------------------
/_doc/tutorial/custom_ops.rst:
--------------------------------------------------------------------------------
1 | Custom Kernels for onnxruntime
2 | ==============================
3 | 
4 | :epkg:`onnxruntime` implements a C API which allows the user
5 | to add a custom implementation for any new operator.
6 | This mechanism is described in the onnxruntime documentation
7 | under *Custom operators*.
8 | This package implements a couple of custom operators for CPU and
9 | GPU (NVIDIA). The first step is to register the assembly to let
10 | onnxruntime use them.
11 | 
12 | .. code-block:: python
13 | 
14 |     from onnxruntime import InferenceSession, SessionOptions
15 |     from onnx_extended.ortops.optim.cpu import get_ort_ext_libs
16 | 
17 |     opts = SessionOptions()
18 |     opts.register_custom_ops_library(get_ort_ext_libs()[0])
19 | 
20 |     sess = InferenceSession(
21 |         "<model.onnx>", opts, providers=[..., "CPUExecutionProvider"]
22 |     )
23 | 
24 | It supports any onnxruntime C API greater than version:
25 | 
26 | .. runpython::
27 |     :showcode:
28 | 
29 |     from onnx_extended.ortcy.wrap.ortinf import get_ort_c_api_supported_version
30 | 
31 |     print(get_ort_c_api_supported_version())
32 | 
33 | The next sections introduce the list of operators and assemblies this package
34 | implements.
35 | 
36 | onnx_extended.ortops.tutorial.cpu
37 | +++++++++++++++++++++++++++++++++
38 | 
39 | .. runpython::
40 |     :showcode:
41 | 
42 |     from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs
43 | 
44 |     print(get_ort_ext_libs())
45 | 
46 | .. runpython::
47 |     :rst:
48 | 
49 |     from onnx_extended.ortops.tutorial.cpu import documentation
50 | 
51 |     print("\n".join(documentation()))
52 | 
53 | onnx_extended.ortops.tutorial.cuda
54 | ++++++++++++++++++++++++++++++++++
55 | 
56 | .. runpython::
57 |     :showcode:
58 | 
59 |     from onnx_extended.ortops.tutorial.cuda import get_ort_ext_libs
60 | 
61 |     try:
62 |         print(get_ort_ext_libs())
63 |     except AssertionError as e:
64 |         print(f"CUDA is not enabled: {e}")
65 | 
66 | .. runpython::
67 |     :rst:
68 | 
69 |     from onnx_extended.ortops.tutorial.cuda import documentation
70 | 
71 |     print("\n".join(documentation()))
72 | 
73 | onnx_extended.ortops.optim.cpu
74 | ++++++++++++++++++++++++++++++
75 | 
76 | .. runpython::
77 |     :showcode:
78 | 
79 |     from onnx_extended.ortops.optim.cpu import get_ort_ext_libs
80 | 
81 |     print(get_ort_ext_libs())
82 | 
83 | .. runpython::
84 |     :rst:
85 | 
86 |     from onnx_extended.ortops.optim.cpu import documentation
87 | 
88 |     print("\n".join(documentation()))
89 | 
--------------------------------------------------------------------------------
/_doc/tutorial/cython_binding.rst:
--------------------------------------------------------------------------------
1 | Cython Binding of onnxruntime
2 | =============================
3 | 
4 | :epkg:`onnxruntime` implements a python API based on :epkg:`pybind11`.
5 | This API is custom and does not leverage the C API.
6 | This package implements the class
7 | :class:`OrtSession <onnx_extended.ortcy.wrap.ortinf.OrtSession>`.
8 | The binding is based on :epkg:`cython`, which is faster.
9 | The difference is significant when onnxruntime deals with small tensors.
10 | 
11 | .. runpython::
12 |     :showcode:
13 | 
14 |     import numpy
15 |     from onnx import TensorProto
16 |     from onnx.helper import (
17 |         make_model,
18 |         make_node,
19 |         make_graph,
20 |         make_tensor_value_info,
21 |         make_opsetid,
22 |     )
23 |     from onnx_extended.ortcy.wrap.ortinf import OrtSession
24 | 
25 |     X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None])
26 |     Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None])
27 |     Z = make_tensor_value_info("Z", TensorProto.FLOAT, [None, None])
28 |     node = make_node("Add", ["X", "Y"], ["Z"])
29 |     graph = make_graph([node], "add", [X, Y], [Z])
30 |     onnx_model = make_model(
31 |         graph, opset_imports=[make_opsetid("", 18)], ir_version=8
32 |     )
33 | 
34 |     with open("model.onnx", "wb") as f:
35 |         f.write(onnx_model.SerializeToString())
36 | 
37 |     session = OrtSession("model.onnx")
38 |     x = numpy.random.randn(2, 3).astype(numpy.float32)
39 |     y = numpy.random.randn(2, 3).astype(numpy.float32)
40 |     got = session.run([x, y])
41 | 
42 |     print(got)
43 | 
44 | 
45 | The signature is different compared to onnxruntime's
46 | ``session.run(None, {"X": x, "Y": y})``, to increase performance.
47 | This binding supports custom operators as well.
48 | A benchmark, :ref:`l-cython-pybind11-ort-bindings`, compares
49 | :epkg:`onnxruntime` to this new binding.
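The two calls side by side (a sketch reusing ``model.onnx`` and the arrays
``x`` and ``y`` from the example above):

.. code-block:: python

    from onnxruntime import InferenceSession
    from onnx_extended.ortcy.wrap.ortinf import OrtSession

    # pybind11-based binding: inputs are passed by name, outputs selected by name
    sess = InferenceSession("model.onnx", providers=["CPUExecutionProvider"])
    expected = sess.run(None, {"X": x, "Y": y})

    # cython-based binding: inputs are passed positionally
    session = OrtSession("model.onnx")
    got = session.run([x, y])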
50 | 
--------------------------------------------------------------------------------
/_doc/tutorial/images/plot_optim_tree_ensemble.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_doc/tutorial/images/plot_optim_tree_ensemble.png
--------------------------------------------------------------------------------
/_doc/tutorial/index.rst:
--------------------------------------------------------------------------------
1 | 
2 | Tutorial
3 | ========
4 | 
5 | This package is mostly used to validate custom implementations
6 | of a specific onnx operator or **kernel**.
7 | The same code is used either to implement a custom kernel for the
8 | reference implementation of the :epkg:`onnx` package or a custom kernel
9 | for :epkg:`onnxruntime`. The last section
10 | describes how to build the package and how to add a new implementation
11 | depending on the technology it relies on (CPU, openmp, CUDA, eigen, ...).
12 | Another section is a sorted index of the examples.
13 | 
14 | .. toctree::
15 |     :maxdepth: 1
16 |     :caption: Kernels
17 | 
18 |     reference_evaluator
19 |     cython_binding
20 |     custom_ops
21 |     ops
22 |     many_tools
23 |     build
24 | 
25 | .. toctree::
26 |     :maxdepth: 1
27 |     :caption: Deprecated
28 | 
29 |     parallelization
--------------------------------------------------------------------------------
/_doc/tutorial/many_tools.rst:
--------------------------------------------------------------------------------
1 | 
2 | =======================================
3 | Many Tools to help investigating issues
4 | =======================================
5 | 
6 | Developers write many lines of code; many are part of a package,
7 | and many others are written to investigate what the former produce.
8 | This section gathers some tools occasionally needed
9 | to write converters in :epkg:`sklearn-onnx`, to implement
10 | kernels in :epkg:`onnxruntime`, or to add new operators in :epkg:`onnx`.
11 | The first series is used to play with :epkg:`onnx` files.
12 | A couple of the helpers described below are available
13 | through command lines.
14 | 
15 | .. toctree::
16 |     :maxdepth: 1
17 |     :caption: onnx
18 | 
19 |     external_data
20 |     onnx_manipulations
21 |     quantize
22 |     statistics
23 | 
24 | The second series is used to investigate C++ implementations
25 | in :epkg:`onnxruntime`.
26 | 
27 | .. toctree::
28 |     :maxdepth: 1
29 |     :caption: onnxruntime
30 | 
31 |     profiling
32 |     ort_debug
33 |     old_version
34 |     trees
--------------------------------------------------------------------------------
/_doc/tutorial/onnx_manipulations.rst:
--------------------------------------------------------------------------------
1 | 
2 | Onnx Manipulations
3 | ==================
4 | 
5 | Extract a subgraph
6 | ++++++++++++++++++
7 | 
8 | Both functions below are usually used to extract a small piece of an existing
9 | model to create unit tests.
10 | 
11 | Function :func:`onnx_remove_node_unused
12 | <onnx_extended.tools.onnx_nodes.onnx_remove_node_unused>`
13 | removes every node whose outputs are not used.
14 | 
15 | Function :func:`select_model_inputs_outputs
16 | <onnx_extended.tools.onnx_nodes.select_model_inputs_outputs>`
17 | creates an onnx graph taking any intermediate results as new inputs
18 | or new outputs.
19 | 
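A minimal sketch chaining both functions (it assumes the model has an
intermediate result named ``z1`` and that the function accepts the list of
output names to keep; see the signatures for the exact arguments):

.. code-block:: python

    from onnx import load
    from onnx_extended.tools.onnx_nodes import (
        onnx_remove_node_unused,
        select_model_inputs_outputs,
    )

    model = load("model.onnx")

    # keep only what is needed to compute the intermediate result "z1"
    sub = select_model_inputs_outputs(model, ["z1"])

    # then drop every node whose outputs are not used anymore
    sub = onnx_remove_node_unused(sub)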
These results are part of the local 26 | context but they are not explicit mentioned and that sometimes 27 | makes it difficult to understand what subgraph is doing or needs. 28 | Function :func:`get_hidden_inputs 29 | ` 30 | retrieves that information. 31 | 32 | Function :func:`enumerate_onnx_node_types 33 | ` 34 | quickly gives the list of operators a model uses. 35 | -------------------------------------------------------------------------------- /_doc/tutorial/ops.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | Focus on operators optimization 3 | =============================== 4 | 5 | .. toctree:: 6 | :maxdepth: 1 7 | :caption: Conv 8 | 9 | ../auto_examples/plot_op_conv_py_vs_c 10 | ../auto_examples/plot_op_conv_denorm 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Gemm 15 | 16 | ../auto_examples/plot_bench_gemm_f8 17 | ../auto_examples/plot_bench_gemm_ort 18 | ../auto_examples/plot_profile_gemm_ort 19 | 20 | .. toctree:: 21 | :maxdepth: 1 22 | :caption: Einsum 23 | 24 | ../auto_examples/plot_op_einsum 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | :caption: Mul 29 | 30 | ../auto_examples/plot_op_mul_cuda 31 | 32 | .. toctree:: 33 | :maxdepth: 1 34 | :caption: TreeEnsemble 35 | 36 | ../auto_examples/plot_op_tree_ensemble_optim 37 | ../auto_examples/plot_op_tree_ensemble_sparse 38 | -------------------------------------------------------------------------------- /_doc/tutorial/ort_debug.rst: -------------------------------------------------------------------------------- 1 | 2 | Debug Intermediate Results 3 | ========================== 4 | 5 | The reference evaluation (:class:`onnx_extended.reference.CReferenceEvaluator`) 6 | can return all intermediate results. :epkg:`onnxruntime` does not 7 | unless the onnx model is split to extract the intermediate results. 8 | Function :func:`enumerate_ort_run ` 9 | creates many models, inputs are always the same, new outputs are intermediate 10 | results of an original model. 11 | 12 | .. 
12 | .. runpython::
13 |     :showcode:
14 | 
15 |     import logging
16 |     import numpy as np
17 |     from onnx import TensorProto
18 |     from onnx.helper import (
19 |         make_model,
20 |         make_node,
21 |         make_graph,
22 |         make_tensor_value_info,
23 |         make_opsetid,
24 |     )
25 |     from onnx.checker import check_model
26 |     from onnx_extended.tools.ort_debug import enumerate_ort_run
27 | 
28 |     logging.getLogger("onnx-extended").setLevel(logging.ERROR)
29 | 
30 |     def get_model():
31 |         X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None])
32 |         Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None])
33 |         Z = make_tensor_value_info("Z", TensorProto.INT64, [None, None])
34 |         graph = make_graph(
35 |             [
36 |                 make_node("Add", ["X", "Y"], ["z1"]),
37 |                 make_node("Mul", ["X", "z1"], ["z2"]),
38 |                 make_node("Cast", ["z2"], ["Z"], to=TensorProto.INT64),
39 |             ],
40 |             "add",
41 |             [X, Y],
42 |             [Z],
43 |         )
44 |         onnx_model = make_model(
45 |             graph, opset_imports=[make_opsetid("", 18)], ir_version=8
46 |         )
47 |         check_model(onnx_model)
48 |         return onnx_model
49 | 
50 |     model = get_model()
51 |     feeds = {
52 |         "X": np.arange(4).reshape((2, 2)).astype(np.float32),
53 |         "Y": np.arange(4).reshape((2, 2)).astype(np.float32),
54 |     }
55 | 
56 |     for names, outs, node in enumerate_ort_run(model, feeds, verbose=2):
57 |         print(f"NODE: {node.op_type}")
58 |         for n, o in zip(names, outs):
59 |             print(f"  {n}:{o.dtype}:{o.shape}")
--------------------------------------------------------------------------------
/_doc/tutorial/parallelization.rst:
--------------------------------------------------------------------------------
1 | 
2 | Experiments about parallelization
3 | =================================
4 | 
5 | .. toctree::
6 |     :maxdepth: 1
7 | 
8 |     ../auto_examples/plot_bench_cpu
--------------------------------------------------------------------------------
/_doc/tutorial/quantize.rst:
--------------------------------------------------------------------------------
1 | 
2 | Quantization
3 | ============
4 | 
5 | *to be completed*
--------------------------------------------------------------------------------
/_doc/tutorial/readings.rst:
--------------------------------------------------------------------------------
1 | ========
2 | Readings
3 | ========
4 | 
5 | Some helpful articles, papers, and documents.
6 | 
7 | Build
8 | =====
9 | 
10 | * Compiler Options Hardening Guide for C and C++
11 | * Build a custom ONNX Runtime package
12 | 
13 | Custom Operators with onnxruntime
14 | =================================
15 | 
16 | * Custom operators (onnxruntime documentation)
17 | * custom_op_lib.cc (onnxruntime repository)
--------------------------------------------------------------------------------
/_doc/tutorial/reference_evaluator.rst:
--------------------------------------------------------------------------------
1 | 
2 | CReferenceEvaluator
3 | ===================
4 | 
5 | Class :class:`CReferenceEvaluator <onnx_extended.reference.CReferenceEvaluator>`
6 | extends :class:`onnx.reference.ReferenceEvaluator` with custom operators implemented
7 | in C++ in order to speed up the evaluation of this python runtime.
8 | This class inherits from :class:`onnx.reference.ReferenceEvaluator` and automatically
9 | adds the C++ implementation of these operators.
10 | It rewrites the following kernels and can be used as follows.
11 | 
12 | .. runpython::
13 |     :showcode:
14 | 
15 |     import numpy as np
16 |     from onnx import TensorProto
17 |     from onnx.helper import (
18 |         make_graph,
19 |         make_model,
20 |         make_node,
21 |         make_opsetid,
22 |         make_tensor_value_info,
23 |     )
24 |     from onnx.reference import ReferenceEvaluator
25 |     from onnxruntime import InferenceSession
26 |     from onnx_extended.ext_test_case import measure_time
27 |     from onnx_extended.reference import CReferenceEvaluator
28 | 
29 |     X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None, None, None])
30 |     Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None, None, None])
31 |     B = make_tensor_value_info("B", TensorProto.FLOAT, [None, None, None, None])
32 |     W = make_tensor_value_info("W", TensorProto.FLOAT, [None, None, None, None])
33 |     node = make_node(
34 |         "Conv",
35 |         ["X", "W", "B"],
36 |         ["Y"],
37 |         pads=[1, 1, 1, 1],
38 |         dilations=[1, 1],
39 |         strides=[2, 2],
40 |     )
41 |     graph = make_graph([node], "g", [X, W, B], [Y])
42 |     onnx_model = make_model(graph, opset_imports=[make_opsetid("", 16)])
43 | 
44 |     sH, sW = 64, 64
45 |     X = np.arange(sW * sH).reshape((1, 1, sH, sW)).astype(np.float32)
46 |     W = np.ones((1, 1, 3, 3), dtype=np.float32)
47 |     B = np.array([[[[0]]]], dtype=np.float32)
48 | 
49 |     sess1 = ReferenceEvaluator(onnx_model)
50 |     sess2 = CReferenceEvaluator(onnx_model)  # 10 to 100 times faster
51 | 
52 |     expected = sess1.run(None, {"X": X, "W": W, "B": B})[0]
53 |     got = sess2.run(None, {"X": X, "W": W, "B": B})[0]
54 |     diff = np.abs(expected - got).max()
55 |     print(f"difference: {diff}")
56 | 
57 | The list of rewritten kernels follows.
58 | 
59 | .. runpython::
60 |     :showcode:
61 | 
62 |     import pprint
63 |     from onnx_extended.reference import CReferenceEvaluator
64 | 
65 |     pprint.pprint(
66 |         [cl.__name__ for cl in CReferenceEvaluator.default_ops()]
67 |     )
--------------------------------------------------------------------------------
/_doc/tutorial/statistics.rst:
--------------------------------------------------------------------------------
1 | 
2 | Statistics
3 | ==========
4 | 
5 | To be completed.
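In the meantime, functions such as :func:`enumerate_onnx_node_types
<onnx_extended.tools.onnx_nodes.enumerate_onnx_node_types>` already give basic
statistics on a model. A minimal sketch (assuming the function accepts a loaded
model; see its documentation for the exact signature):

.. code-block:: python

    import pprint
    from onnx import load
    from onnx_extended.tools.onnx_nodes import enumerate_onnx_node_types

    model = load("model.onnx")
    # one entry per node type used by the model
    pprint.pprint(list(enumerate_onnx_node_types(model)))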
6 | -------------------------------------------------------------------------------- /_unittests/ut_ortcy/data/add.onnx: -------------------------------------------------------------------------------- 1 | :H 2 |  3 | X 4 | YZ"AddaddZ 5 | X 6 | 7 |  8 | 9 | Z 10 | Y 11 | 12 |  13 | 14 | b 15 | Z 16 | 17 |  18 | 19 | B 20 |  -------------------------------------------------------------------------------- /_unittests/ut_ortcy/test_inference.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended_helpers.h" 2 | #include "onnx_extended_test_common.h" 3 | // #include "onnx_extended/ortcy/wrap/ortapi.h" 4 | #include "onnxruntime_cxx_api.h" 5 | #if __cplusplus >= 201703L 6 | #include 7 | #endif 8 | #ifdef _WIN32 9 | #include 10 | #include 11 | #endif 12 | 13 | void testAssertTrue() { ASSERT_THROW(true); } 14 | 15 | void test_inference() { 16 | const OrtApi *api = OrtGetApiBase()->GetApi(ORT_API_VERSION); 17 | ASSERT_THROW(api != nullptr); 18 | Ort::Env env; 19 | auto ort_env = &env; // std::make_unique(ORT_LOGGING_LEVEL_WARNING, "Default"); 20 | Ort::SessionOptions session_options; 21 | session_options.SetIntraOpNumThreads(1); 22 | session_options.SetGraphOptimizationLevel(GraphOptimizationLevel::ORT_ENABLE_EXTENDED); 23 | session_options.SetLogSeverityLevel(0); 24 | 25 | // requires C++ 17 26 | std_string_type model = get_data_path("ut_ortcy/data/add.onnx"); 27 | 28 | Ort::Session session(*ort_env, model.c_str(), session_options); 29 | 30 | const char *input_names[] = {"X", "Y"}; 31 | const char *output_names[] = {"Z"}; 32 | 33 | float vector_1_value[] = {0.f, 1.f, 2.f, 3.f, 4.f, 5.f}; 34 | int64_t vector_1_dim[] = {6, 1}; 35 | 36 | float vector_2_value[] = {0.f, 1.f, 2.f, 3.f, 4.f, 50.f}; 37 | int64_t vector_2_dim[] = {6, 1}; 38 | 39 | auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 40 | 41 | Ort::Value input_tensors[] = { 42 | Ort::Value::CreateTensor(memory_info, vector_1_value, 6, vector_1_dim, 2), 43 | Ort::Value::CreateTensor(memory_info, vector_2_value, 6, vector_2_dim, 2)}; 44 | 45 | Ort::RunOptions run_options; 46 | auto output_tensors = 47 | session.Run(run_options, input_names, input_tensors, 2, output_names, 1); 48 | const auto &vector_filterred = output_tensors.at(0); 49 | auto type_shape_info = vector_filterred.GetTensorTypeAndShapeInfo(); 50 | const float *floats_output = static_cast(vector_filterred.GetTensorRawData()); 51 | ASSERT_EQUAL(floats_output[0], 0); 52 | ASSERT_EQUAL(floats_output[1], 2); 53 | ASSERT_EQUAL(floats_output[2], 4); 54 | ASSERT_EQUAL(floats_output[3], 6); 55 | ASSERT_EQUAL(floats_output[4], 8); 56 | ASSERT_EQUAL(floats_output[5], 55); 57 | } 58 | 59 | int main(int, char **) { 60 | testAssertTrue(); 61 | test_inference(); 62 | } 63 | -------------------------------------------------------------------------------- /_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_custom.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_custom.onnx -------------------------------------------------------------------------------- /_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_sparse.onnx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_ortops/data/plot_op_tree_ensemble_implementations_sparse.onnx -------------------------------------------------------------------------------- /_unittests/ut_ortops/test_inference_tree.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended_helpers.h" 2 | #include "onnx_extended_test_common.h" 3 | // #include "onnx_extended/ortcy/wrap/ortapi.h" 4 | #include "onnxruntime_cxx_api.h" 5 | 6 | void test_inference_tree_ensemble() { 7 | #if !defined(_WIN32) && (ORT_API_VERSION >= 17) 8 | const OrtApi *api = OrtGetApiBase()->GetApi(ORT_API_VERSION); 9 | ASSERT_THROW(api != nullptr); 10 | Ort::Env env; 11 | auto ort_env = &env; 12 | Ort::SessionOptions session_options; 13 | session_options.RegisterCustomOpsLibrary(to_std_string_path(TESTED_CUSTOM_OPS_DLL).c_str()); 14 | 15 | // requires C++ 17 16 | std_string_type model = 17 | get_data_path("ut_ortops/data/plot_op_tree_ensemble_implementations_custom.onnx"); 18 | 19 | Ort::Session session(*ort_env, model.c_str(), session_options); 20 | // It needs to revisited. 21 | return; 22 | 23 | const char *input_names[] = {"X"}; 24 | const char *output_names[] = {"variable"}; 25 | 26 | int64_t vector_1_dim[] = {100, 500}; 27 | std::vector vector_1_value(vector_1_dim[0] * vector_1_dim[1]); 28 | for (size_t i = 0; i < vector_1_value.size(); ++i) { 29 | vector_1_value[i] = 1.0f / static_cast(i + 1); 30 | } 31 | 32 | auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault); 33 | 34 | Ort::Value input_tensors[] = {Ort::Value::CreateTensor( 35 | memory_info, vector_1_value.data(), vector_1_value.size(), vector_1_dim, 2)}; 36 | 37 | const char *env_p = std::getenv("LONG"); 38 | bool long_test = env_p != nullptr && env_p[0] == '1'; 39 | 40 | Ort::RunOptions run_options; 41 | for (int i = 0; i < (long_test ? 
100000 : 1); ++i) { 42 | if (i > 0 && i % 10000 == 0) 43 | printf("i=%d\n", i); 44 | auto out = session.Run(run_options, input_names, input_tensors, 1, output_names, 1); 45 | ASSERT_EQUAL(out.size(), 1); 46 | } 47 | auto output_tensors = 48 | session.Run(run_options, input_names, input_tensors, 1, output_names, 1); 49 | const auto &vector_filterred = output_tensors.at(0); 50 | auto type_shape_info = vector_filterred.GetTensorTypeAndShapeInfo(); 51 | ASSERT_EQUAL(type_shape_info.GetDimensionsCount(), 2); 52 | const float *floats_output = static_cast(vector_filterred.GetTensorRawData()); 53 | // ASSERT_EQUAL(floats_output[0], 0); 54 | ASSERT_NOTEQUAL(floats_output, nullptr); 55 | #endif 56 | } 57 | 58 | int main(int, char **) { test_inference_tree_ensemble(); } 59 | -------------------------------------------------------------------------------- /_unittests/ut_plotting/test_plotting_benchmark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import pandas 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended.plotting.data import hhistograms_data, vhistograms_data 5 | from onnx_extended.plotting.benchmark import hhistograms, vhistograms 6 | 7 | 8 | class TestCReferenceEvaluator(ExtTestCase): 9 | def test_plotting_hhistograms(self): 10 | import matplotlib.pyplot as plt 11 | 12 | plt.clf() 13 | df = pandas.DataFrame(hhistograms_data()) 14 | ax = hhistograms(df, keys=("input", "name")) 15 | self.assertNotEmpty(ax) 16 | 17 | def test_plotting_hhistograms2(self): 18 | import matplotlib.pyplot as plt 19 | 20 | plt.clf() 21 | df = pandas.DataFrame(hhistograms_data()) 22 | df = df[df.input == "dense"] 23 | df = df.drop("input", axis=1) 24 | ax = hhistograms(df, keys="name") 25 | self.assertNotEmpty(ax) 26 | 27 | def test_plotting_vhistograms(self): 28 | import matplotlib.pyplot as plt 29 | 30 | plt.clf() 31 | df = pandas.DataFrame(vhistograms_data()) 32 | ax = vhistograms(df) 33 | self.assertNotEmpty(ax) 34 | 35 | 36 | if __name__ == "__main__": 37 | unittest.main(verbosity=2) 38 | -------------------------------------------------------------------------------- /_unittests/ut_reference/test_c_op_conv.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended_test_common.h" 2 | #include "cpu/c_op_conv_common.h" 3 | #include "cpu/c_op_conv.h" 4 | 5 | using namespace onnx_c_ops; 6 | 7 | void testAssertTrue() { 8 | ASSERT_THROW(true); 9 | } 10 | 11 | void test_gemm() { 12 | float pa[4] = { 1, 2, 3, 4 }; 13 | float pb[4] = { 10, 20, 30, 40 }; 14 | float pc[4] = { -0.1, -0.2, -0.3, -0.4 }; 15 | float expected[4] = { 69.9, 99.8, 149.7, 219.6 }; 16 | gemm(false, false, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc); 17 | ASSERT_EQUAL_VECTOR(4, expected, pc); 18 | 19 | float pc2[4] = { -0.1, -0.2, -0.3, -0.4 }; 20 | float expected2[4] = { 70.0, 100.0, 150.0, 220.0 }; 21 | gemm(false, false, 2, 2, 2, 1.0f, pa, pb, 0.0f, pc2); 22 | ASSERT_EQUAL_VECTOR(4, expected2, pc2); 23 | 24 | float pc3[4] = { -0.1, -0.2, -0.3, -0.4 }; 25 | float expected3[4] = { 139.9, 199.8, 299.7, 439.6 }; 26 | gemm(false, false, 2, 2, 2, 2.0f, pa, pb, 1.0f, pc3); 27 | ASSERT_EQUAL_VECTOR(4, expected3, pc3); 28 | 29 | float paA[4] = { 1, 2, 3, 4 }; 30 | float pbA[4] = { 1, 0, 0, 1 }; 31 | float pcA[4] = { 0, 0, 0, 0 }; 32 | float expectedA[4] = { 1, 3, 2, 4 }; 33 | gemm(true, false, 2, 2, 2, 1.0f, paA, pbA, 1.0f, pcA); 34 | ASSERT_EQUAL_VECTOR(4, expectedA, pcA); 35 | 36 | float paB[4] = { 1, 0, 0, 1 }; 37 | float 
pbB[4] = { 1, 2, 3, 4 }; 38 | float pcB[4] = { 0, 0, 0, 0 }; 39 | float expectedB[4] = { 1, 2, 3, 4 }; 40 | gemm(true, false, 2, 2, 2, 1.0f, paB, pbB, 1.0f, pcB); 41 | ASSERT_EQUAL_VECTOR(4, expectedB, pcB); 42 | 43 | float paC[4] = { 1, 1, 0, 1 }; 44 | float pbC[4] = { 1, 1, 0, 0 }; 45 | float pcC[4] = { 0, 0, 0, 0 }; 46 | float expectedC[4] = { 10, 10, 10, 10 }; 47 | gemm(true, false, 2, 2, 2, 10.0f, paC, pbC, 1.0f, pcC); 48 | ASSERT_EQUAL_VECTOR(4, expectedC, pcC); 49 | 50 | float pc6[4] = { -0.1, -0.2, -0.3, -0.4 }; 51 | float expected6[4] = { 69.9, 149.8, 99.7, 219.6 }; 52 | gemm(true, true, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc6); 53 | ASSERT_EQUAL_VECTOR(4, expected6, pc6); 54 | 55 | float pc4[4] = { -0.1, -0.2, -0.3, -0.4 }; 56 | float expected4[4] = { 99.9, 139.8, 139.7, 199.6 }; 57 | gemm(true, false, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc4); 58 | ASSERT_ALMOST_VECTOR(4, expected4, pc4, 1e-5f); 59 | 60 | float pc5[4] = { -0.1, -0.2, -0.3, -0.4 }; 61 | float expected5[4] = { 49.9, 109.8, 109.7, 249.6 }; 62 | gemm(false, true, 2, 2, 2, 1.0f, pa, pb, 1.0f, pc5); 63 | ASSERT_EQUAL_VECTOR(4, expected5, pc5); 64 | } 65 | 66 | int main(int, char**) { 67 | testAssertTrue(); 68 | test_gemm(); 69 | } 70 | -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench/model.onnx -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/test_data_set_0/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench/test_data_set_0/input_0.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/test_data_set_0/input_1.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench/test_data_set_0/input_1.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/bench/test_data_set_0/output_0.pb: -------------------------------------------------------------------------------- 1 | BZJ  -------------------------------------------------------------------------------- /_unittests/ut_tools/bench_rf/model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench_rf/model.onnx -------------------------------------------------------------------------------- /_unittests/ut_tools/bench_rf/test_data_set_0/input_0.pb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench_rf/test_data_set_0/input_0.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/bench_rf/test_data_set_0/output_0.pb: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/bench_rf/test_data_set_0/output_0.pb -------------------------------------------------------------------------------- /_unittests/ut_tools/data/debug_4700-CPUep.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/_unittests/ut_tools/data/debug_4700-CPUep.onnx -------------------------------------------------------------------------------- /_unittests/ut_tools/test_einsum_benchmark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended.tools.einsum.einsum_bench import einsum_benchmark 4 | 5 | 6 | class TestEinsumBenchmark(ExtTestCase): 7 | def test_benchmark1(self): 8 | for rt in ["numpy", "python", "onnxruntime"]: 9 | with self.subTest(rt=rt): 10 | res = list(einsum_benchmark(shape=5, runtime=rt)) 11 | self.assertEqual(len(res), 2) 12 | 13 | def test_benchmark_exc(self): 14 | self.assertRaise( 15 | lambda: list(einsum_benchmark(shape=5, runtime="UNK")), ValueError 16 | ) 17 | self.assertRaise( 18 | lambda: list(einsum_benchmark(shape=5, equation="abc,cd->abD", perm=True)), 19 | AssertionError, 20 | ) 21 | 22 | def test_benchmark2(self): 23 | for rt in ["numpy", "python", "onnxruntime"]: 24 | with self.subTest(rt=rt): 25 | res = list(einsum_benchmark(shape=[5, 6], runtime=rt)) 26 | self.assertEqual(len(res), 4) 27 | 28 | def test_benchmark1_shape(self): 29 | for rt in ["numpy", "python", "onnxruntime"]: 30 | with self.subTest(rt=rt): 31 | res = list(einsum_benchmark(shape=[(5, 5, 5), (5, 5)], runtime=rt)) 32 | self.assertEqual(len(res), 2) 33 | 34 | def test_benchmarkn(self): 35 | for rt in ["numpy"]: 36 | with self.subTest(rt=rt): 37 | res = list(einsum_benchmark(shape=5, perm=True, runtime=rt)) 38 | self.assertEqual(len(res), 48) 39 | 40 | 41 | if __name__ == "__main__": 42 | import logging 43 | 44 | logging.getLogger("skl2onnx").setLevel(logging.ERROR) 45 | logging.getLogger("onnx-extended").setLevel(logging.ERROR) 46 | unittest.main(verbosity=2) 47 | -------------------------------------------------------------------------------- /_unittests/ut_tools/test_einsum_bug.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended.tools.einsum import ( 5 | decompose_einsum_equation, 6 | optimize_decompose_einsum_equation, 7 | ) 8 | from onnx_extended.reference import CReferenceEvaluator 9 | 10 | 11 | class TestEinsumBug(ExtTestCase): 12 | def test_abbba(self): 13 | res = decompose_einsum_equation("ab,b->ba", strategy="numpy", clean=True) 14 | self.assertNotEmpty(res) 15 | 16 | def test__pprint_forward(self): 17 | res = decompose_einsum_equation("ab,b->ba", strategy="numpy", clean=True) 18 | pf = res._pprint_forward() 19 | spl = pf.split("<- id") 20 | self.assertEqual(len(spl), 4) 21 | 22 | def common_test_equation(self, equation, dim1, dim2): 23 | seq = decompose_einsum_equation(equation, clean=True, strategy="numpy") 24 | onx = seq.to_onnx("Y", "X1", "X2") 25 | sequ = equation.replace(",", "_").replace("->", "__") 26 | with open(f"temp_{sequ}_A.onnx", "wb") as f: 27 | f.write(onx.SerializeToString()) 28 | a = numpy.random.rand(*list((2,) * dim1)) 29 | b = 
numpy.random.rand(*list((2,) * dim2)) 30 | oinf = CReferenceEvaluator(onx, verbose=0) 31 | got = oinf.run(None, {"X1": a, "X2": b}) 32 | expected = numpy.einsum(equation, a, b) 33 | self.assertEqualArray(expected, got[0], atol=1e-15) 34 | 35 | res = optimize_decompose_einsum_equation( 36 | equation, 37 | numpy.float64, 38 | optimize=True, 39 | runtime="python", 40 | cache=False, 41 | opset=15, 42 | decompose=True, 43 | strategy="ml", 44 | verbose=None, 45 | ) 46 | new_eq = res.equation_ 47 | new_onx = res.onnx_ 48 | sequ = new_eq.replace(",", "_").replace("->", "__") 49 | with open(f"temp_{sequ}_B.onnx", "wb") as f: 50 | f.write(new_onx.SerializeToString()) 51 | oinf = CReferenceEvaluator(new_onx) 52 | got = oinf.run(None, {"X0": a, "X1": b}) 53 | self.assertEqualArray(expected, got[0], atol=1e-15) 54 | 55 | def test_decompose_einsum_abc_cde_abde(self): 56 | self.common_test_equation("abc,cde->abde", 3, 3) 57 | 58 | def test_decompose_einsum_abcd_cde_abe(self): 59 | self.common_test_equation("abcd,cde->abe", 4, 3) 60 | 61 | 62 | if __name__ == "__main__": 63 | import logging 64 | 65 | logging.getLogger("skl2onnx").setLevel(logging.ERROR) 66 | logging.getLogger("onnx-extended").setLevel(logging.ERROR) 67 | unittest.main(verbosity=2) 68 | -------------------------------------------------------------------------------- /_unittests/ut_tools/test_einsum_ml.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from itertools import permutations 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended.tools.einsum.einsum_ml import ( 5 | predict_transposition_cost, 6 | compute_transposition_features, 7 | _edit_distance, 8 | ) 9 | 10 | 11 | class TestEinsumMl(ExtTestCase): 12 | def test_features(self): 13 | res = compute_transposition_features((3, 5, 7), (0, 1, 2)) 14 | self.assertIsInstance(res, dict) 15 | self.assertEqual(res["edit"], 0) 16 | self.assertEqual(res["rot"], -1) 17 | res = compute_transposition_features((3, 5, 7), (2, 1, 0)) 18 | self.assertEqual(res["edit"], 2) 19 | self.assertEqual(res["rot"], 0) 20 | self.assertEqual(res["rev"], 1) 21 | 22 | def test_cost(self): 23 | res = predict_transposition_cost((3, 5, 7), (0, 1, 2)) 24 | self.assertIsInstance(res, float) 25 | self.assertGreaterEqual(res, 0) 26 | for shape in [(3, 5, 7), (30, 50, 70)]: 27 | for perm in permutations([0, 1, 2]): 28 | p = tuple(perm) 29 | cost = predict_transposition_cost(shape, p) 30 | if p[-1] == 2: 31 | self.assertEqual(cost, 0) 32 | 33 | def test_edit_distance(self): 34 | r = _edit_distance("", "a") 35 | self.assertEqual(r, 1) 36 | r = _edit_distance("a", "") 37 | self.assertEqual(r, 1) 38 | r = _edit_distance("a", "ab") 39 | self.assertEqual(r, 1) 40 | 41 | 42 | if __name__ == "__main__": 43 | unittest.main() 44 | -------------------------------------------------------------------------------- /_unittests/ut_tools/test_ort_debug.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from contextlib import redirect_stdout 4 | from io import StringIO 5 | from onnx import NodeProto, TensorProto 6 | from onnx.helper import ( 7 | make_model, 8 | make_node, 9 | make_graph, 10 | make_tensor_value_info, 11 | make_opsetid, 12 | ) 13 | from onnx.checker import check_model 14 | from onnx_extended.ext_test_case import ExtTestCase 15 | from onnx_extended.tools.ort_debug import enumerate_ort_run 16 | 17 | 18 | class TestOrtDebug(ExtTestCase): 19 | def _get_model(self): 
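        # Builds a small test model: z1 = X + Y, z2 = X * z1, Z = Cast(z2, to=INT64).
        # enumerate_ort_run (exercised below) yields, for every node executed by
        # onnxruntime, the names it produces, the computed values, and the NodeProto.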
20 | X = make_tensor_value_info("X", TensorProto.FLOAT, [None, None]) 21 | Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None]) 22 | Z = make_tensor_value_info("Z", TensorProto.INT64, [None, None]) 23 | graph = make_graph( 24 | [ 25 | make_node("Add", ["X", "Y"], ["z1"]), 26 | make_node("Mul", ["X", "z1"], ["z2"]), 27 | make_node("Cast", ["z2"], ["Z"], to=TensorProto.INT64), 28 | ], 29 | "add", 30 | [X, Y], 31 | [Z], 32 | ) 33 | onnx_model = make_model( 34 | graph, opset_imports=[make_opsetid("", 18)], ir_version=8 35 | ) 36 | check_model(onnx_model) 37 | return onnx_model 38 | 39 | def test_enumerate_ort_run(self): 40 | model = self._get_model() 41 | feeds = { 42 | "X": np.arange(4).reshape((2, 2)).astype(np.float32), 43 | "Y": np.arange(4).reshape((2, 2)).astype(np.float32), 44 | } 45 | expected_names = [["z1"], ["z2"], ["Z"]] 46 | for i, (names, outs, node) in enumerate(enumerate_ort_run(model, feeds)): 47 | self.assertIsInstance(node, NodeProto) 48 | self.assertIsInstance(names, list) 49 | self.assertIsInstance(outs, list) 50 | self.assertEqual(len(names), len(outs)) 51 | self.assertEqual(names, expected_names[i]) 52 | 53 | st = StringIO() 54 | with redirect_stdout(st): 55 | for _ in enumerate_ort_run(model, feeds, verbose=2): 56 | pass 57 | std = st.getvalue() 58 | self.assertIn("Add(X, Y) -> z1", std) 59 | self.assertIn("+ z1: float32(2, 2)", std) 60 | self.assertIn("Cast(z2, to=7) -> Z", std) 61 | 62 | st = StringIO() 63 | with redirect_stdout(st): 64 | for _ in enumerate_ort_run(model, feeds, verbose=3): 65 | pass 66 | std = st.getvalue() 67 | self.assertIn("Add(X, Y) -> z1", std) 68 | self.assertIn("+ z1: float32(2, 2)", std) 69 | self.assertIn("[[", std) 70 | 71 | 72 | if __name__ == "__main__": 73 | unittest.main(verbosity=2) 74 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_bench_tree.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from contextlib import redirect_stdout 3 | from io import StringIO 4 | from onnx import ModelProto 5 | from onnx_extended.ext_test_case import ExtTestCase, skipif_ci_apple 6 | from onnx_extended.validation.bench_trees import create_decision_tree, bench_trees 7 | from onnx_extended.validation._tree_d14_f100 import tree_d14_f100 8 | from onnx_extended.tools.onnx_io import onnx2string 9 | 10 | 11 | class TestBenchTree(ExtTestCase): 12 | def test_create_decision_tree(self): 13 | tree = create_decision_tree(max_depth=2) 14 | code = onnx2string(tree, as_code=True) 15 | self.assertNotIn("import textwrap", code) 16 | # with open("onnx_extended/validation/_tree_d14_f100.py", "w") as f: 17 | # f.write(code) 18 | 19 | def test_tree14(self): 20 | model = tree_d14_f100() 21 | self.assertIsInstance(model, ModelProto) 22 | 23 | def test_bench_tree(self): 24 | res = bench_trees( 25 | max_depth=2, 26 | n_estimators=10, 27 | n_features=4, 28 | batch_size=100, 29 | number=10, 30 | warmup=2, 31 | verbose=0, 32 | engine_names=["onnxruntime", "CReferenceEvaluator"], 33 | ) 34 | self.assertIsInstance(res, list) 35 | self.assertEqual(len(res), 4) 36 | 37 | def test_bench_tree_verbose(self): 38 | st = StringIO() 39 | with redirect_stdout(st): 40 | res = bench_trees( 41 | max_depth=2, 42 | n_estimators=10, 43 | n_features=4, 44 | batch_size=100, 45 | number=10, 46 | warmup=2, 47 | engine_names=["CReferenceEvaluator"], 48 | verbose=2, 49 | ) 50 | text = st.getvalue() 51 | self.assertIsInstance(res, list) 52 | self.assertEqual(len(res), 2) 
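        # bench_trees returns a flat list of measurement rows; the counts asserted
        # here and in test_bench_tree above (2 rows for one engine, 4 for two)
        # suggest two rows per engine and run.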
53 | self.assertIn("test 'CReferenceEvaluator' duration=", text) 54 | 55 | @skipif_ci_apple("crash") 56 | def test_bench_tree_all_engines(self): 57 | res = bench_trees( 58 | max_depth=2, 59 | n_estimators=10, 60 | n_features=4, 61 | batch_size=100, 62 | number=10, 63 | warmup=2, 64 | repeat=1, 65 | engine_names=["onnxruntime", "onnxruntime-customops"], # , "cython"], 66 | ) 67 | self.assertIsInstance(res, list) 68 | self.assertEqual(len(res), 2) 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest.main(verbosity=2) 73 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cpu_fpemu.cpp: -------------------------------------------------------------------------------- 1 | #include "onnx_extended/validation/cpu/cpu_fpemu.hpp" 2 | #include "onnx_extended_helpers.h" 3 | #include "onnx_extended_test_common.h" 4 | 5 | using namespace cpu_fpemu; 6 | 7 | void test_cast() { 8 | 9 | #if defined(__SSSE3__) 10 | 11 | float f = 1.f; 12 | double d = 1.f; 13 | float ff = __double2float_rn(d); 14 | ASSERT_THROW(f == ff); 15 | unsigned short u = __float2half_rn(f); 16 | float bu = __half2float(u); 17 | ASSERT_THROW(f == bu); 18 | 19 | #endif 20 | 21 | } 22 | 23 | int main(int, char**) { 24 | test_cast(); 25 | } 26 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cpu_fpemu.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended.validation.cpu._validation import has_sse3 4 | 5 | 6 | class TestCpuFpEmu(ExtTestCase): 7 | @unittest.skipIf(not has_sse3(), "SSE3 not available") 8 | def test_cast(self): 9 | from onnx_extended.validation.cpu._validation import ( 10 | double2float_rn, 11 | float2half_rn, 12 | half2float, 13 | ) 14 | 15 | self.assertEqual(double2float_rn(1), 1) 16 | self.assertEqual(half2float(float2half_rn(1)), 1) 17 | 18 | 19 | if __name__ == "__main__": 20 | unittest.main(verbosity=2) 21 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cuda_fpemu.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | from onnx_extended.ext_test_case import ExtTestCase 4 | from onnx_extended import has_cuda 5 | 6 | 7 | class TestCudaFpemu(ExtTestCase): 8 | @unittest.skipIf(not has_cuda(), reason="CUDA not available") 9 | def test_fpemu_cuda_forward(self): 10 | from onnx_extended.validation.cuda.cuda_example_py import ( 11 | fpemu_cuda_forward, 12 | ) 13 | 14 | values = np.array( 15 | [-2, -1, 0, 1, 2, 3, 10, 100, 10000, 20000, 50000, 100000, -100000], 16 | dtype=np.float32, 17 | ) 18 | res = fpemu_cuda_forward(values) 19 | expected = res.copy() 20 | self.assertEqual(res.shape, values.shape) 21 | res = fpemu_cuda_forward(values) 22 | self.assertEqual(res.shape, values.shape) 23 | fpemu_cuda_forward(values, inplace=True) 24 | self.assertEqualArray(expected, values) 25 | 26 | 27 | if __name__ == "__main__": 28 | unittest.main(verbosity=2) 29 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cuda_gemm.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended import has_cuda 4 | 5 | if has_cuda(): 6 | from 
onnx_extended.validation.cuda.cuda_example_py import ( 7 | gemm_benchmark_test, 8 | get_device_prop, 9 | cuda_device_count, 10 | cuda_device_memory, 11 | cuda_devices_memory, 12 | ) 13 | else: 14 | gemm_benchmark_test = None 15 | get_device_prop = None 16 | 17 | 18 | class TestCudaGemm(ExtTestCase): 19 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 20 | def test_get_device_prop(self): 21 | r = get_device_prop() 22 | self.assertIsInstance(r, dict) 23 | self.assertEqual(len(r), 12) 24 | 25 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 26 | def test_cuda_device_count(self): 27 | r = cuda_device_count() 28 | self.assertIsInstance(r, int) 29 | self.assertGreater(r, 0) 30 | 31 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 32 | def test_cuda_device_memory(self): 33 | r = cuda_device_memory(0) 34 | self.assertIsInstance(r, tuple) 35 | self.assertEqual(len(r), 2) 36 | 37 | @unittest.skipIf(get_device_prop is None, reason="CUDA not available") 38 | def test_cuda_devices_memory(self): 39 | r = cuda_devices_memory() 40 | n = cuda_device_count() 41 | self.assertIsInstance(r, list) 42 | self.assertEqual(len(r), n) 43 | self.assertIsInstance(r[0], tuple) 44 | self.assertEqual(len(r[0]), 2) 45 | 46 | def gemm_test(self, test): 47 | r = gemm_benchmark_test(test) 48 | self.assertIsInstance(r, dict) 49 | self.assertEqual(len(r), 24) 50 | self.assertEqual(r["N"], 10) 51 | 52 | @unittest.skipIf(gemm_benchmark_test is None, reason="CUDA not available") 53 | def test_gemm_test_float32(self): 54 | for i in range(5): 55 | with self.subTest(test=i): 56 | self.gemm_test(i) 57 | 58 | @unittest.skipIf(gemm_benchmark_test is None, reason="CUDA not available") 59 | def test_gemm_test_float8(self): 60 | r = get_device_prop() 61 | if r["major"] < 9: 62 | return 63 | for i in range(5, 15): 64 | if i in {8, 9, 10, 12, 13}: 65 | # still invalid 66 | continue 67 | with self.subTest(test=i): 68 | self.gemm_test(i) 69 | 70 | 71 | if __name__ == "__main__": 72 | unittest.main(verbosity=2) 73 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_cuda_monitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from onnx_extended.ext_test_case import ExtTestCase 3 | from onnx_extended import has_cuda 4 | 5 | if has_cuda(): 6 | from onnx_extended.validation.cuda.cuda_monitor import ( 7 | nvml_device_get_count, 8 | nvml_device_get_memory_info, 9 | nvml_init, 10 | nvml_shutdown, 11 | ) 12 | else: 13 | nvml_init = None 14 | 15 | 16 | class TestCudaMonitor(ExtTestCase): 17 | @unittest.skipIf(nvml_init is None, reason="CUDA not available") 18 | def test_nvml(self): 19 | nvml_init() 20 | r = nvml_device_get_count() 21 | self.assertIsInstance(r, int) 22 | self.assertGreater(r, 0) 23 | info = nvml_device_get_memory_info() 24 | self.assertIsInstance(info, tuple) 25 | self.assertEqual(len(info), 3) 26 | self.assertTrue(info[-1] >= max(info[:-1])) 27 | nvml_shutdown() 28 | 29 | 30 | if __name__ == "__main__": 31 | unittest.main(verbosity=2) 32 | -------------------------------------------------------------------------------- /_unittests/ut_validation/test_fp8.py: -------------------------------------------------------------------------------- 1 | import struct 2 | import unittest 3 | import numpy 4 | from onnx_extended.ext_test_case import ExtTestCase 5 | 6 | try: 7 | from onnx_array_api.validation.f8 import search_float32_into_fe4m3 8 | except 
ImportError: 9 | # onnx-array-api is not recent enough 10 | search_float32_into_fe4m3 = None 11 | 12 | 13 | class TestFloat8(ExtTestCase): 14 | def test_cast_float32_to_e4m3fn(self): 15 | from onnx_extended.validation.cython.fp8 import ( 16 | cast_float32_to_e4m3fn, 17 | cast_e4m3fn_to_float32, 18 | ) 19 | 20 | values = numpy.array([[10, 1, 4, 5, 6, 7]], dtype=numpy.float32) 21 | f8 = cast_float32_to_e4m3fn(values) 22 | back = cast_e4m3fn_to_float32(f8) 23 | f82 = cast_float32_to_e4m3fn(back) 24 | self.assertEqualArray(f8, f82) 25 | 26 | @unittest.skipIf( 27 | search_float32_into_fe4m3 is None, reason="onnx-array-api not recent enough" 28 | ) 29 | def test_cast_float32_to_e4m3fn_more(self): 30 | from onnx_extended.validation.cython.fp8 import cast_float32_to_e4m3fn 31 | 32 | vect_search_float32_into_fe4m3 = numpy.vectorize(search_float32_into_fe4m3) 33 | 34 | values = numpy.array([[10, 1, 4, 5, 6, 7]], dtype=numpy.float32) 35 | expected = vect_search_float32_into_fe4m3(values).astype(numpy.uint8) 36 | f8 = cast_float32_to_e4m3fn(values) 37 | self.assertEqualArray(expected, f8) 38 | 39 | x = numpy.random.randn(4, 4, 4).astype(numpy.float32) 40 | expected = vect_search_float32_into_fe4m3(x).astype(numpy.uint8) 41 | f8 = cast_float32_to_e4m3fn(x) 42 | self.assertEqualArray(expected, f8) 43 | 44 | def test_inf(self): 45 | from onnx_extended.validation.cython.fp8 import cast_float32_to_e4m3fn 46 | 47 | for x, e in [(numpy.float32(numpy.inf), 126), (numpy.float32(-numpy.inf), 254)]: 48 | f8 = cast_float32_to_e4m3fn(x) 49 | self.assertEqual(e, f8) 50 | 51 | def test_nan(self): 52 | from onnx_extended.validation.cython.fp8 import cast_float32_to_e4m3fn 53 | 54 | expected = 127 55 | values = [ 56 | ( 57 | None, 58 | int.from_bytes(struct.pack(" Any: 5 | """ 6 | Picks the first value that is not None.
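    For instance, given the arguments (None, 4, None), the returned value is 4.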
7 | """ 8 | for a in args: 9 | if a is not None: 10 | return a 11 | raise ValueError("All values are None.") 12 | -------------------------------------------------------------------------------- /onnx_extended/cpp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/c_op_allocation.cpp: -------------------------------------------------------------------------------- 1 | #include "common/c_op_allocation.h" 2 | #include 3 | 4 | #if (!(defined(PYTHON_MANYLINUX) && PYTHON_MANYLINUX)) 5 | #include 6 | #endif 7 | 8 | namespace onnx_c_ops { 9 | 10 | #if (defined(PYTHON_MANYLINUX) && PYTHON_MANYLINUX) 11 | 12 | void *AllocatorDefaultAlloc(std::size_t size) { return malloc(size); } 13 | 14 | void AllocatorDefaultFree(void *p) { free(p); } 15 | 16 | #else 17 | 18 | void *AllocatorDefaultAlloc(std::size_t size) { 19 | const std::size_t alignment = 64; 20 | void *p; 21 | #if _MSC_VER 22 | p = _aligned_malloc(size, alignment); 23 | if (p == nullptr) 24 | #if __cplusplus >= 202002L 25 | throw std::bad_alloc(); 26 | #else 27 | abort(); 28 | #endif 29 | #elif defined(_LIBCPP_SGX_CONFIG) 30 | p = memalign(alignment, size); 31 | if (p == nullptr) 32 | #if __cplusplus >= 202002L 33 | throw std::bad_alloc(); 34 | #else 35 | abort(); 36 | #endif 37 | #else 38 | int ret = posix_memalign(&p, alignment, size); 39 | if (ret != 0) 40 | #if __cplusplus >= 202002L 41 | throw std::bad_alloc(); 42 | #else 43 | abort(); 44 | #endif 45 | #endif 46 | return p; 47 | } 48 | 49 | void AllocatorDefaultFree(void *p) { 50 | #if _MSC_VER 51 | _aligned_free(p); 52 | #else 53 | free(p); 54 | #endif 55 | } 56 | 57 | #endif 58 | 59 | } // namespace onnx_c_ops 60 | -------------------------------------------------------------------------------- /onnx_extended/cpp/cpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/cpu/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/cuda/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/common/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/c_op_allocation.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace onnx_c_ops { 6 | 
7 | void *AllocatorDefaultAlloc(std::size_t size); 8 | void AllocatorDefaultFree(void *p); 9 | 10 | } // namespace onnx_c_ops 11 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/c_op_common_parameters.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | namespace onnx_c_ops { 8 | 9 | enum class POST_EVAL_TRANSFORM { 10 | NONE = 0, 11 | LOGISTIC = 1, 12 | SOFTMAX = 2, 13 | SOFTMAX_ZERO = 3, 14 | PROBIT = 4 15 | }; 16 | 17 | POST_EVAL_TRANSFORM to_POST_EVAL_TRANSFORM(const std::string &value); 18 | 19 | enum NODE_MODE : uint8_t { 20 | LEAF = 1, 21 | BRANCH_LEQ = 2, 22 | BRANCH_LT = 4, 23 | BRANCH_GTE = 6, 24 | BRANCH_GT = 8, 25 | BRANCH_EQ = 10, 26 | BRANCH_NEQ = 12 27 | }; 28 | 29 | NODE_MODE to_NODE_MODE(const std::string &value); 30 | 31 | const char *to_str(NODE_MODE mode); 32 | 33 | enum class AGGREGATE_FUNCTION { AVERAGE, SUM, MIN, MAX }; 34 | 35 | AGGREGATE_FUNCTION to_AGGREGATE_FUNCTION(const std::string &input); 36 | 37 | enum class SVM_TYPE { SVM_LINEAR = 1, SVM_SVC = 2 }; 38 | 39 | SVM_TYPE to_SVM_TYPE(const std::string &value); 40 | 41 | enum KERNEL { LINEAR, POLY, RBF, SIGMOID }; 42 | 43 | KERNEL to_KERNEL(const std::string &value); 44 | 45 | enum StorageOrder { 46 | UNKNOWN = 0, 47 | NHWC = 1, 48 | NCHW = 2, 49 | }; 50 | 51 | StorageOrder to_StorageOrder(const std::string &value); 52 | 53 | enum class AutoPadType { 54 | NOTSET = 0, 55 | VALID = 1, 56 | SAME_UPPER = 2, 57 | SAME_LOWER = 3, 58 | }; 59 | 60 | AutoPadType to_AutoPadType(const std::string &value); 61 | 62 | } // namespace onnx_c_ops 63 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/c_op_status.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace onnx_c_ops { 4 | 5 | class Status { 6 | public: 7 | int code; 8 | inline Status() : code(1) {} 9 | inline Status(int code) : code(code) {} 10 | inline Status &operator=(const Status &other) { 11 | code = other.code; 12 | return *this; 13 | } 14 | inline bool IsOK() const { return code == 1; } 15 | inline int Code() const { return code; } 16 | inline bool operator==(const Status &other) const { return code == other.code; } 17 | inline bool operator!=(const Status &other) const { return !(*this == other); } 18 | inline static Status OK() { return Status(1); } 19 | }; 20 | 21 | } // namespace onnx_c_ops 22 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/common/simple_span.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | namespace std_ { 6 | 7 | template 8 | class span { 9 | public: 10 | span(T* data, std::size_t size) : data_(data), size_(size) {} 11 | 12 | inline T* data() const { return data_; } 13 | inline std::size_t size() const { return size_; } 14 | inline T& operator[](std::size_t index) const { return data_[index]; } 15 | inline T* begin() const { return data_; } 16 | inline T* end() const { return data_ + size_; } 17 | 18 | private: 19 | T* data_; 20 | std::size_t size_; 21 | }; 22 | 23 | } // namespace std_ 24 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/cpu/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/cpu/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/cpp/include/cuda/__init__.py -------------------------------------------------------------------------------- /onnx_extended/cpp/include/ortapi_c_api_header.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #if defined(_WIN32) 6 | 7 | // ... 8 | 9 | #elif defined(__MACOSX__) || defined(__APPLE__) 10 | 11 | // .. 12 | 13 | #else 14 | 15 | #define IS_EMPTY(x) IS_EMPTY_HELPER(x) 16 | #define IS_EMPTY_HELPER(x) IS_EMPTY_CHECK(x ## 1, 1) 17 | #define IS_EMPTY_CHECK(a, b, ...) b 18 | 19 | #if IS_EMPTY(ORT_EXPORT) 20 | #undef ORT_EXPORT 21 | #define ORT_EXPORT __attribute__ ((visibility("default"))) 22 | #endif 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /onnx_extended/cpp/include/ortapi_version.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #define ORT_API_MANUAL_INIT 4 | #include "onnxruntime_c_api.h" 5 | #include 6 | #undef ORT_API_MANUAL_INIT 7 | 8 | #define ORT_API_VERSION_SUPPORTED 16 9 | -------------------------------------------------------------------------------- /onnx_extended/helper/__init__.py: -------------------------------------------------------------------------------- 1 | from .make_dynamic_quantize_linear import ( 2 | make_dynamic_quantize_linear_function_proto, 3 | make_simple_dynamic_quantize_linear_function_proto, 4 | ) 5 | from .make_reshape_transpose import ( 6 | make_matmul_reshape_transpose_function_proto, 7 | make_matmul_reshape_transpose_back_function_proto, 8 | ) 9 | -------------------------------------------------------------------------------- /onnx_extended/ortcy/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/ortcy/wrap/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/ortcy/wrap/ortapi_inline.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "onnx_extended_helpers.h" 4 | #include "ortapi_version.h" 5 | 6 | namespace ortapi { 7 | 8 | inline static const OrtApi *GetOrtApi() { 9 | const OrtApi *api_ = OrtGetApiBase()->GetApi(ORT_API_VERSION_SUPPORTED); 10 | return api_; 11 | } 12 | 13 | inline const char *ort_version() { return OrtGetApiBase()->GetVersionString(); } 14 | 15 | inline void _ThrowOnError_(OrtStatus *ort_status, const char *filename, int line) { 16 | if (ort_status) { 17 | std::string message(GetOrtApi()->GetErrorMessage(ort_status)); 18 | OrtErrorCode code = GetOrtApi()->GetErrorCode(ort_status); 19 | throw std::runtime_error(onnx_extended_helpers::MakeString( 20 | "error: onnxruntime(", code, "), ", message, "\n ", filename, ":", line)); 21 | } 22 | } 23 | 24 | #define ThrowOnError(ort_status) _ThrowOnError_(ort_status, __FILE__, __LINE__) 25 | 
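// Typical usage, as a sketch: wrap any OrtApi call returning an OrtStatus*,
// e.g. ThrowOnError(GetOrtApi()->CreateRunOptions(&run_options)); the macro
// captures __FILE__ and __LINE__ so the std::runtime_error it raises points
// at the failing call site.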
26 | } // namespace ortapi 27 | -------------------------------------------------------------------------------- /onnx_extended/ortops/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import platform 3 | from typing import Dict, List 4 | 5 | _ort_ext_libs_paths: Dict[str, List[str]] = {} 6 | 7 | 8 | def _get_ort_ext_libs(path: str) -> List[str]: 9 | """ 10 | Returns the list of libraries implementing new simple 11 | :epkg:`onnxruntime` kernels and placed in folder *path*. 12 | """ 13 | global _ort_ext_libs_paths 14 | if path not in _ort_ext_libs_paths: 15 | _ort_ext_libs_paths[path] = [] 16 | if not _ort_ext_libs_paths[path]: 17 | if platform.system() == "Windows": 18 | ext = ".dll" 19 | elif platform.system() == "Darwin": 20 | ext = ".dylib" 21 | else: 22 | ext = ".so" 23 | this = os.path.abspath(path) 24 | files = os.listdir(this) 25 | res = [] 26 | for name in files: 27 | e = os.path.splitext(name)[-1] 28 | if e == ext and "ortops" in name: 29 | res.append(os.path.join(this, name)) 30 | assert res, ( 31 | f"Unable to find any kernel library with ext={ext!r} " 32 | f"in {this!r} among {files}." 33 | ) 34 | _ort_ext_libs_paths[path] = res 35 | return _ort_ext_libs_paths[path] 36 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/ortops/optim/__init__.py -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.cc: -------------------------------------------------------------------------------- 1 | // Source: https://github.com/microsoft/onnxruntime/tree/main/ 2 | // onnxruntime/test/testdata/custom_op_get_const_input_test_library 3 | 4 | #include <mutex> 5 | #include <vector> 6 | 7 | #include "ort_optim_cpu_lib.h" 8 | #include "ort_sparse.hpp" 9 | #include "ort_svm.hpp" 10 | #include "ort_tfidf_vectorizer.hpp" 11 | #include "ort_tree_ensemble.hpp" 12 | #include "ortapi_version.h" 13 | 14 | static const char *c_OpDomain = "onnx_extended.ortops.optim.cpu"; 15 | 16 | static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) { 17 | static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container; 18 | static std::mutex ort_custom_op_domain_mutex; 19 | std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex); 20 | ort_custom_op_domain_container.push_back(std::move(domain)); 21 | } 22 | 23 | OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, 24 | const OrtApiBase *api_base) { 25 | Ort::InitApi(api_base->GetApi(ORT_API_VERSION_SUPPORTED)); 26 | Ort::UnownedSessionOptions session_options(options); 27 | 28 | // Instances that remain available until onnxruntime unloads the library.
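// Each kernel instance below is added to the "onnx_extended.ortops.optim.cpu"
// domain declared above; when a session loads this library through
// RegisterCustomOps, graph nodes that use this domain resolve to these
// implementations.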
29 | static ortops::DenseToSparse c_DenseToSparse; 30 | static ortops::SparseToDense c_SparseToDense; 31 | static ortops::SVMClassifier c_SVMClassifier; 32 | static ortops::SVMRegressor c_SVMRegressor; 33 | static ortops::TreeEnsembleRegressor, float, float> 34 | c_TreeEnsembleRegressor; 35 | static ortops::TreeEnsembleClassifier, float, float> 36 | c_TreeEnsembleClassifier; 37 | static ortops::TreeEnsembleRegressor, float, float> 38 | c_TreeEnsembleRegressorSparse; 39 | static ortops::TreeEnsembleClassifier, float, float> 40 | c_TreeEnsembleClassifierSparse; 41 | static ortops::TfIdfVectorizer c_TfIdfVectorizer; 42 | 43 | try { 44 | Ort::CustomOpDomain domain{c_OpDomain}; 45 | 46 | domain.Add(&c_DenseToSparse); 47 | domain.Add(&c_SparseToDense); 48 | domain.Add(&c_SVMClassifier); 49 | domain.Add(&c_SVMRegressor); 50 | domain.Add(&c_TreeEnsembleClassifier); 51 | domain.Add(&c_TreeEnsembleClassifierSparse); 52 | domain.Add(&c_TreeEnsembleRegressor); 53 | domain.Add(&c_TreeEnsembleRegressorSparse); 54 | domain.Add(&c_TfIdfVectorizer); 55 | 56 | session_options.Add(domain); 57 | AddOrtCustomOpDomainToContainer(std::move(domain)); 58 | } catch (const std::exception &e) { 59 | Ort::Status status{e}; 60 | return status.release(); 61 | } 62 | 63 | return nullptr; 64 | } 65 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_optim_cpu_lib.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "ortapi_c_api_header.h" 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, 10 | const OrtApiBase *api_base); 11 | 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_sparse.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | 5 | namespace ortops { 6 | 7 | template struct DenseToSparseKernel { 8 | DenseToSparseKernel(const OrtApi &api, const OrtKernelInfo *info); 9 | void Compute(OrtKernelContext *context); 10 | }; 11 | 12 | template 13 | struct DenseToSparse : Ort::CustomOpBase, DenseToSparseKernel> { 14 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 15 | const char *GetName() const; 16 | const char *GetExecutionProviderType() const; 17 | std::size_t GetInputTypeCount() const; 18 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 19 | std::size_t GetOutputTypeCount() const; 20 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 21 | }; 22 | 23 | template struct SparseToDenseKernel { 24 | SparseToDenseKernel(const OrtApi &api, const OrtKernelInfo *info); 25 | void Compute(OrtKernelContext *context); 26 | }; 27 | 28 | template 29 | struct SparseToDense : Ort::CustomOpBase, SparseToDenseKernel> { 30 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 31 | const char *GetName() const; 32 | const char *GetExecutionProviderType() const; 33 | std::size_t GetInputTypeCount() const; 34 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 35 | std::size_t GetOutputTypeCount() const; 36 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 37 | }; 38 | 39 | } // namespace ortops 40 | -------------------------------------------------------------------------------- 
/onnx_extended/ortops/optim/cpu/ort_svm.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cpu/c_op_svm_common_.hpp" 5 | 6 | namespace ortops { 7 | 8 | template struct SVMKernel { 9 | SVMKernel(const OrtApi &api, const OrtKernelInfo *info); 10 | void Compute(OrtKernelContext *context); 11 | 12 | // Attributes 13 | int64_t n_targets_or_classes; 14 | std::unique_ptr> svm_type; 15 | bool is_classifier; 16 | }; 17 | 18 | template struct SVMRegressor : Ort::CustomOpBase, SVMKernel> { 19 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 20 | const char *GetName() const; 21 | const char *GetExecutionProviderType() const; 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | std::size_t GetOutputTypeCount() const; 25 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 26 | }; 27 | 28 | template struct SVMClassifier : Ort::CustomOpBase, SVMKernel> { 29 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 30 | const char *GetName() const; 31 | const char *GetExecutionProviderType() const; 32 | std::size_t GetInputTypeCount() const; 33 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 34 | std::size_t GetOutputTypeCount() const; 35 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 36 | }; 37 | 38 | } // namespace ortops 39 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_tfidf_vectorizer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cpu/c_op_tfidf_vectorizer_.hpp" 5 | // #include 6 | 7 | namespace ortops { 8 | 9 | template struct TfIdfVectorizerKernel { 10 | 11 | #if __cplusplus >= 202002L 12 | typedef std::span span_type_tout; 13 | typedef std::span span_type_int64; 14 | #else // std_::span is the pre-C++20 fallback defined in common/simple_span.h 15 | typedef std_::span span_type_tout; 16 | typedef std_::span span_type_int64; 17 | #endif 18 | TfIdfVectorizerKernel(const OrtApi &api, const OrtKernelInfo *info); 19 | void Compute(OrtKernelContext *context); 20 | 21 | std::unique_ptr> tfidf_typed; 22 | }; 23 | 24 | template 25 | struct TfIdfVectorizer 26 | : Ort::CustomOpBase, TfIdfVectorizerKernel> { 27 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 28 | const char *GetName() const; 29 | const char *GetExecutionProviderType() const; 30 | std::size_t GetInputTypeCount() const; 31 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 32 | std::size_t GetOutputTypeCount() const; 33 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 34 | }; 35 | 36 | } // namespace ortops 37 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cpu/ort_tree_ensemble.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cpu/c_op_tree_ensemble_common_.hpp" 5 | #include "cpu/c_op_tree_ensemble_common_classifier_.hpp" 6 | 7 | namespace ortops { 8 | 9 | template struct TreeEnsembleKernel { 10 | TreeEnsembleKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | 13 | // Attributes 14 | int64_t n_targets_or_classes; 15 | std::unique_ptr> 16 | reg_type_type_type; 17 | std::unique_ptr> 18 |
cls_type_type_type; 19 | bool is_classifier; 20 | }; 21 | 22 | template 23 | struct TreeEnsembleRegressor 24 | : Ort::CustomOpBase, 25 | TreeEnsembleKernel> { 26 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 27 | const char *GetName() const; 28 | const char *GetExecutionProviderType() const; 29 | std::size_t GetInputTypeCount() const; 30 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 31 | std::size_t GetOutputTypeCount() const; 32 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 33 | }; 34 | 35 | template 36 | struct TreeEnsembleClassifier : Ort::CustomOpBase, 37 | TreeEnsembleKernel> { 38 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 39 | const char *GetName() const; 40 | const char *GetExecutionProviderType() const; 41 | std::size_t GetInputTypeCount() const; 42 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 43 | std::size_t GetOutputTypeCount() const; 44 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 45 | }; 46 | 47 | } // namespace ortops 48 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/add_or_mul_shared_input.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddOrMulSharedInputKernel { 10 | AddOrMulSharedInputKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct AddOrMulSharedInputOp 16 | : Ort::CustomOpBase, AddOrMulSharedInputKernel> { 17 | typedef Ort::CustomOpBase, AddOrMulSharedInputKernel> 18 | parent_type; 19 | AddOrMulSharedInputOp() : parent_type() {} 20 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 21 | const char *GetName() const; 22 | const char *GetExecutionProviderType() const; 23 | 24 | std::size_t GetInputTypeCount() const; 25 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 26 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 27 | 28 | std::size_t GetOutputTypeCount() const; 29 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 30 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 31 | }; 32 | 33 | } // namespace ortops 34 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/addaddaddmulmulmul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddAddAddMulMulMulKernel { 10 | AddAddAddMulMulMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct AddAddAddMulMulMulOp : Ort::CustomOpBase, 16 | AddAddAddMulMulMulKernel> { 17 | typedef Ort::CustomOpBase, 18 | AddAddAddMulMulMulKernel> 19 | parent_type; 20 | AddAddAddMulMulMulOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | 
OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/addaddmulmul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddAddMulMulKernel { 10 | AddAddMulMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct AddAddMulMulOp 16 | : Ort::CustomOpBase, AddAddMulMulKernel> { 17 | typedef Ort::CustomOpBase, AddAddMulMulKernel> 18 | parent_type; 19 | AddAddMulMulOp() : parent_type() {} 20 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 21 | const char *GetName() const; 22 | const char *GetExecutionProviderType() const; 23 | 24 | std::size_t GetInputTypeCount() const; 25 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 26 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 27 | 28 | std::size_t GetOutputTypeCount() const; 29 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 30 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 31 | }; 32 | 33 | } // namespace ortops 34 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/addmul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct AddMulKernel { 10 | AddMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | private: 13 | // If true, the operator assumes there are 4 dimensions and the two middle ones are switched. 
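// (An interpretation of the flag name: with inputs of shape (d0, d1, d2, d3),
// the fused Add/Mul result would then be laid out as (d0, d2, d1, d3); the
// .cu implementation is authoritative.)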
14 | bool switch_middle_axis_; 15 | }; 16 | 17 | template 18 | struct AddMulOp : Ort::CustomOpBase, AddMulKernel> { 19 | typedef Ort::CustomOpBase, AddMulKernel> parent_type; 20 | AddMulOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/mul_mul_sigmoid.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct MulMulSigmoidKernel { 10 | MulMulSigmoidKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct MulMulSigmoidOp : Ort::CustomOpBase, MulMulSigmoidKernel> { 16 | typedef Ort::CustomOpBase, MulMulSigmoidKernel> parent_type; 17 | MulMulSigmoidOp() : parent_type() {} 18 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 19 | const char *GetName() const; 20 | const char *GetExecutionProviderType() const; 21 | 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 25 | 26 | std::size_t GetOutputTypeCount() const; 27 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 28 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 29 | }; 30 | 31 | } // namespace ortops 32 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/mul_sigmoid.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct MulSigmoidKernel { 10 | MulSigmoidKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct MulSigmoidOp : Ort::CustomOpBase, MulSigmoidKernel> { 16 | typedef Ort::CustomOpBase, MulSigmoidKernel> parent_type; 17 | MulSigmoidOp() : parent_type() {} 18 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 19 | const char *GetName() const; 20 | const char *GetExecutionProviderType() const; 21 | 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 25 | 26 | std::size_t GetOutputTypeCount() const; 27 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 28 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 29 | }; 30 | 31 | } // namespace ortops 32 | 
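The CUDA headers in this folder all follow the same CustomOpBase pattern, and the compiled library is registered with onnxruntime the same way as the CPU one. A minimal usage sketch in Python; the get_ort_ext_libs helper and the "onnx_extended.ortops.optim.cuda" domain name are assumptions inferred from the CPU package above, and NegXplus1 is assumed from its name to compute 1 - X:

import numpy as np
from onnx import TensorProto
from onnx.helper import (
    make_graph,
    make_model,
    make_node,
    make_opsetid,
    make_tensor_value_info,
)
import onnxruntime

# Assumption: the CUDA ops package exposes its compiled libraries
# like the CPU package shown earlier.
from onnx_extended.ortops.optim.cuda import get_ort_ext_libs

X = make_tensor_value_info("X", TensorProto.FLOAT, [None])
Y = make_tensor_value_info("Y", TensorProto.FLOAT, [None])
model = make_model(
    make_graph(
        # Assumption: NegXplus1 computes 1 - X, judging from the header name.
        [make_node("NegXplus1", ["X"], ["Y"],
                   domain="onnx_extended.ortops.optim.cuda")],
        "example",
        [X],
        [Y],
    ),
    opset_imports=[
        make_opsetid("", 18),
        make_opsetid("onnx_extended.ortops.optim.cuda", 1),
    ],
)

so = onnxruntime.SessionOptions()
so.register_custom_ops_library(get_ort_ext_libs()[0])  # calls RegisterCustomOps
sess = onnxruntime.InferenceSession(
    model.SerializeToString(), so, providers=["CUDAExecutionProvider"]
)
print(sess.run(None, {"X": np.array([0.5, 2.0], dtype=np.float32)})[0])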
-------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/negxplus1.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct NegXplus1Kernel { 10 | NegXplus1Kernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | }; 13 | 14 | template 15 | struct NegXplus1Op : Ort::CustomOpBase, NegXplus1Kernel> { 16 | typedef Ort::CustomOpBase, NegXplus1Kernel> parent_type; 17 | NegXplus1Op() : parent_type() {} 18 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 19 | const char *GetName() const; 20 | const char *GetExecutionProviderType() const; 21 | 22 | std::size_t GetInputTypeCount() const; 23 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 24 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 25 | 26 | std::size_t GetOutputTypeCount() const; 27 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 28 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 29 | }; 30 | 31 | } // namespace ortops 32 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/ort_optim_cuda_lib.h: -------------------------------------------------------------------------------- 1 | // Source: https://github.com/microsoft/onnxruntime/tree/main/ 2 | // onnxruntime/test/testdata/custom_op_get_const_input_test_library 3 | #pragma once 4 | 5 | #include "ortapi_c_api_header.h" 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options, 12 | const OrtApiBase *api_base); 13 | 14 | #ifdef __cplusplus 15 | } 16 | #endif 17 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/replace_zero.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct ReplaceZeroKernel { 10 | ReplaceZeroKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | 13 | private: 14 | float by_; 15 | }; 16 | 17 | template 18 | struct ReplaceZeroOp : Ort::CustomOpBase, ReplaceZeroKernel> { 19 | typedef Ort::CustomOpBase, ReplaceZeroKernel> parent_type; 20 | ReplaceZeroOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/rotary.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 
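// Rotary position-embedding kernel; RotarySide below presumably selects which
// half of the last dimension gets negated when the halves are swapped (an
// inference from the enum names; the .cu implementation is authoritative).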
#include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | enum class RotarySide : int { 10 | LEFT = 1, 11 | RIGHT = 2, 12 | }; 13 | 14 | template struct RotaryKernel { 15 | RotaryKernel(const OrtApi &api, const OrtKernelInfo *info); 16 | void Compute(OrtKernelContext *context); 17 | 18 | private: 19 | RotarySide rotary_side_; 20 | }; 21 | 22 | template struct RotaryOp : Ort::CustomOpBase, RotaryKernel> { 23 | typedef Ort::CustomOpBase, RotaryKernel> parent_type; 24 | RotaryOp() : parent_type() {} 25 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 26 | const char *GetName() const; 27 | const char *GetExecutionProviderType() const; 28 | 29 | std::size_t GetInputTypeCount() const; 30 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 32 | OrtMemType GetInputMemoryType(std::size_t index) const; 33 | 34 | std::size_t GetOutputTypeCount() const; 35 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 36 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 37 | }; 38 | 39 | } // namespace ortops 40 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/scatter_nd_of_shape.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include "scatter_nd_of_shape_common.h" 6 | #include 7 | 8 | namespace ortops { 9 | 10 | template struct ScatterNDOfShapeKernel { 11 | ScatterNDOfShapeKernel(const OrtApi &api, const OrtKernelInfo *info); 12 | void Compute(OrtKernelContext *context); 13 | 14 | private: 15 | void ComputeNone(cudaStream_t &stream, const std::vector &input_shape, 16 | const std::vector &indices_shape, T *output_data, 17 | const int64_t *indices_data, const T *updates_data) const; 18 | void ComputeOptimize(cudaStream_t &stream, const std::vector &input_shape, 19 | const std::vector &indices_shape, T *output_data, 20 | const int64_t *indices_data, const T *updates_data) const; 21 | 22 | Reduction reduction_; 23 | Strategy strategy_; 24 | int maxThreadPerBlock_; 25 | }; 26 | 27 | template 28 | struct ScatterNDOfShapeOp 29 | : Ort::CustomOpBase, ScatterNDOfShapeKernel> { 30 | typedef Ort::CustomOpBase, ScatterNDOfShapeKernel> parent_type; 31 | ScatterNDOfShapeOp() : parent_type() {} 32 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 33 | const char *GetName() const; 34 | const char *GetExecutionProviderType() const; 35 | 36 | std::size_t GetInputTypeCount() const; 37 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 38 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 39 | OrtMemType GetInputMemoryType(std::size_t index) const; 40 | 41 | std::size_t GetOutputTypeCount() const; 42 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 43 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 44 | }; 45 | 46 | } // namespace ortops 47 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_common.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | namespace ortops { 4 | 5 | enum class Reduction : int { 6 | None = 0, 7 | Add = 1, 8 
| Mul = 2, 9 | Min = 3, 10 | Max = 4, 11 | }; 12 | 13 | enum class Strategy : int { 14 | None = 0, 15 | Optimize = 1, 16 | }; 17 | 18 | struct Shape2 { 19 | int64_t dims[12]; 20 | }; 21 | 22 | } // namespace ortops 23 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/scatter_nd_of_shape_masked.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include "scatter_nd_of_shape_common.h" 6 | #include 7 | 8 | namespace ortops { 9 | 10 | template struct MaskedScatterNDOfShapeKernel { 11 | MaskedScatterNDOfShapeKernel(const OrtApi &api, const OrtKernelInfo *info); 12 | void Compute(OrtKernelContext *context); 13 | 14 | private: 15 | void ComputeOptimize(cudaStream_t &stream, const std::vector &input_shape, 16 | const std::vector &indices_shape, T *output_data, 17 | const int64_t *indices_data, const T *updates_data) const; 18 | 19 | Reduction reduction_; 20 | int maxThreadPerBlock_; 21 | int64_t masked_value_; 22 | }; 23 | 24 | template 25 | struct MaskedScatterNDOfShapeOp 26 | : Ort::CustomOpBase, MaskedScatterNDOfShapeKernel> { 27 | typedef Ort::CustomOpBase, MaskedScatterNDOfShapeKernel> 28 | parent_type; 29 | MaskedScatterNDOfShapeOp() : parent_type() {} 30 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 31 | const char *GetName() const; 32 | const char *GetExecutionProviderType() const; 33 | 34 | std::size_t GetInputTypeCount() const; 35 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 36 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 37 | OrtMemType GetInputMemoryType(std::size_t index) const; 38 | 39 | std::size_t GetOutputTypeCount() const; 40 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 41 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 42 | }; 43 | 44 | } // namespace ortops 45 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/submul.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "common/common_kernels.h" 4 | #include "cublas_v2.h" 5 | #include 6 | 7 | namespace ortops { 8 | 9 | template struct SubMulKernel { 10 | SubMulKernel(const OrtApi &api, const OrtKernelInfo *info); 11 | void Compute(OrtKernelContext *context); 12 | 13 | private: 14 | bool negative_; 15 | }; 16 | 17 | template 18 | struct SubMulOp : Ort::CustomOpBase, SubMulKernel> { 19 | typedef Ort::CustomOpBase, SubMulKernel> parent_type; 20 | SubMulOp() : parent_type() {} 21 | void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const; 22 | const char *GetName() const; 23 | const char *GetExecutionProviderType() const; 24 | 25 | std::size_t GetInputTypeCount() const; 26 | ONNXTensorElementDataType GetInputType(std::size_t index) const; 27 | OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const; 28 | 29 | std::size_t GetOutputTypeCount() const; 30 | ONNXTensorElementDataType GetOutputType(std::size_t index) const; 31 | OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const; 32 | }; 33 | 34 | } // namespace ortops 35 | -------------------------------------------------------------------------------- /onnx_extended/ortops/optim/cuda/transpose_cast_2d.h: 
--------------------------------------------------------------------------------
/onnx_extended/ortops/optim/cuda/submul.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

template <typename T> struct SubMulKernel {
  SubMulKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);

private:
  bool negative_;
};

template <typename T>
struct SubMulOp : Ort::CustomOpBase<SubMulOp<T>, SubMulKernel<T>> {
  typedef Ort::CustomOpBase<SubMulOp<T>, SubMulKernel<T>> parent_type;
  SubMulOp() : parent_type() {}
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/optim/cuda/transpose_cast_2d.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

struct Transpose2DCastKernel {
  Transpose2DCastKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

struct Transpose2DCastOp
    : Ort::CustomOpBase<Transpose2DCastOp, Transpose2DCastKernel> {
  typedef Ort::CustomOpBase<Transpose2DCastOp, Transpose2DCastKernel> parent_type;
  Transpose2DCastOp(ONNXTensorElementDataType input_type, ONNXTensorElementDataType output_type)
      : parent_type() {
    input_type_ = input_type;
    output_type_ = output_type;
  }
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;

private:
  ONNXTensorElementDataType input_type_;
  ONNXTensorElementDataType output_type_;
};

} // namespace ortops
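`Transpose2DCast` fuses a 2D transpose with a type cast so the intermediate tensor is
never materialized. In numpy terms, for a float32-to-float16 instantiation:

    import numpy as np

    x = np.random.rand(4, 8).astype(np.float32)
    y = x.T.astype(np.float16)  # one fused pass instead of Transpose followed by Cast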
--------------------------------------------------------------------------------
/onnx_extended/ortops/optim/cuda/tri_matrix.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

template <typename T> struct TriMatrixKernel {
  TriMatrixKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

template <typename T>
struct TriMatrixOp : Ort::CustomOpBase<TriMatrixOp<T>, TriMatrixKernel<T>> {
  typedef Ort::CustomOpBase<TriMatrixOp<T>, TriMatrixKernel<T>> parent_type;
  TriMatrixOp() : parent_type() {}
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;
  OrtMemType GetInputMemoryType(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/custom_tree_assembly.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct CustomTreeAssemblyKernel {
  CustomTreeAssemblyKernel(const OrtApi &api, const OrtKernelInfo *info, bool classifier);
  void Compute(OrtKernelContext *context);
  ~CustomTreeAssemblyKernel();

  bool classifier_;
  std::string assembly_name_;
  /* TreebeardSORunner */ void *assembly_runner_;
};

struct CustomTreeAssemblyOp
    : Ort::CustomOpBase<CustomTreeAssemblyOp, CustomTreeAssemblyKernel> {
  typedef Ort::CustomOpBase<CustomTreeAssemblyOp, CustomTreeAssemblyKernel> parent_type;
  CustomTreeAssemblyOp(bool classifier) : parent_type(), classifier_(classifier) {}
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;

private:
  bool classifier_;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/dynamic_quantize_linear.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct DynamicQuantizeLinearKernel {
  DynamicQuantizeLinearKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);

private:
  template <typename T>
  void ComputeInternal(int64_t n_elements, const T *input, uint8_t *output, float &scale,
                       uint8_t &zero_point);

  int64_t to_;
};

struct DynamicQuantizeLinearOp
    : Ort::CustomOpBase<DynamicQuantizeLinearOp, DynamicQuantizeLinearKernel> {
  typedef Ort::CustomOpBase<DynamicQuantizeLinearOp, DynamicQuantizeLinearKernel> parent_type;
  DynamicQuantizeLinearOp(ONNXTensorElementDataType input_type,
                          ONNXTensorElementDataType quant_type)
      : parent_type(), input_type_(input_type), quant_type_(quant_type) {}

  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const noexcept;
  const char *GetName() const noexcept;
  const char *GetExecutionProviderType() const noexcept;

  std::size_t GetInputTypeCount() const noexcept;
  ONNXTensorElementDataType GetInputType(std::size_t index) const noexcept;

  std::size_t GetOutputTypeCount() const noexcept;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;

private:
  ONNXTensorElementDataType input_type_;
  ONNXTensorElementDataType quant_type_;
};

} // namespace ortops
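`DynamicQuantizeLinear` derives the scale and zero point from the input range at run time;
the `to_` attribute extends the choice of quantized type to float 8. A sketch of the
standard uint8 formula the kernel generalizes, following the ONNX operator definition:

    import numpy as np

    def dynamic_quantize_linear_uint8(x):
        # The range must include zero so that zero is exactly representable.
        rmin, rmax = min(float(x.min()), 0.0), max(float(x.max()), 0.0)
        scale = (rmax - rmin) / 255.0 or 1.0
        zero_point = np.uint8(np.clip(round(-rmin / scale), 0, 255))
        y = np.clip(np.rint(x / scale) + zero_point, 0, 255).astype(np.uint8)
        return y, np.float32(scale), zero_point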
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/my_kernel.cc:
--------------------------------------------------------------------------------
#include "my_kernel.h"

namespace ortops {

MyCustomKernel::MyCustomKernel(const OrtApi & /* api */, const OrtKernelInfo * /* info */) {}

void MyCustomKernel::Compute(OrtKernelContext *context) {
  Ort::KernelContext ctx(context);
  Ort::ConstValue input_X = ctx.GetInput(0);
  Ort::ConstValue input_Y = ctx.GetInput(1);
  const float *X = input_X.GetTensorData<float>();
  const float *Y = input_Y.GetTensorData<float>();

  // Setup output, which is assumed to have the same dimensions as the inputs.
  std::vector<int64_t> dimensions = input_X.GetTensorTypeAndShapeInfo().GetShape();

  Ort::UnownedValue output = ctx.GetOutput(0, dimensions);
  float *out = output.GetTensorMutableData<float>();

  const std::size_t size = output.GetTensorTypeAndShapeInfo().GetElementCount();

  // Do computation
  for (std::size_t i = 0; i < size; i++) {
    out[i] = X[i] + Y[i];
  }
}

void *MyCustomOp::CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const {
  return std::make_unique<MyCustomKernel>(api, info).release();
}

const char *MyCustomOp::GetName() const { return "MyCustomOp"; }

const char *MyCustomOp::GetExecutionProviderType() const { return "CPUExecutionProvider"; }

size_t MyCustomOp::GetInputTypeCount() const { return 2; }

ONNXTensorElementDataType MyCustomOp::GetInputType(std::size_t /* index */) const {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}

size_t MyCustomOp::GetOutputTypeCount() const { return 1; }

ONNXTensorElementDataType MyCustomOp::GetOutputType(std::size_t /* index */) const {
  return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
}

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/my_kernel.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct MyCustomKernel {
  MyCustomKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

struct MyCustomOp : Ort::CustomOpBase<MyCustomOp, MyCustomKernel> {
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;
  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/my_kernel_attr.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"

namespace ortops {

struct MyCustomKernelWithAttributes {
  MyCustomKernelWithAttributes(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);

private:
  std::string att_string;
  float att_float;
  int64_t att_int64;
  std::vector<double> att_tensor_double;
};

struct MyCustomOpWithAttributes
    : Ort::CustomOpBase<MyCustomOpWithAttributes, MyCustomKernelWithAttributes> {
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;
  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
};

} // namespace ortops
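`MyCustomOp` above is the smallest possible kernel (an element-wise float addition),
which makes it a convenient smoke test for the whole registration chain. A sketch of
driving it from Python; the model-building details are illustrative:

    import numpy as np
    from onnx import TensorProto, helper as oh
    import onnxruntime
    from onnx_extended.ortops.tutorial.cpu import get_ort_ext_libs

    node = oh.make_node("MyCustomOp", ["X", "Y"], ["Z"],
                        domain="onnx_extended.ortops.tutorial.cpu")
    graph = oh.make_graph(
        [node], "g",
        [oh.make_tensor_value_info(n, TensorProto.FLOAT, [None]) for n in ("X", "Y")],
        [oh.make_tensor_value_info("Z", TensorProto.FLOAT, [None])])
    model = oh.make_model(graph, opset_imports=[
        oh.make_opsetid("", 18),
        oh.make_opsetid("onnx_extended.ortops.tutorial.cpu", 1)])

    opts = onnxruntime.SessionOptions()
    opts.register_custom_ops_library(get_ort_ext_libs()[0])
    sess = onnxruntime.InferenceSession(
        model.SerializeToString(), opts, providers=["CPUExecutionProvider"])
    x = np.arange(4, dtype=np.float32)
    print(sess.run(None, {"X": x, "Y": x})[0])  # x + x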
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.cc:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library

#include <mutex>
#include <vector>

#include "custom_gemm.h"
#include "custom_tree_assembly.h"
#include "dynamic_quantize_linear.h"
#include "my_kernel.h"
#include "my_kernel_attr.h"
#include "ort_tutorial_cpu_lib.h"
#include "ortapi_version.h"

static const char *c_OpDomain = "onnx_extended.ortops.tutorial.cpu";

static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) {
  static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
  static std::mutex ort_custom_op_domain_mutex;
  std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
  ort_custom_op_domain_container.push_back(std::move(domain));
}

OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                          const OrtApiBase *api_base) {
  Ort::InitApi(api_base->GetApi(ORT_API_VERSION_SUPPORTED));
  Ort::UnownedSessionOptions session_options(options);

  // These instances remain available until onnxruntime unloads the library.
  static ortops::MyCustomOp c_CustomOp;
  static ortops::MyCustomOpWithAttributes c_CustomOpAttr;
  static ortops::CustomGemmOp c_CustomGemmFloat(
      "CustomGemmFloat", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
  static ortops::CustomGemmOp c_CustomGemmFloat16(
      "CustomGemmFloat16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, false);
  static ortops::CustomTreeAssemblyOp c_CustomTreeAssembly(false);

#if ORT_API_VERSION_SUPPORTED >= 16
  static ortops::DynamicQuantizeLinearOp c_dql(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
                                               ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN);

  static ortops::CustomGemmOp c_CustomGemmFloat8E4M3FN(
      "CustomGemmFloat8E4M3FN", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
#endif

  try {
    Ort::CustomOpDomain domain{c_OpDomain};

    domain.Add(&c_CustomOp);
    domain.Add(&c_CustomOpAttr);
    domain.Add(&c_CustomGemmFloat);
    domain.Add(&c_CustomGemmFloat16);
    domain.Add(&c_CustomTreeAssembly);
#if ORT_API_VERSION_SUPPORTED >= 16
    domain.Add(&c_dql);
    domain.Add(&c_CustomGemmFloat8E4M3FN);
#endif

    session_options.Add(domain);
    AddOrtCustomOpDomainToContainer(std::move(domain));
  } catch (const std::exception &e) {
    Ort::Status status{e};
    return status.release();
  }

  return nullptr;
}
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cpu/ort_tutorial_cpu_lib.h:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library
#pragma once

#include "ortapi_c_api_header.h"

#ifdef __cplusplus
extern "C" {
#endif

ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                                     const OrtApiBase *api_base);

#ifdef __cplusplus
}
#endif
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/__init__.py:
--------------------------------------------------------------------------------
import os
import textwrap
from typing import List
from ... import _get_ort_ext_libs


def get_ort_ext_libs() -> List[str]:
    """
    Returns the list of libraries implementing simple new
    :epkg:`onnxruntime` kernels for the
    :epkg:`CUDAExecutionProvider`.
    """
    libs = _get_ort_ext_libs(os.path.dirname(__file__))
    return [lib for lib in libs if "cuda_cuda" not in lib]


def documentation() -> List[str]:
    """
    Returns a list of rst strings documenting every kernel
    implemented in this subfolder.
    """
    return list(
        map(
            textwrap.dedent,
            [
                """
                onnx_extended.ortops.tutorial.cuda.CustomGemm
                ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

                It calls the CUDA library for Gemm :math:`\\alpha A B + \\beta C`.

                **Provider**

                CUDAExecutionProvider

                **Inputs**

                * A (T): tensor of type T
                * B (T): tensor of type T
                * C (T): tensor of type T
                * D (T): tensor of type T
                * E (T): tensor of type T

                **Outputs**

                * Z (T): :math:`\\alpha A B + \\beta C`

                **Constraints**

                * T: float, float16, bfloat16
                """
            ],
        )
    )
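The `documentation` helper is how the package documentation pulls kernel descriptions out
of each binary's companion module; printing it is a quick way to check what a build
exposes:

    from onnx_extended.ortops.tutorial.cuda import documentation, get_ort_ext_libs

    print("\n".join(documentation()))  # rst description of CustomGemm
    print(get_ort_ext_libs())          # compiled libraries found next to this module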
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/matx_matmul.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/common_kernels.h"
#include "cublas_v2.h"
#include <cuda_runtime.h>

namespace ortops {

struct MatXMatMulKernel {
  MatXMatMulKernel(const OrtApi &api, const OrtKernelInfo *info);
  void Compute(OrtKernelContext *context);
};

struct MatXMatMulOp : Ort::CustomOpBase<MatXMatMulOp, MatXMatMulKernel> {
  typedef Ort::CustomOpBase<MatXMatMulOp, MatXMatMulKernel> parent_type;
  MatXMatMulOp(const char *op_name, ONNXTensorElementDataType dtype) : parent_type() {
    op_name_ = op_name;
    dtype_ = dtype;
  }
  void *CreateKernel(const OrtApi &api, const OrtKernelInfo *info) const;
  const char *GetName() const;
  const char *GetExecutionProviderType() const;

  std::size_t GetInputTypeCount() const;
  ONNXTensorElementDataType GetInputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(std::size_t index) const;

  std::size_t GetOutputTypeCount() const;
  ONNXTensorElementDataType GetOutputType(std::size_t index) const;
  OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(std::size_t index) const;

private:
  const char *op_name_;
  ONNXTensorElementDataType dtype_;
};

} // namespace ortops
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.cc:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library

#include <mutex>
#include <vector>

#include "custom_gemm.h"
#include "matx_matmul.h"
#include "ort_tutorial_cuda_lib.h"
#include "ortapi_version.h"

static const char *c_OpDomain = "onnx_extended.ortops.tutorial.cuda";

static void AddOrtCustomOpDomainToContainer(Ort::CustomOpDomain &&domain) {
  static std::vector<Ort::CustomOpDomain> ort_custom_op_domain_container;
  static std::mutex ort_custom_op_domain_mutex;
  std::lock_guard<std::mutex> lock(ort_custom_op_domain_mutex);
  ort_custom_op_domain_container.push_back(std::move(domain));
}

OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                          const OrtApiBase *api_base) {
  Ort::InitApi(api_base->GetApi(ORT_API_VERSION_SUPPORTED));
  Ort::UnownedSessionOptions session_options(options);

  // These instances remain available until onnxruntime unloads the library.
  static ortops::CustomGemmOp c_CustomGemmFloat(
      "CustomGemmFloat", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
  static ortops::CustomGemmOp c_CustomGemmFloat16(
      "CustomGemmFloat16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16, false);

  static ortops::MatXMatMulOp c_MaxMatMulFloat("MaXMatMulFloat",
                                               ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT);

#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080
  static ortops::CustomGemmOp c_CustomGemmFloat8E4M3FN(
      "CustomGemmFloat8E4M3FN", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
  static ortops::CustomGemmOp c_CustomGemmFloat8E4M3FNTime(
      "CustomGemmFloat8E4M3FNTime", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E4M3FN,
      ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, false);
#endif

  try {
    Ort::CustomOpDomain domain{c_OpDomain};

    domain.Add(&c_CustomGemmFloat);
    domain.Add(&c_CustomGemmFloat16);
    domain.Add(&c_MaxMatMulFloat);
#if ORT_VERSION >= 1160 && CUDA_VERSION >= 11080
    domain.Add(&c_CustomGemmFloat8E4M3FN);
    domain.Add(&c_CustomGemmFloat8E4M3FNTime);
#endif

    session_options.Add(domain);
    AddOrtCustomOpDomainToContainer(std::move(domain));
  } catch (const std::exception &e) {
    Ort::Status status{e};
    return status.release();
  }

  return nullptr;
}
--------------------------------------------------------------------------------
/onnx_extended/ortops/tutorial/cuda/ort_tutorial_cuda_lib.h:
--------------------------------------------------------------------------------
// Source: https://github.com/microsoft/onnxruntime/tree/main/
// onnxruntime/test/testdata/custom_op_get_const_input_test_library
#pragma once

#include "ortapi_c_api_header.h"

#ifdef __cplusplus
extern "C" {
#endif

ORT_EXPORT OrtStatus *ORT_API_CALL RegisterCustomOps(OrtSessionOptions *options,
                                                     const OrtApiBase *api_base);

#ifdef __cplusplus
}
#endif
--------------------------------------------------------------------------------
/onnx_extended/plotting/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/plotting/__init__.py
--------------------------------------------------------------------------------
/onnx_extended/reference/__init__.py:
--------------------------------------------------------------------------------
from typing import Union
import numpy as np
from onnx import SparseTensorProto, TensorProto
from onnx.reference.op_run import to_array_extended as onnx_to_array_extended
from .c_reference_evaluator import CReferenceEvaluator, from_array_extended


def to_array_extended(
    tensor: Union[SparseTensorProto, TensorProto],
) -> Union[np.ndarray, "scipy.sparse.coo_matrix"]:  # noqa: F821
    """
    Overwrites function `onnx.reference.op_run.to_array_extended`
    to support sparse tensors.
    """
    if isinstance(tensor, TensorProto):
        return onnx_to_array_extended(tensor)
    if isinstance(tensor, SparseTensorProto):
        import scipy.sparse as sp

        shape = tuple(d for d in tensor.dims)
        indices = onnx_to_array_extended(tensor.indices)
        values = onnx_to_array_extended(tensor.values)
        if len(indices.shape) == 1:
            t = sp.csr_matrix(
                (values, indices, np.array([0, len(indices)], dtype=np.int64)),
                shape=(1, np.prod(shape)),
            )
            return t.reshape(shape)
        if len(indices.shape) == 2:
            t = sp.coo_matrix((values, (indices[:, 0], indices[:, 1])), shape=shape)
            return t
        raise RuntimeError(f"Unexpected indices shape: {indices.shape}.")
    raise TypeError(f"Unexpected type {type(tensor)}.")
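A quick round-trip for the sparse branch of `to_array_extended`; the 1-D indices form
stores linear positions into the flattened tensor (a sketch):

    import numpy as np
    from onnx import TensorProto, helper
    from onnx_extended.reference import to_array_extended

    values = helper.make_tensor("values", TensorProto.FLOAT, [3], [1.0, 2.0, 3.0])
    indices = helper.make_tensor("indices", TensorProto.INT64, [3], [1, 4, 8])
    sparse = helper.make_sparse_tensor(values, indices, [3, 3])
    mat = to_array_extended(sparse)  # scipy sparse matrix with shape (3, 3)
    print(mat.todense())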
16 | """ 17 | if classlabels_int64s_string is not None and len(classlabels_int64s_string) > 0: 18 | new_label = [] 19 | no_array = False 20 | for i in label: 21 | if i >= len(classlabels_int64s_string): 22 | new_label.append(None) 23 | no_array = True 24 | else: 25 | new_label.append(classlabels_int64s_string[i]) 26 | if no_array: 27 | return new_label, scores 28 | return numpy.array(new_label), scores 29 | return label, scores 30 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/c_op_conv.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | 3 | import numpy as np 4 | 5 | from onnx import NodeProto 6 | from onnx.reference.op_run import OpRun 7 | from .cpu.c_op_conv_ import ConvDouble, ConvFloat 8 | 9 | 10 | class Conv(OpRun): 11 | def __init__( 12 | self, onnx_node: NodeProto, run_params: Dict[str, Any], schema: Any = None 13 | ): 14 | OpRun.__init__(self, onnx_node, run_params, schema) 15 | self.cache_: Dict[type, Any] = {} 16 | 17 | def _run( 18 | self, 19 | X, 20 | W, 21 | B=None, 22 | auto_pad=None, 23 | dilations=None, 24 | group=None, 25 | kernel_shape=None, 26 | pads=None, 27 | strides=None, 28 | ): 29 | if X.dtype not in self.cache_: 30 | if X.dtype == np.float32: 31 | rt = ConvFloat() 32 | elif X.dtype == np.float64: 33 | rt = ConvDouble() 34 | else: 35 | raise TypeError( 36 | f"No C implementation C for operator 'Conv' and dtype={X.dtype}." 37 | ) 38 | self.cache_[X.dtype] = rt 39 | 40 | rt.init( 41 | auto_pad, 42 | np.array(dilations or [], dtype=np.int64), 43 | group, 44 | np.array(kernel_shape or [], dtype=np.int64), 45 | np.array(pads or [], dtype=np.int64), 46 | np.array(strides or [], dtype=np.int64), 47 | ) 48 | 49 | rt = self.cache_[X.dtype] 50 | 51 | assert X is not None, f"X cannot be None for operator {type(self)}." 52 | assert ( 53 | min(X.shape) != 0 54 | ), f"Unable to run operator Conv on an empty matrix. X.shape={X.shape!r}." 55 | assert ( 56 | B is None or min(B.shape) != 0 57 | ), f"Unable to run operator Conv on an empty matrix. B.shape={B.shape!r}." 58 | cv = rt.compute(X, W, B) 59 | return (cv,) 60 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/c_op_svm_regressor.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict 2 | import numpy as np 3 | from onnx import NodeProto 4 | from onnx.reference.op_run import OpRun 5 | from .cpu.c_op_svm_py_ import ( 6 | RuntimeSVMRegressorFloat, 7 | RuntimeSVMRegressorDouble, 8 | ) 9 | 10 | 11 | class SVMRegressor(OpRun): 12 | op_domain = "ai.onnx.ml" 13 | 14 | def __init__( 15 | self, onnx_node: NodeProto, run_params: Dict[str, Any], schema: Any = None 16 | ): 17 | OpRun.__init__(self, onnx_node, run_params, schema=schema) 18 | self.rt_ = None 19 | 20 | def _run( 21 | self, 22 | x, 23 | coefficients=None, 24 | kernel_params=None, 25 | kernel_type=None, 26 | n_supports=None, 27 | one_class=None, 28 | post_transform=None, 29 | rho=None, 30 | support_vectors=None, 31 | ): 32 | """ 33 | This is a C++ implementation coming from 34 | :epkg:`onnxruntime`. 35 | `svm_regressor.cc 36 | `_. 37 | See class :class:`RuntimeSVMRegressor 38 | `. 
39 | """ 40 | if self.rt_ is None: 41 | if x.dtype == np.float32: 42 | self.rt_ = RuntimeSVMRegressorFloat() 43 | elif x.dtype == np.float64: 44 | self.rt_ = RuntimeSVMRegressorDouble() 45 | else: 46 | raise NotImplementedError(f"Not implemented for dtype={x.dtype}.") 47 | self.rt_.init( 48 | coefficients, 49 | kernel_params, 50 | kernel_type, 51 | n_supports, 52 | one_class, 53 | post_transform, 54 | rho, 55 | support_vectors, 56 | ) 57 | pred = self.rt_.compute(x) 58 | if pred.shape[0] != x.shape[0]: 59 | pred = pred.reshape(x.shape[0], pred.shape[0] // x.shape[0]) 60 | if len(pred.shape) == 1: 61 | pred = pred.reshape((-1, 1)) 62 | return (pred,) 63 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/cpu/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/reference/c_ops/cpu/c_op_conv_.cpp: -------------------------------------------------------------------------------- 1 | #include "c_op_conv_pybind11.h" 2 | 3 | using namespace onnx_c_ops; 4 | 5 | PYBIND11_MODULE(c_op_conv_, m) { 6 | m.doc() = 7 | #if defined(__APPLE__) 8 | "C++ Reference Implementation for operator Conv." 9 | #else 10 | R"pbdoc(C++ Reference Implementation for operator Conv.)pbdoc" 11 | #endif 12 | ; 13 | 14 | py::class_ clf( 15 | m, "ConvFloat", 16 | R"pbdoc(Implements float runtime for operator Conv. The code is inspired from 17 | `conv.cc `_ 18 | in :epkg:`onnxruntime`. Supports float only.)pbdoc"); 19 | 20 | clf.def(py::init<>()); 21 | clf.def("init", &ConvFloat::init, "Initializes the runtime with the ONNX attributes."); 22 | clf.def("compute", &ConvFloat::compute, "Computes the output for operator Conv."); 23 | 24 | py::class_ cld( 25 | m, "ConvDouble", 26 | R"pbdoc(Implements float runtime for operator Conv. The code is inspired from 27 | `conv.cc `_ 28 | in :epkg:`onnxruntime`. 
Supports double only.)pbdoc"); 29 | 30 | cld.def(py::init<>()); 31 | cld.def("init", &ConvDouble::init, "Initializes the runtime with the ONNX attributes."); 32 | cld.def("compute", &ConvDouble::compute, "Computes the output for operator Conv."); 33 | } 34 | -------------------------------------------------------------------------------- /onnx_extended/reference/other_ops/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/reference/other_ops/op_scatternd_of_shape.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from onnx.reference.op_run import OpRun 3 | from onnx.reference.ops.op_scatternd import _scatter_nd_impl 4 | 5 | 6 | class ScatterNDOfShape(OpRun): 7 | op_domain = "onnx_extended.ortops.optim.cuda" 8 | 9 | def _run(self, shape, indices, updates, reduction=None, strategy=None): 10 | data = np.zeros(shape, dtype=updates.dtype) 11 | y = _scatter_nd_impl(data, indices, updates, reduction=reduction) 12 | return (y,) 13 | -------------------------------------------------------------------------------- /onnx_extended/tools/__init__.py: -------------------------------------------------------------------------------- 1 | from .onnx_io import save_model, load_external, load_model 2 | from .onnx_nodes import enumerate_model_tensors 3 | -------------------------------------------------------------------------------- /onnx_extended/tools/einsum/__init__.py: -------------------------------------------------------------------------------- 1 | from .einsum_bench import einsum_benchmark 2 | from .einsum_fct import einsum, optimize_decompose_einsum_equation 3 | from .einsum_impl import decompose_einsum_equation, apply_einsum_sequence 4 | from .einsum_impl_classes import EinsumSubOp, GraphEinsumSubOp 5 | from .einsum_impl_ext import ( 6 | numpy_extended_dot, 7 | numpy_extended_dot_python, 8 | numpy_extended_dot_matrix, 9 | numpy_diagonal, 10 | ) 11 | -------------------------------------------------------------------------------- /onnx_extended/tools/einsum/einsum_config.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | import numpy 3 | import onnx 4 | 5 | DEFAULT_OPSET = min(18, onnx.defs.onnx_opset_version()) 6 | DEFAULT_IR_VERSION = 8 7 | 8 | 9 | def guess_proto_dtype(dtype: Any) -> int: 10 | """ 11 | Returns the corresponding proto type for a numpy dtype. 
12 | """ 13 | if dtype == numpy.float32: 14 | return onnx.TensorProto.FLOAT 15 | if dtype == numpy.float64: 16 | return onnx.TensorProto.DOUBLE 17 | if dtype == numpy.int32: 18 | return onnx.TensorProto.INT32 19 | if dtype == numpy.int64: 20 | return onnx.TensorProto.INT64 21 | raise ValueError(f"Unexpected value for dtype {dtype!r}.") 22 | -------------------------------------------------------------------------------- /onnx_extended/tools/graph/__init__.py: -------------------------------------------------------------------------------- 1 | from .errors import QuantizationError 2 | from .onnx_graph_struct import Graph 3 | from .onnx_graph_transformer import cast_constant, quantize_float8, QuantizeOptions 4 | -------------------------------------------------------------------------------- /onnx_extended/tools/graph/errors.py: -------------------------------------------------------------------------------- 1 | class QuantizationError(RuntimeError): 2 | """ 3 | Raised when a model or a node cannot be quantized. 4 | """ 5 | -------------------------------------------------------------------------------- /onnx_extended/validation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/validation/__init__.py -------------------------------------------------------------------------------- /onnx_extended/validation/cpu/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/validation/cpu/cpu_fpemu.hpp: -------------------------------------------------------------------------------- 1 | // This file includes some pieces taken from 2 | // https://github.com/IntelLabs/FP8-Emulation-Toolkit/blob/main/mpemu/pytquant/cuda/fpemu_kernels.cu 3 | // with the following license. 4 | // 5 | /*----------------------------------------------------------------------------* 6 | * Copyright (c) 2023, Intel Corporation - All rights reserved. 
--------------------------------------------------------------------------------
/onnx_extended/tools/graph/__init__.py:
--------------------------------------------------------------------------------
from .errors import QuantizationError
from .onnx_graph_struct import Graph
from .onnx_graph_transformer import cast_constant, quantize_float8, QuantizeOptions
--------------------------------------------------------------------------------
/onnx_extended/tools/graph/errors.py:
--------------------------------------------------------------------------------
class QuantizationError(RuntimeError):
    """
    Raised when a model or a node cannot be quantized.
    """
--------------------------------------------------------------------------------
/onnx_extended/validation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sdpython/onnx-extended/eebca3c82b615f841608f74131900738b7f7f845/onnx_extended/validation/__init__.py
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/cpu_fpemu.hpp:
--------------------------------------------------------------------------------
// This file includes some pieces taken from
// https://github.com/IntelLabs/FP8-Emulation-Toolkit/blob/main/mpemu/pytquant/cuda/fpemu_kernels.cu
// with the following license.
//
/*----------------------------------------------------------------------------*
 * Copyright (c) 2023, Intel Corporation - All rights reserved.
 * This file is part of FP8-Emulation-Toolkit
 *
 * SPDX-License-Identifier: BSD-3-Clause
 *----------------------------------------------------------------------------*
 * Naveen Mellempudi (Intel Corporation)
 *----------------------------------------------------------------------------*/

#pragma once

#if defined(__SSSE3__)

#include <immintrin.h>

#endif

namespace cpu_fpemu {

#if defined(__SSSE3__)

inline float __double2float_rn(double inval) {
  float out[4] = {0};
  __m128 vout = _mm_cvtpd_ps(_mm_set1_pd(inval));

  _mm_store_ps(&out[0], vout);
  return out[0];
}

#ifdef _WIN32

inline unsigned short __float2half_rn(float inval) {
  __m128i m = _mm_cvtps_ph(_mm_set_ss(inval), (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
  return _mm_extract_epi16(m, 0);
}

inline float __half2float(unsigned short h_val) {
  __m128i m = _mm_cvtsi32_si128(h_val);
  return _mm_cvtss_f32(_mm_cvtph_ps(m));
}

#else

inline unsigned short __float2half_rn(float inval) {
  return _cvtss_sh(inval, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
}

inline float __half2float(unsigned short h_val) { return _cvtsh_ss(h_val); }

#endif

#endif

} // namespace cpu_fpemu
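These intrinsics reproduce CUDA's `__float2half_rn`/`__half2float` on the CPU. numpy's
IEEE half type applies the same round-to-nearest-even rule, which gives a quick
cross-check:

    import numpy as np

    f = np.float32(3.14159)
    h = np.float16(f)     # round-to-nearest-even, like _cvtss_sh
    print(np.float32(h))  # 3.140625, the nearest representable half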
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/murmur_hash3.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>

namespace validation {
namespace sklearn {

void MurmurHash3_x86_32(const void *key, int len, uint32_t seed, void *out);

void MurmurHash3_x86_128(const void *key, int len, uint32_t seed, void *out);

void MurmurHash3_x64_128(const void *key, int len, uint32_t seed, void *out);

} // namespace sklearn
} // namespace validation
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/speed_metrics.h:
--------------------------------------------------------------------------------
#pragma once

#include <cstddef>
#include <cstdint>
#include <vector>

namespace validation {

#if defined(_WIN32)

inline bool _isnan_(float x) { return _isnanf(x); }
inline bool _isnan_(double x) { return _isnan(x); }

#else

// See
// https://stackoverflow.com/questions/2249110/how-do-i-make-a-portable-isnan-isinf-function
inline bool _isnan_(double x) {
  union {
    uint64_t u;
    double f;
  } ieee754;
  ieee754.f = x;
  return ((unsigned)(ieee754.u >> 32) & 0x7fffffff) + ((unsigned)ieee754.u != 0) > 0x7ff00000;
}

inline bool _isnan_(float x) {
  uint32_t *pv = reinterpret_cast<uint32_t *>(&x);
  uint32_t b = *pv;
  return (b & 0x7fc00000) == 0x7fc00000;
}

#endif

typedef struct ElementTime {
  int64_t trial;
  int64_t row;
  double time;
  inline ElementTime() {}
  inline ElementTime(int64_t n, int64_t r, double t) {
    trial = n;
    row = r;
    time = t;
  }
} ElementTime;

double benchmark_cache(int64_t arr_size, bool verbose);
std::vector<ElementTime> benchmark_cache_tree(int64_t n_rows, int64_t n_features,
                                              int64_t n_trees, int64_t tree_size,
                                              int64_t max_depth, int64_t search_step = 64);

} // namespace validation
--------------------------------------------------------------------------------
/onnx_extended/validation/cpu/vector_sparse.h:
--------------------------------------------------------------------------------
#pragma once

#include "common/sparse_tensor.h"

#include <cstddef>
#include <cstdint>
#include <vector>

#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>

#define py_array_float py::array_t<float>
#define py_array_uint32 py::array_t<uint32_t>

namespace py = pybind11;

namespace validation {

py::tuple sparse_struct_indices_values(const py_array_float &v);

py_array_float sparse_struct_to_dense(const py_array_float &v);

py_array_float dense_to_sparse_struct(const py_array_float &v);

py::list sparse_struct_to_maps(const py_array_float &v);

py::tuple sparse_struct_to_csr(const py_array_float &v);

std::vector<std::tuple<double, double>> evaluate_sparse(const float *v, int64_t n_rows,
                                                        int64_t n_cols, int random,
                                                        int ntimes, int repeat,
                                                        int test);

} // namespace validation
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/__init__.py:
--------------------------------------------------------------------------------
def cuda_version() -> str:
    """
    Returns the CUDA version it was compiled with.
    If CUDA was not available, it returns `"0.0"`.
    """
    try:
        from .cuda_example_py import cuda_version as cv
    except ImportError:
        # No CUDA
        return "0.0"
    v = cv()
    major = v // 1000
    minor = (v % 1000) // 10
    return f"{major}.{minor}"
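The integer returned by the binding follows the CUDA convention
`major * 1000 + minor * 10`, hence the decoding above; for example:

    v = 11080  # what cuda_example_py.cuda_version() returns for CUDA 11.8
    print(f"{v // 1000}.{(v % 1000) // 10}")  # -> "11.8"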
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_fpemu.cuh:
--------------------------------------------------------------------------------
#pragma once

#include <cuda_runtime.h>

namespace cuda_fpemu {

enum FpemuMode {
  E4M3_RNE = 1,
};

void fpemu_cuda_forward(const int size, const float *input, float *output, FpemuMode mode,
                        bool inplace, float scale, bool block_norm, int block_size,
                        int cuda_device);

} // namespace cuda_fpemu
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_gemm.cuh:
--------------------------------------------------------------------------------
#pragma once

#include <cstdint>
#include <string>
#include <unordered_map>

namespace cuda_example {

struct BenchmarkGemm {
  int64_t N;
  double workspace_new;
  double workspace_free;
  double stream_create;
  double stream_destroy;
  double setup;
  double clean;
  double gemm;
  double gemm_in;
  double gemm_sync;
  double total;
  BenchmarkGemm();
  void zero();
  void to_map(std::unordered_map<std::string, double> &bench);
};

std::unordered_map<std::string, double> gemm_benchmark_test(int test, int N, int m, int n,
                                                            int k, int lda, int ldb, int ldd);

} // namespace cuda_example
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_nvtx.cuh:
--------------------------------------------------------------------------------
#pragma once

#if defined(ENABLE_NVTX)
#include <nvtx3/nvtx3.hpp>
#define NVTX_SCOPE(msg) nvtx3::scoped_range r{msg};
#else
#define NVTX_SCOPE(msg)
#endif
--------------------------------------------------------------------------------
/onnx_extended/validation/cuda/cuda_tensor.cuh:
--------------------------------------------------------------------------------
#pragma once
#include "cuda_nvtx.cuh"
#include "cuda_utils.h"
#include <cstddef>
#include <cstdint>
#include <cublas_v2.h>
#include <cuda_runtime.h>

namespace cuda_example {

typedef enum TensorDevice { CPU = 0, CUDA = 1 } TensorDevice;

bool is_fp8_dtype(cudaDataType_t dtype);

int32_t type_size(cudaDataType_t element_type);

inline cudaDataType_t get_cuda_dtype(cudaDataType_t dtype) { return dtype; }

struct TensorData {
  TensorDevice device;
  cudaDataType_t dtype;
  std::size_t size;
  void *dptr;
  inline TensorData() {
    device = TensorDevice::CPU;
    size = 0;
    dptr = nullptr;
    dtype = CUDA_R_32F;
  }
  void allocate(cudaDataType_t dtype, std::size_t size, TensorDevice device);
  void free();
  void copy_from_cpu(void *ptr);
};

class Tensor {
public:
  const char *name;
  TensorData data;
  TensorData scale;
  TensorData amax;
  TensorData scale_inv;

public:
  inline Tensor(const char *name) : data(), scale(), amax(), scale_inv() { this->name = name; }
  Tensor(const char *name, std::size_t size, cudaDataType_t dtype = CUDA_R_32F,
         TensorDevice device = TensorDevice::CUDA,
         TensorDevice scale_device = TensorDevice::CUDA);
  ~Tensor();
  void rnd();
};

} // namespace cuda_example
") + std::string(__VA_ARGS__)); \ 16 | } \ 17 | } while (false) 18 | 19 | #define NVTE_CHECK_CUDA(ans) \ 20 | { \ 21 | auto status = ans; \ 22 | NVTE_CHECK(status == cudaSuccess, \ 23 | "CUDA Error: " + std::string(cudaGetErrorString(status))); \ 24 | } 25 | 26 | #define NVTE_CHECK_CUBLAS(ans) \ 27 | { \ 28 | auto status = ans; \ 29 | NVTE_CHECK(status == CUBLAS_STATUS_SUCCESS, \ 30 | "CUBLAS Error: " + std::string(cublasGetStatusString(status))); \ 31 | } 32 | 33 | #define checkCudaErrors(val) _check_cuda((val), #val, __FILE__, __LINE__) 34 | 35 | template 36 | void _check_cuda(T err, const char *const func, const char *const file, const int line) { 37 | if (err != cudaSuccess) { 38 | throw std::runtime_error(onnx_extended_helpers::MakeString( 39 | "CUDA error at: ", file, ":", line, "\n", cudaGetErrorString(err), " ", func, "\n")); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /onnx_extended/validation/cython/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /onnx_extended/validation/cython/fp8.pyx: -------------------------------------------------------------------------------- 1 | import numpy 2 | cimport numpy as cnumpy 3 | cimport cython 4 | from libcpp cimport bool 5 | from cython.cimports.libc.stdint import uint8_t, int64_t 6 | 7 | # numpy.import_array() 8 | 9 | 10 | cdef extern from "cpu/cast_fp8.h": 11 | void float_to_e4m3fn(int64_t n, const float* src, uint8_t* dst, bool saturate) nogil 12 | void e4m3fn_to_float(int64_t n, const uint8_t* src, float* dst) nogil 13 | 14 | 15 | @cython.boundscheck(False) 16 | @cython.wraparound(False) 17 | @cython.nonecheck(False) 18 | def cast_float32_to_e4m3fn(m, bool saturate = True): 19 | """ 20 | Converts an array from float to float 8 e4m3fn. 21 | 22 | :param m: any array 23 | :param saturate: saturate the conversion 24 | :return: casted array 25 | """ 26 | cdef cnumpy.ndarray cm = numpy.ascontiguousarray(m) 27 | cdef cnumpy.ndarray res = numpy.empty(m.shape, dtype=numpy.uint8) 28 | cdef int64_t n = m.size 29 | cdef const float* src = cm.data 30 | cdef uint8_t* dst = res.data 31 | with nogil: 32 | float_to_e4m3fn(n, src, dst, saturate) 33 | return res 34 | 35 | 36 | @cython.boundscheck(False) 37 | @cython.wraparound(False) 38 | @cython.nonecheck(False) 39 | def cast_e4m3fn_to_float32(m): 40 | """ 41 | Converts an array from float 8 e4m3fn to float. 
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
black
clang-format
cmakelang
coverage
cython>=3.0.10
cython-lint
flake8
furo; sys_platform == 'linux'
google-re2
isort
lightgbm
matplotlib
ml-dtypes
onnx-array-api
onnxmltools
onnxruntime>=1.21.0
openpyxl
opt_einsum
packaging
pandas
Pillow
psutil
pytest
pytest-cov
pytest-subtests
rstcheck[sphinx,toml]
ruff
scikit-learn>=1.5
skl2onnx>=1.14.1
sphinx>=8; sys_platform == 'linux'
sphinx-gallery; sys_platform == 'linux'
sphinx-issues; sys_platform == 'linux'
git+https://github.com/sdpython/sphinx-runpython.git
toml; python_version < '3.11'
tomli
tqdm
wheel
xgboost
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
numpy>=2.0
onnx>=1.17.0
scipy>=1.13.1
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
[project]
requires-python = ">=3.9"

[options]
packages = find:

[options.packages.find]
include = onnx_extended*
--------------------------------------------------------------------------------