The response has been limited to 50k tokens of the smallest files in the repo. You can remove this limitation by removing the max tokens filter.
├── .clang-format
├── .clang-ignorelist
├── .clang-tidy
├── .gitattributes
├── .github
    ├── CODEOWNERS
    ├── ISSUE_TEMPLATE
    │   ├── bug_report.md
    │   ├── documentation.md
    │   ├── feature_request.md
    │   └── question.md
    ├── automation
    │   ├── aarch64
    │   │   ├── build.sh
    │   │   ├── build_acl.sh
    │   │   ├── ci.json
    │   │   ├── common.sh
    │   │   ├── skipped-tests.sh
    │   │   └── test.sh
    │   ├── clang-format.sh
    │   ├── commit-msg-check.py
    │   ├── performance
    │   │   ├── bench_nightly_performance.sh
    │   │   ├── bench_pr_performance.sh
    │   │   ├── benchdnn_comparison.py
    │   │   └── inputs
    │   │   │   ├── conv
    │   │   │   ├── conv_nightly
    │   │   │   ├── eltwise
    │   │   │   ├── eltwise_nightly
    │   │   │   ├── matmul
    │   │   │   ├── matmul_nightly
    │   │   │   ├── reorder
    │   │   │   └── reorder_nightly
    │   └── x64
    │   │   └── build_linters.sh
    ├── azure
    │   ├── build.bat
    │   ├── build.sh
    │   ├── ci-x64.yml
    │   ├── env
    │   │   └── clang.sh
    │   ├── test.bat
    │   └── test.sh
    ├── codeql-config.yml
    ├── dependabot.yml
    ├── labels.yml
    ├── pull_request_template.md
    └── workflows
    │   ├── aarch64-acl.yml
    │   ├── ci-aarch64.yml
    │   ├── clang-tidy.yml
    │   ├── codeql.yml
    │   ├── labeler.yml
    │   ├── nightly-aarch64.yml
    │   ├── openssf-scorecard.yml
    │   ├── performance-aarch64.yml
    │   └── pr-linter.yml
├── .gitignore
├── CITATION.cff
├── CMakeLists.txt
├── CODE_OF_CONDUCT.md
├── CODING_STANDARDS.md
├── CONTRIBUTING.md
├── LICENSE
├── MAINTAINERS.md
├── README.binary.in
├── README.md
├── SECURITY.md
├── THIRD-PARTY-PROGRAMS
├── cmake
    ├── ACL.cmake
    ├── Doxygen.cmake
    ├── Doxyrest.cmake
    ├── FindACL.cmake
    ├── FindBLAS.cmake
    ├── FindDoxyrest.cmake
    ├── FindHIP.cmake
    ├── FindMIOpen.cmake
    ├── FindOpenCL.cmake
    ├── FindPI_CUDA.cmake
    ├── FindSphinx.cmake
    ├── FindcuBLAS.cmake
    ├── FindcuDNN.cmake
    ├── FindcublasLt.cmake
    ├── FindrocBLAS.cmake
    ├── OpenCL.cmake
    ├── OpenMP.cmake
    ├── SDL.cmake
    ├── SYCL.cmake
    ├── Sphinx.cmake
    ├── TBB.cmake
    ├── Threading.cmake
    ├── Threadpool.cmake
    ├── blas.cmake
    ├── build_types.cmake
    ├── config.cmake.in
    ├── configuring_primitive_list.cmake
    ├── coverage.cmake
    ├── dnnl_compat.cmake
    ├── doc.cmake
    ├── dpcpp_driver_check.cmake
    ├── gen_gpu_kernel.cmake
    ├── gen_gpu_kernel_list.cmake
    ├── host_compiler.cmake
    ├── host_compiler_id.cmake
    ├── host_compiler_id.cpp
    ├── lnx
    │   └── TBBConfig.cmake
    ├── mac
    │   └── TBBConfig.cmake
    ├── options.cmake
    ├── platform.cmake
    ├── run_with_env.bat.in
    ├── template.vcxproj.user
    ├── testing.cmake
    ├── utils.cmake
    ├── version.cmake
    ├── version.rc.in
    └── win
    │   └── TBBConfig.cmake
├── doc
    ├── Doxyfile.in
    ├── advanced
    │   ├── design
    │   │   ├── mem_fmt_blk.png
    │   │   ├── mem_fmt_img1.png
    │   │   ├── mem_fmt_img2.png
    │   │   ├── mem_fmt_padded_blk.png
    │   │   └── strides.png
    │   ├── dpcpp_interoperability.md
    │   ├── experimental.md
    │   ├── int8_computations.md
    │   ├── opencl_interoperability.md
    │   ├── persistent_cache.md
    │   ├── primitive_cache.md
    │   ├── sparsity.md
    │   ├── threadpool.md
    │   ├── transition-to-dnnl.md
    │   └── understanding_memory_formats.md
    ├── build
    │   ├── build.md
    │   ├── build_options.md
    │   ├── link.md
    │   └── system_requirements.md
    ├── doxyrest
    │   ├── doxyrest-config.lua
    │   └── frame
    │   │   ├── cfamily
    │   │       ├── class.rst.in
    │   │       ├── compound.rst.in
    │   │       ├── crefdb.py.in
    │   │       ├── crefdb_enums.py.in
    │   │       ├── crefdb_items.py.in
    │   │       ├── crefdb_members.py.in
    │   │       ├── details_aliases.rst.in
    │   │       ├── details_construction.rst.in
    │   │       ├── details_defines.rst.in
    │   │       ├── details_events.rst.in
    │   │       ├── details_functions.rst.in
    │   │       ├── details_properties.rst.in
    │   │       ├── details_typedefs.rst.in
    │   │       ├── details_unnamed_enum_values.rst.in
    │   │       ├── details_variables.rst.in
    │   │       ├── enum.rst.in
    │   │       ├── example.rst.in
    │   │       ├── global.rst.in
    │   │       ├── index.rst.in
    │   │       ├── namespace.rst.in
    │   │       ├── overview_aliases.rst.in
    │   │       ├── overview_classes.rst.in
    │   │       ├── overview_common.rst.in
    │   │       ├── overview_compound.rst.in
    │   │       ├── overview_construction.rst.in
    │   │       ├── overview_defines.rst.in
    │   │       ├── overview_enums.rst.in
    │   │       ├── overview_events.rst.in
    │   │       ├── overview_functions.rst.in
    │   │       ├── overview_members.rst.in
    │   │       ├── overview_namespaces.rst.in
    │   │       ├── overview_properties.rst.in
    │   │       ├── overview_typedefs.rst.in
    │   │       ├── overview_variables.rst.in
    │   │       ├── scope_class.rst.in
    │   │       ├── scope_namespace.rst.in
    │   │       └── utils.lua
    │   │   └── common
    │   │       ├── crefdb_external.py.in
    │   │       ├── crefdb_groups.py.in
    │   │       ├── crefdb_items.py.in
    │   │       ├── details.rst.in
    │   │       ├── doc.lua
    │   │       ├── footnotes.rst.in
    │   │       ├── group.rst.in
    │   │       ├── index_main.rst.in
    │   │       ├── item.lua
    │   │       ├── page.rst.in
    │   │       ├── string.lua
    │   │       ├── table.lua
    │   │       └── toc.lua
    ├── environment.yml
    ├── examples.md
    ├── graph
    │   ├── constant_tensor_cache.md
    │   ├── fusion_patterns
    │   │   ├── binary_fusion_patterns.md
    │   │   ├── convolution_fusion_patterns.md
    │   │   ├── convtranspose_fusions_patterns.md
    │   │   ├── gated_mlp.md
    │   │   ├── gqa.md
    │   │   ├── images
    │   │   │   ├── binary_pattern.png
    │   │   │   ├── compressed_sdpa_pattern.png
    │   │   │   ├── conv_bwd_pattern.png
    │   │   │   ├── conv_pattern.png
    │   │   │   ├── convtranspose_pattern.png
    │   │   │   ├── epilogue_subgraph_conv.png
    │   │   │   ├── epilogue_subgraph_general_1.png
    │   │   │   ├── epilogue_subgraph_general_2.png
    │   │   │   ├── epilogue_subgraph_matmul.png
    │   │   │   ├── f2f_conversion.png
    │   │   │   ├── f2q_conversion_general.png
    │   │   │   ├── f2q_conversion_quantized_conv.png
    │   │   │   ├── f2q_conversion_quantized_matmul.png
    │   │   │   ├── f2q_conversion_softmax.png
    │   │   │   ├── fp-gated-mlp.png
    │   │   │   ├── gated-mlp-swish.png
    │   │   │   ├── gqa.png
    │   │   │   ├── interpolate_pattern.png
    │   │   │   ├── matmul_pattern.png
    │   │   │   ├── norm_pattern.png
    │   │   │   ├── pool_pattern.png
    │   │   │   ├── q2f_conversion_quantized_conv_matmul.png
    │   │   │   ├── q2f_conversion_quantized_convtranspose.png
    │   │   │   ├── quantized_conv_pattern.png
    │   │   │   ├── quantized_convtranspose_pattern.png
    │   │   │   ├── quantized_matmul_pattern.png
    │   │   │   ├── reduction_pattern.png
    │   │   │   ├── sdpa-mask-1.png
    │   │   │   ├── sdpa-mask-2.png
    │   │   │   ├── sdpa-mask-3.png
    │   │   │   ├── sdpa-mask-4.png
    │   │   │   ├── sdpa-reorder.png
    │   │   │   ├── sdpa.png
    │   │   │   ├── softmax_pattern.png
    │   │   │   └── unary_pattern.png
    │   │   ├── interpolate_fusion_patterns.md
    │   │   ├── matmul_fusion_patterns.md
    │   │   ├── norm_fusion_patterns.md
    │   │   ├── pool_fusion_patterns.md
    │   │   ├── quantized_convolution_fusion_patterns.md
    │   │   ├── quantized_convtranspose_fusion_patterns.md
    │   │   ├── quantized_matmul_fusion_patterns.md
    │   │   ├── reduction_fusion_patterns.md
    │   │   ├── sdpa.md
    │   │   ├── sdpa_with_compressed_kv.md
    │   │   ├── softmax_fusion_patterns.md
    │   │   └── unary_fusion_patterns.md
    │   ├── graph_dump.md
    │   ├── operations
    │   │   ├── Abs.md
    │   │   ├── AbsBackward.md
    │   │   ├── Add.md
    │   │   ├── AvgPool.md
    │   │   ├── AvgPoolBackward.md
    │   │   ├── BatchNormForwardTraining.md
    │   │   ├── BatchNormInference.md
    │   │   ├── BatchNormTrainingBackward.md
    │   │   ├── BiasAdd.md
    │   │   ├── BiasAddBackward.md
    │   │   ├── Clamp.md
    │   │   ├── ClampBackward.md
    │   │   ├── Concat.md
    │   │   ├── ConvTranspose.md
    │   │   ├── ConvTransposeBackwardData.md
    │   │   ├── ConvTransposeBackwardWeights.md
    │   │   ├── Convolution.md
    │   │   ├── ConvolutionBackwardData.md
    │   │   ├── ConvolutionBackwardWeights.md
    │   │   ├── Dequantize.md
    │   │   ├── Divide.md
    │   │   ├── DynamicDequantize.md
    │   │   ├── DynamicQuantize.md
    │   │   ├── Elu.md
    │   │   ├── EluBackward.md
    │   │   ├── End.md
    │   │   ├── Exp.md
    │   │   ├── GELU.md
    │   │   ├── GELUBackward.md
    │   │   ├── GenIndex.md
    │   │   ├── GreaterEqual.md
    │   │   ├── GroupNorm.md
    │   │   ├── HardSigmoid.md
    │   │   ├── HardSigmoidBackward.md
    │   │   ├── HardSwish.md
    │   │   ├── HardSwishBackward.md
    │   │   ├── Interpolate.md
    │   │   ├── InterpolateBackward.md
    │   │   ├── LayerNorm.md
    │   │   ├── LayerNormBackward.md
    │   │   ├── LeakyReLU.md
    │   │   ├── Log.md
    │   │   ├── LogSoftmax.md
    │   │   ├── LogSoftmaxBackward.md
    │   │   ├── MatMul.md
    │   │   ├── MaxPool.md
    │   │   ├── MaxPoolBackward.md
    │   │   ├── Maximum.md
    │   │   ├── Minimum.md
    │   │   ├── Mish.md
    │   │   ├── MishBackward.md
    │   │   ├── Multiply.md
    │   │   ├── PReLU.md
    │   │   ├── PReLUBackward.md
    │   │   ├── Pow.md
    │   │   ├── Quantize.md
    │   │   ├── ReLU.md
    │   │   ├── ReLUBackward.md
    │   │   ├── Reciprocal.md
    │   │   ├── ReduceL1.md
    │   │   ├── ReduceL2.md
    │   │   ├── ReduceMax.md
    │   │   ├── ReduceMean.md
    │   │   ├── ReduceMin.md
    │   │   ├── ReduceProd.md
    │   │   ├── ReduceSum.md
    │   │   ├── Reorder.md
    │   │   ├── Round.md
    │   │   ├── Select.md
    │   │   ├── Sigmoid.md
    │   │   ├── SigmoidBackward.md
    │   │   ├── SoftPlus.md
    │   │   ├── SoftPlusBackward.md
    │   │   ├── Softmax.md
    │   │   ├── SoftmaxBackward.md
    │   │   ├── Sqrt.md
    │   │   ├── SqrtBackward.md
    │   │   ├── Square.md
    │   │   ├── SquaredDifference.md
    │   │   ├── StaticReshape.md
    │   │   ├── StaticTranspose.md
    │   │   ├── Subtract.md
    │   │   ├── Tanh.md
    │   │   ├── TanhBackward.md
    │   │   ├── TypeCast.md
    │   │   └── Wildcard.md
    │   ├── programming_model
    │   │   ├── graph_basic_concepts.md
    │   │   ├── images
    │   │   │   ├── bf16_programming.jpg
    │   │   │   ├── img_graph_programming_model.png
    │   │   │   └── int8_programming.jpg
    │   │   └── low_precision.md
    │   └── rst
    │   │   ├── graph_fusion_patterns.rst
    │   │   ├── graph_programming_model.rst
    │   │   ├── graph_supported_operations.rst
    │   │   └── images
    │   │       └── other_pattern.png
    ├── legal_information.md
    ├── mainpage.md
    ├── naming_conventions.md
    ├── performance_considerations
    │   ├── benchdnn.md
    │   ├── cpu_isa_hints.md
    │   ├── dispatcher_control.md
    │   ├── inspecting_jit.md
    │   ├── perf_settings.md
    │   ├── profilers.md
    │   ├── verbose.md
    │   ├── verbose_table.md
    │   └── vtune.md
    ├── primitives
    │   ├── batch_normalization.md
    │   ├── binary.md
    │   ├── concat.md
    │   ├── convolution.md
    │   ├── eltwise.md
    │   ├── group_normalization.md
    │   ├── images
    │   │   └── unrolled_stack_rnn.jpg
    │   ├── inner_product.md
    │   ├── layer_normalization.md
    │   ├── lrn.md
    │   ├── matmul.md
    │   ├── pooling.md
    │   ├── prelu.md
    │   ├── reduction.md
    │   ├── reorder.md
    │   ├── resampling.md
    │   ├── rnn.md
    │   ├── shuffle.md
    │   ├── softmax.md
    │   └── sum.md
    ├── programming_model
    │   ├── api.md
    │   ├── attributes.md
    │   ├── attributes_accumulation_mode.md
    │   ├── attributes_deterministic.md
    │   ├── attributes_dropout.md
    │   ├── attributes_fpmath_mode.md
    │   ├── attributes_post_ops.md
    │   ├── attributes_quantization.md
    │   ├── attributes_rounding_mode.md
    │   ├── attributes_scratchpad.md
    │   ├── basic_concepts.md
    │   ├── data_types.md
    │   ├── images
    │   │   ├── img_depthwise_fusion.jpg
    │   │   ├── img_dnnl_object_snapshot.jpg
    │   │   ├── img_dnnl_programming_flow.jpg
    │   │   ├── img_overview_flow.jpg
    │   │   └── img_programming_model.png
    │   └── inference_and_training_aspects.md
    ├── rst
    │   ├── advanced_topics.rst
    │   ├── build_and_link.rst
    │   ├── dev_guide_examples.rst
    │   ├── graph_extension.rst
    │   ├── index.rst
    │   ├── interop_with_dpcpp_and_opencl.rst
    │   ├── orphans.rst
    │   ├── performance_profiling_and_inspection.rst
    │   ├── programming_model.rst
    │   ├── supported_primitives.rst
    │   └── ukernels.rst
    ├── sphinx
    │   ├── _static
    │   │   ├── dnnl.js
    │   │   ├── doxyrest_code_copy_button.js
    │   │   ├── favicons.png
    │   │   └── oneAPI-rgb-rev-100.png
    │   ├── cleanup.py
    │   └── conf.py
    ├── ukernel
    │   ├── operations
    │   │   ├── brgemm.md
    │   │   └── transform.md
    │   └── programming_model
    │   │   └── ukernel_basic_concepts.md
    └── usage_models
    │   ├── images
    │       ├── img_bf16_diagram.png
    │       ├── img_diagram.png
    │       ├── img_inference_scope.jpg
    │       ├── img_multiscalar.png
    │       ├── img_singlescalar.png
    │       └── img_training_inference_scope.jpg
    │   ├── inference.md
    │   ├── inference_int8.md
    │   ├── training.md
    │   └── training_bf16.md
├── examples
    ├── CMakeLists.txt
    ├── CMakeLists.txt.in
    ├── bnorm_u8_via_binary_postops.cpp
    ├── cnn_inference_f32.c
    ├── cnn_inference_f32.cpp
    ├── cnn_inference_int8.cpp
    ├── cnn_training_bf16.cpp
    ├── cnn_training_f32.cpp
    ├── cpu_cnn_training_f32.c
    ├── cpu_matmul_coo.cpp
    ├── cpu_matmul_csr.cpp
    ├── cpu_matmul_weights_compression.cpp
    ├── cpu_rnn_inference_f32.cpp
    ├── cpu_rnn_inference_int8.cpp
    ├── cross_engine_reorder.c
    ├── cross_engine_reorder.cpp
    ├── example_utils.h
    ├── example_utils.hpp
    ├── getting_started.cpp
    ├── gpu_opencl_interop.cpp
    ├── graph
    │   ├── cpu_getting_started.cpp
    │   ├── cpu_inference_int8.cpp
    │   ├── cpu_single_op_partition.cpp
    │   ├── gated_mlp.cpp
    │   ├── gated_mlp_int4.cpp
    │   ├── gated_mlp_wei_combined.cpp
    │   ├── gpu_opencl_getting_started.cpp
    │   ├── gqa.cpp
    │   ├── graph_example_utils.hpp
    │   ├── mqa.cpp
    │   ├── sdpa.cpp
    │   ├── sdpa_stacked_qkv.cpp
    │   ├── sycl_getting_started.cpp
    │   └── sycl_single_op_partition.cpp
    ├── matmul_perf.cpp
    ├── memory_format_propagation.cpp
    ├── performance_profiling.cpp
    ├── primitives
    │   ├── augru.cpp
    │   ├── batch_normalization.cpp
    │   ├── binary.cpp
    │   ├── concat.cpp
    │   ├── convolution.cpp
    │   ├── deconvolution.cpp
    │   ├── eltwise.cpp
    │   ├── group_normalization.cpp
    │   ├── inner_product.cpp
    │   ├── layer_normalization.cpp
    │   ├── lbr_gru.cpp
    │   ├── lrn.cpp
    │   ├── lstm.cpp
    │   ├── matmul.cpp
    │   ├── pooling.cpp
    │   ├── prelu.cpp
    │   ├── reduction.cpp
    │   ├── reorder.cpp
    │   ├── resampling.cpp
    │   ├── shuffle.cpp
    │   ├── softmax.cpp
    │   ├── sum.cpp
    │   └── vanilla_rnn.cpp
    ├── rnn_training_f32.cpp
    ├── sycl_interop_buffer.cpp
    ├── sycl_interop_usm.cpp
    ├── tutorials
    │   └── matmul
    │   │   ├── cpu_matmul_quantization.cpp
    │   │   ├── cpu_sgemm_and_matmul.cpp
    │   │   ├── inference_int8_matmul.cpp
    │   │   └── weights_decompression_matmul.cpp
    └── ukernels
    │   └── cpu_brgemm.cpp
├── include
    ├── dnnl.h
    ├── dnnl.hpp
    ├── dnnl_config.h
    ├── dnnl_debug.h
    ├── dnnl_ocl.h
    ├── dnnl_ocl.hpp
    ├── dnnl_sycl.h
    ├── dnnl_sycl.hpp
    ├── dnnl_sycl_types.h
    ├── dnnl_threadpool.h
    ├── dnnl_threadpool.hpp
    ├── dnnl_threadpool_iface.hpp
    ├── dnnl_types.h
    ├── dnnl_version.h
    └── oneapi
    │   └── dnnl
    │       ├── dnnl.h
    │       ├── dnnl.hpp
    │       ├── dnnl_common.h
    │       ├── dnnl_common.hpp
    │       ├── dnnl_common_types.h
    │       ├── dnnl_config.h.in
    │       ├── dnnl_debug.h
    │       ├── dnnl_graph.h
    │       ├── dnnl_graph.hpp
    │       ├── dnnl_graph_ocl.h
    │       ├── dnnl_graph_ocl.hpp
    │       ├── dnnl_graph_sycl.h
    │       ├── dnnl_graph_sycl.hpp
    │       ├── dnnl_graph_types.h
    │       ├── dnnl_ocl.h
    │       ├── dnnl_ocl.hpp
    │       ├── dnnl_ocl_types.h
    │       ├── dnnl_sycl.h
    │       ├── dnnl_sycl.hpp
    │       ├── dnnl_sycl_types.h
    │       ├── dnnl_threadpool.h
    │       ├── dnnl_threadpool.hpp
    │       ├── dnnl_threadpool_iface.hpp
    │       ├── dnnl_types.h
    │       ├── dnnl_ukernel.h
    │       ├── dnnl_ukernel.hpp
    │       ├── dnnl_ukernel_types.h
    │       ├── dnnl_version.h.in
    │       └── dnnl_version_hash.h.in
├── pyproject.toml
├── scripts
    ├── README.md
    ├── fix_header_guards.py
    ├── generate_dnnl_debug.py
    ├── generate_format_tags.py
    ├── synthdnn
    │   ├── README.md
    │   ├── matmul
    │   │   ├── primitive.py
    │   │   └── sampler.py
    │   └── synthdnn.py
    └── verbose_converter
    │   ├── README.md
    │   ├── src
    │       ├── __init__.py
    │       ├── benchdnn_generator.py
    │       ├── breakdown_generator.py
    │       ├── dnnl_parser.py
    │       ├── ir.py
    │       ├── parse.py
    │       └── utils.py
    │   ├── tests
    │       ├── README.md
    │       ├── benchdnn_test.py
    │       ├── dataset_ci
    │       └── dataset_simple
    │   └── verbose_converter.py
├── src
    ├── CMakeLists.txt
    ├── common
    │   ├── CMakeLists.txt
    │   ├── batch_normalization.cpp
    │   ├── batch_normalization_pd.hpp
    │   ├── bfloat16.cpp
    │   ├── bfloat16.hpp
    │   ├── binary.cpp
    │   ├── binary_pd.hpp
    │   ├── bit_cast.hpp
    │   ├── broadcast_strategy.cpp
    │   ├── broadcast_strategy.hpp
    │   ├── c_types_map.hpp
    │   ├── cache_blob.hpp
    │   ├── cache_blob_id.cpp
    │   ├── cache_blob_id.hpp
    │   ├── cache_hit_types.hpp
    │   ├── cache_utils.hpp
    │   ├── compiler_workarounds.hpp
    │   ├── concat.cpp
    │   ├── concat.hpp
    │   ├── concat_pd.hpp
    │   ├── convolution.cpp
    │   ├── convolution_pd.cpp
    │   ├── convolution_pd.hpp
    │   ├── counting_barrier.hpp
    │   ├── cpp_compat.hpp
    │   ├── deconvolution.cpp
    │   ├── deconvolution_pd.hpp
    │   ├── dnnl_debug.cpp
    │   ├── dnnl_debug_autogenerated.cpp
    │   ├── dnnl_thread.hpp
    │   ├── dnnl_thread_tbb_proxy.hpp
    │   ├── dnnl_threadpool.cpp
    │   ├── dnnl_traits.hpp
    │   ├── eltwise.cpp
    │   ├── eltwise_pd.hpp
    │   ├── engine.cpp
    │   ├── engine.hpp
    │   ├── engine_id.hpp
    │   ├── engine_impl.hpp
    │   ├── experimental.cpp
    │   ├── experimental.hpp
    │   ├── float16.hpp
    │   ├── float4.cpp
    │   ├── float4.hpp
    │   ├── float8.cpp
    │   ├── float8.hpp
    │   ├── fpmath_mode.cpp
    │   ├── gemm.cpp
    │   ├── gemm_pd.hpp
    │   ├── gemm_types.hpp
    │   ├── gemm_utils.hpp
    │   ├── group_normalization.cpp
    │   ├── group_normalization_pd.hpp
    │   ├── impl_list_item.hpp
    │   ├── impl_registration.hpp
    │   ├── inner_product.cpp
    │   ├── inner_product_pd.hpp
    │   ├── int4.hpp
    │   ├── internal_defs.hpp
    │   ├── ittnotify.cpp
    │   ├── ittnotify.hpp
    │   ├── kernel_cache.cpp
    │   ├── kernel_cache.hpp
    │   ├── layer_normalization.cpp
    │   ├── layer_normalization_pd.hpp
    │   ├── logging.cpp
    │   ├── logging.hpp
    │   ├── lrn.cpp
    │   ├── lrn_pd.hpp
    │   ├── math_utils.hpp
    │   ├── matmul.cpp
    │   ├── matmul_pd.hpp
    │   ├── memory.cpp
    │   ├── memory.hpp
    │   ├── memory_debug.cpp
    │   ├── memory_debug.hpp
    │   ├── memory_desc.cpp
    │   ├── memory_desc.hpp
    │   ├── memory_desc_wrapper.cpp
    │   ├── memory_desc_wrapper.hpp
    │   ├── memory_map_manager.hpp
    │   ├── memory_storage.cpp
    │   ├── memory_storage.hpp
    │   ├── memory_tracking.cpp
    │   ├── memory_tracking.hpp
    │   ├── memory_zero_pad.cpp
    │   ├── nstl.hpp
    │   ├── opdesc.hpp
    │   ├── optional.hpp
    │   ├── pooling.cpp
    │   ├── pooling_pd.hpp
    │   ├── prelu.cpp
    │   ├── prelu_pd.hpp
    │   ├── primitive.cpp
    │   ├── primitive.hpp
    │   ├── primitive_attr.cpp
    │   ├── primitive_attr.hpp
    │   ├── primitive_attr_quant.cpp
    │   ├── primitive_attr_quant.hpp
    │   ├── primitive_cache.cpp
    │   ├── primitive_cache.hpp
    │   ├── primitive_desc.hpp
    │   ├── primitive_desc_iface.cpp
    │   ├── primitive_desc_iface.hpp
    │   ├── primitive_desc_iterator.hpp
    │   ├── primitive_exec_types.cpp
    │   ├── primitive_exec_types.hpp
    │   ├── primitive_hashing.cpp
    │   ├── primitive_hashing.hpp
    │   ├── primitive_iface.cpp
    │   ├── primitive_iface.hpp
    │   ├── primitive_serialization.cpp
    │   ├── primitive_serialization.hpp
    │   ├── profiler.hpp
    │   ├── query.cpp
    │   ├── reduction.cpp
    │   ├── reduction_pd.hpp
    │   ├── reorder.cpp
    │   ├── reorder.hpp
    │   ├── reorder_pd.hpp
    │   ├── resampling.cpp
    │   ├── resampling_pd.hpp
    │   ├── resource.hpp
    │   ├── rnn.cpp
    │   ├── rnn.hpp
    │   ├── rnn_pd.hpp
    │   ├── rw_mutex.cpp
    │   ├── rw_mutex.hpp
    │   ├── scratchpad.cpp
    │   ├── scratchpad.hpp
    │   ├── scratchpad_debug.cpp
    │   ├── scratchpad_debug.hpp
    │   ├── sdpa_pd.hpp
    │   ├── sdpa_test_iface.cpp
    │   ├── sdpa_types.hpp
    │   ├── sdpa_utils.hpp
    │   ├── serialization.hpp
    │   ├── shuffle.cpp
    │   ├── shuffle_pd.hpp
    │   ├── softmax.cpp
    │   ├── softmax_pd.hpp
    │   ├── stack_checker.hpp
    │   ├── stream.cpp
    │   ├── stream.hpp
    │   ├── stream_impl.hpp
    │   ├── stream_profiler.cpp
    │   ├── stream_threadpool.cpp
    │   ├── sum.cpp
    │   ├── sum_pd.hpp
    │   ├── tag_traits.hpp
    │   ├── thread_local_storage.hpp
    │   ├── type_helpers.hpp
    │   ├── utils.cpp
    │   ├── utils.hpp
    │   ├── verbose.cpp
    │   ├── verbose.hpp
    │   ├── verbose_msg.hpp
    │   └── z_magic.hpp
    ├── cpu
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── aarch64
    │   │   ├── CMakeLists.txt
    │   │   ├── acl_batch_normalization.cpp
    │   │   ├── acl_batch_normalization.hpp
    │   │   ├── acl_benchmark_scheduler.cpp
    │   │   ├── acl_benchmark_scheduler.hpp
    │   │   ├── acl_binary.cpp
    │   │   ├── acl_binary.hpp
    │   │   ├── acl_convolution_utils.cpp
    │   │   ├── acl_convolution_utils.hpp
    │   │   ├── acl_deconvolution.cpp
    │   │   ├── acl_deconvolution.hpp
    │   │   ├── acl_depthwise_convolution.cpp
    │   │   ├── acl_depthwise_convolution.hpp
    │   │   ├── acl_eltwise.cpp
    │   │   ├── acl_eltwise.hpp
    │   │   ├── acl_gemm_convolution.cpp
    │   │   ├── acl_gemm_convolution.hpp
    │   │   ├── acl_indirect_gemm_convolution.cpp
    │   │   ├── acl_indirect_gemm_convolution.hpp
    │   │   ├── acl_inner_product.cpp
    │   │   ├── acl_inner_product.hpp
    │   │   ├── acl_layer_normalization.cpp
    │   │   ├── acl_layer_normalization.hpp
    │   │   ├── acl_pooling.cpp
    │   │   ├── acl_pooling.hpp
    │   │   ├── acl_post_ops.cpp
    │   │   ├── acl_post_ops.hpp
    │   │   ├── acl_prelu.cpp
    │   │   ├── acl_prelu.hpp
    │   │   ├── acl_reorder.cpp
    │   │   ├── acl_reorder.hpp
    │   │   ├── acl_softmax.cpp
    │   │   ├── acl_softmax.hpp
    │   │   ├── acl_thread.cpp
    │   │   ├── acl_thread.hpp
    │   │   ├── acl_threadpool_scheduler.cpp
    │   │   ├── acl_threadpool_scheduler.hpp
    │   │   ├── acl_utils.cpp
    │   │   ├── acl_utils.hpp
    │   │   ├── acl_winograd_convolution.cpp
    │   │   ├── acl_winograd_convolution.hpp
    │   │   ├── brgemm
    │   │   │   ├── brgemm.cpp
    │   │   │   ├── brgemm.hpp
    │   │   │   ├── brgemm_containers.cpp
    │   │   │   ├── brgemm_containers.hpp
    │   │   │   ├── brgemm_types.hpp
    │   │   │   ├── brgemm_utils.cpp
    │   │   │   ├── brgemm_utils.hpp
    │   │   │   ├── jit_brdgmm_kernel.cpp
    │   │   │   ├── jit_brdgmm_kernel.hpp
    │   │   │   └── jit_brgemm_kernel.cpp
    │   │   ├── cpu_barrier.cpp
    │   │   ├── cpu_barrier.hpp
    │   │   ├── cpu_isa_traits.cpp
    │   │   ├── cpu_isa_traits.hpp
    │   │   ├── cpu_reducer.cpp
    │   │   ├── cpu_reducer.hpp
    │   │   ├── injectors
    │   │   │   ├── injector_utils.cpp
    │   │   │   ├── injector_utils.hpp
    │   │   │   ├── jit_uni_binary_injector.cpp
    │   │   │   ├── jit_uni_binary_injector.hpp
    │   │   │   ├── jit_uni_eltwise_injector.cpp
    │   │   │   ├── jit_uni_eltwise_injector.hpp
    │   │   │   ├── jit_uni_postops_injector.cpp
    │   │   │   └── jit_uni_postops_injector.hpp
    │   │   ├── jit_brdgmm_dw_conv.cpp
    │   │   ├── jit_brdgmm_dw_conv.hpp
    │   │   ├── jit_brgemm_1x1_conv.cpp
    │   │   ├── jit_brgemm_1x1_conv.hpp
    │   │   ├── jit_brgemm_conv.cpp
    │   │   ├── jit_brgemm_conv.hpp
    │   │   ├── jit_brgemm_conv_bwd.cpp
    │   │   ├── jit_brgemm_conv_bwd.hpp
    │   │   ├── jit_brgemm_conv_comp_pad_kernel.cpp
    │   │   ├── jit_brgemm_conv_comp_pad_kernel.hpp
    │   │   ├── jit_brgemm_conv_trans_kernel.cpp
    │   │   ├── jit_brgemm_conv_trans_kernel.hpp
    │   │   ├── jit_brgemm_conv_utils.cpp
    │   │   ├── jit_brgemm_conv_utils.hpp
    │   │   ├── jit_brgemm_post_ops.hpp
    │   │   ├── jit_brgemm_primitive_conf.hpp
    │   │   ├── jit_brgemm_transpose_utils.cpp
    │   │   ├── jit_brgemm_transpose_utils.hpp
    │   │   ├── jit_generator.hpp
    │   │   ├── jit_op_imm_check.hpp
    │   │   ├── jit_primitive_conf.hpp
    │   │   ├── jit_sve_1x1_conv_kernel.cpp
    │   │   ├── jit_sve_1x1_conv_kernel.hpp
    │   │   ├── jit_sve_1x1_convolution.cpp
    │   │   ├── jit_sve_1x1_convolution.hpp
    │   │   ├── jit_sve_512_core_x8s8s32x_deconvolution.cpp
    │   │   ├── jit_sve_512_core_x8s8s32x_deconvolution.hpp
    │   │   ├── jit_sve_512_x8s8s32x_conv_kernel.cpp
    │   │   ├── jit_sve_512_x8s8s32x_conv_kernel.hpp
    │   │   ├── jit_sve_512_x8s8s32x_convolution.cpp
    │   │   ├── jit_sve_512_x8s8s32x_convolution.hpp
    │   │   ├── jit_sve_conv_kernel.cpp
    │   │   ├── jit_sve_conv_kernel.hpp
    │   │   ├── jit_sve_convolution.cpp
    │   │   ├── jit_sve_convolution.hpp
    │   │   ├── jit_uni_1x1_conv_utils.hpp
    │   │   ├── jit_uni_batch_normalization.cpp
    │   │   ├── jit_uni_batch_normalization.hpp
    │   │   ├── jit_uni_batch_normalization_s8.cpp
    │   │   ├── jit_uni_batch_normalization_s8.hpp
    │   │   ├── jit_uni_binary.cpp
    │   │   ├── jit_uni_binary.hpp
    │   │   ├── jit_uni_binary_kernel.cpp
    │   │   ├── jit_uni_binary_kernel.hpp
    │   │   ├── jit_uni_deconv_zp_pad_str_kernel.cpp
    │   │   ├── jit_uni_deconv_zp_pad_str_kernel.hpp
    │   │   ├── jit_uni_dw_conv_kernel_f32.cpp
    │   │   ├── jit_uni_dw_conv_kernel_f32.hpp
    │   │   ├── jit_uni_dw_conv_kernel_utils.hpp
    │   │   ├── jit_uni_dw_convolution.cpp
    │   │   ├── jit_uni_dw_convolution.hpp
    │   │   ├── jit_uni_eltwise.cpp
    │   │   ├── jit_uni_eltwise.hpp
    │   │   ├── jit_uni_eltwise_int.cpp
    │   │   ├── jit_uni_eltwise_int.hpp
    │   │   ├── jit_uni_i8i8_pooling.cpp
    │   │   ├── jit_uni_i8i8_pooling.hpp
    │   │   ├── jit_uni_pool_kernel.cpp
    │   │   ├── jit_uni_pool_kernel.hpp
    │   │   ├── jit_uni_pooling.cpp
    │   │   ├── jit_uni_pooling.hpp
    │   │   ├── jit_uni_reorder.cpp
    │   │   ├── jit_uni_reorder.hpp
    │   │   ├── jit_uni_reorder_utils.cpp
    │   │   ├── jit_uni_softmax.cpp
    │   │   ├── jit_uni_softmax.hpp
    │   │   ├── matmul
    │   │   │   ├── acl_lowp_matmul.cpp
    │   │   │   ├── acl_lowp_matmul.hpp
    │   │   │   ├── acl_lowp_matmul_sq.cpp
    │   │   │   ├── acl_lowp_matmul_sq.hpp
    │   │   │   ├── acl_matmul.cpp
    │   │   │   ├── acl_matmul.hpp
    │   │   │   ├── acl_matmul_utils.cpp
    │   │   │   ├── acl_matmul_utils.hpp
    │   │   │   ├── brgemm_matmul.cpp
    │   │   │   ├── brgemm_matmul.hpp
    │   │   │   ├── brgemm_matmul_copy_utils.cpp
    │   │   │   ├── brgemm_matmul_copy_utils.hpp
    │   │   │   ├── brgemm_matmul_reorders.cpp
    │   │   │   ├── brgemm_matmul_reorders.hpp
    │   │   │   ├── brgemm_matmul_utils.cpp
    │   │   │   ├── brgemm_matmul_utils.hpp
    │   │   │   ├── jit_int8_kernel_types.hpp
    │   │   │   ├── jit_int8_matmul.cpp
    │   │   │   ├── jit_int8_matmul.hpp
    │   │   │   ├── jit_int8_matmul_utils.cpp
    │   │   │   └── jit_int8_matmul_utils.hpp
    │   │   ├── shuffle
    │   │   │   ├── jit_uni_shuffle.cpp
    │   │   │   ├── jit_uni_shuffle.hpp
    │   │   │   ├── jit_uni_shuffle_kernel.cpp
    │   │   │   └── jit_uni_shuffle_kernel.hpp
    │   │   └── utils
    │   │   │   ├── jit_io_helper.cpp
    │   │   │   └── jit_io_helper.hpp
    │   ├── bfloat16.cpp
    │   ├── binary_injector_utils.cpp
    │   ├── binary_injector_utils.hpp
    │   ├── cpu_batch_normalization_list.cpp
    │   ├── cpu_batch_normalization_pd.hpp
    │   ├── cpu_batch_normalization_utils.cpp
    │   ├── cpu_batch_normalization_utils.hpp
    │   ├── cpu_binary_list.cpp
    │   ├── cpu_binary_pd.hpp
    │   ├── cpu_concat.cpp
    │   ├── cpu_concat_pd.hpp
    │   ├── cpu_convolution_list.cpp
    │   ├── cpu_convolution_pd.hpp
    │   ├── cpu_deconvolution_list.cpp
    │   ├── cpu_deconvolution_pd.hpp
    │   ├── cpu_eltwise_list.cpp
    │   ├── cpu_eltwise_pd.hpp
    │   ├── cpu_engine.cpp
    │   ├── cpu_engine.hpp
    │   ├── cpu_group_normalization_list.cpp
    │   ├── cpu_group_normalization_pd.hpp
    │   ├── cpu_inner_product_list.cpp
    │   ├── cpu_inner_product_pd.hpp
    │   ├── cpu_layer_normalization_list.cpp
    │   ├── cpu_layer_normalization_pd.hpp
    │   ├── cpu_lrn_list.cpp
    │   ├── cpu_lrn_pd.hpp
    │   ├── cpu_memory_storage.hpp
    │   ├── cpu_pooling_list.cpp
    │   ├── cpu_pooling_pd.hpp
    │   ├── cpu_prelu_list.cpp
    │   ├── cpu_prelu_pd.hpp
    │   ├── cpu_primitive.hpp
    │   ├── cpu_reduction_list.cpp
    │   ├── cpu_reduction_pd.hpp
    │   ├── cpu_resampling_list.cpp
    │   ├── cpu_resampling_pd.hpp
    │   ├── cpu_rnn_list.cpp
    │   ├── cpu_shuffle_list.cpp
    │   ├── cpu_shuffle_pd.hpp
    │   ├── cpu_softmax_list.cpp
    │   ├── cpu_softmax_pd.hpp
    │   ├── cpu_stream.hpp
    │   ├── cpu_sum.cpp
    │   ├── cpu_sum_pd.hpp
    │   ├── dw_convolution_utils.hpp
    │   ├── float16.cpp
    │   ├── gemm
    │   │   ├── bf16
    │   │   │   ├── ref_gemm_bf16.cpp
    │   │   │   └── ref_gemm_bf16.hpp
    │   │   ├── f32
    │   │   │   ├── gemm_utils_f32.cpp
    │   │   │   ├── gemm_utils_f32.hpp
    │   │   │   ├── ref_gemm_f32.cpp
    │   │   │   └── ref_gemm_f32.hpp
    │   │   ├── gemm.cpp
    │   │   ├── gemm.hpp
    │   │   ├── gemm_msan_unpoison.hpp
    │   │   ├── gemm_pack.cpp
    │   │   ├── gemm_pack.hpp
    │   │   ├── os_blas.hpp
    │   │   └── s8x8s32
    │   │   │   ├── ref_gemm_s8x8s32.cpp
    │   │   │   ├── ref_gemm_s8x8s32.hpp
    │   │   │   ├── simple_gemm_s8s8s32.cpp
    │   │   │   └── simple_gemm_s8s8s32.hpp
    │   ├── gemm_convolution.cpp
    │   ├── gemm_convolution.hpp
    │   ├── gemm_convolution_utils.cpp
    │   ├── gemm_convolution_utils.hpp
    │   ├── gemm_inner_product.cpp
    │   ├── gemm_inner_product.hpp
    │   ├── gemm_inner_product_utils.cpp
    │   ├── gemm_inner_product_utils.hpp
    │   ├── gemm_x8s8s32x_conv_zp_src_pad_comp.cpp
    │   ├── gemm_x8s8s32x_conv_zp_src_pad_comp.hpp
    │   ├── gemm_x8s8s32x_convolution.cpp
    │   ├── gemm_x8s8s32x_convolution.hpp
    │   ├── gemm_x8s8s32x_convolution_utils.cpp
    │   ├── gemm_x8s8s32x_convolution_utils.hpp
    │   ├── gemm_x8s8s32x_inner_product.cpp
    │   ├── gemm_x8s8s32x_inner_product.hpp
    │   ├── jit_utils
    │   │   ├── jit_utils.cpp
    │   │   ├── jit_utils.hpp
    │   │   └── linux_perf
    │   │   │   ├── README.md
    │   │   │   ├── linux_perf.cpp
    │   │   │   └── linux_perf.hpp
    │   ├── matmul
    │   │   ├── cpu_matmul_list.cpp
    │   │   ├── cpu_matmul_pd.hpp
    │   │   ├── gemm_based_common.hpp
    │   │   ├── gemm_bf16_matmul.cpp
    │   │   ├── gemm_bf16_matmul.hpp
    │   │   ├── gemm_f32_matmul.cpp
    │   │   ├── gemm_f32_matmul.hpp
    │   │   ├── gemm_x8s8s32x_matmul.cpp
    │   │   ├── gemm_x8s8s32x_matmul.hpp
    │   │   ├── matmul_utils.hpp
    │   │   ├── ref_matmul.cpp
    │   │   ├── ref_matmul.hpp
    │   │   ├── ref_matmul_int8.cpp
    │   │   ├── ref_matmul_int8.hpp
    │   │   ├── ref_sparse_matmul.cpp
    │   │   └── ref_sparse_matmul.hpp
    │   ├── nchw_pooling.cpp
    │   ├── nchw_pooling.hpp
    │   ├── ncsp_batch_normalization.cpp
    │   ├── ncsp_batch_normalization.hpp
    │   ├── ncsp_group_normalization.cpp
    │   ├── ncsp_group_normalization.hpp
    │   ├── nhwc_pooling.cpp
    │   ├── nhwc_pooling.hpp
    │   ├── nspc_batch_normalization.cpp
    │   ├── nspc_batch_normalization.hpp
    │   ├── platform.cpp
    │   ├── platform.hpp
    │   ├── ppc64
    │   │   ├── CMakeLists.txt
    │   │   ├── gemm
    │   │   │   ├── gemm_driver.cpp
    │   │   │   ├── gemm_driver.hpp
    │   │   │   ├── gemm_info.cpp
    │   │   │   ├── gemm_info.hpp
    │   │   │   ├── gemm_pack_storage.hpp
    │   │   │   ├── gemm_partition.hpp
    │   │   │   ├── gemm_threading.hpp
    │   │   │   └── gemm_utils.hpp
    │   │   ├── ppc64_gemm_driver.hpp
    │   │   ├── ppc64_gemm_reorder.cpp
    │   │   ├── ppc64_gemm_reorder.hpp
    │   │   ├── ppc64_gemm_s8x8s32.cpp
    │   │   └── ppc64_gemm_s8x8s32.hpp
    │   ├── primitive_attr_postops.cpp
    │   ├── primitive_attr_postops.hpp
    │   ├── ref_batch_normalization.cpp
    │   ├── ref_batch_normalization.hpp
    │   ├── ref_binary.cpp
    │   ├── ref_binary.hpp
    │   ├── ref_concat.hpp
    │   ├── ref_convolution.cpp
    │   ├── ref_convolution.hpp
    │   ├── ref_convolution_int8.cpp
    │   ├── ref_convolution_int8.hpp
    │   ├── ref_convolution_utils.hpp
    │   ├── ref_deconvolution.cpp
    │   ├── ref_deconvolution.hpp
    │   ├── ref_eltwise.cpp
    │   ├── ref_eltwise.hpp
    │   ├── ref_fused_convolution.hpp
    │   ├── ref_group_normalization.cpp
    │   ├── ref_group_normalization.hpp
    │   ├── ref_inner_product.cpp
    │   ├── ref_inner_product.hpp
    │   ├── ref_inner_product_int8.cpp
    │   ├── ref_inner_product_int8.hpp
    │   ├── ref_inner_product_utils.hpp
    │   ├── ref_io_helper.hpp
    │   ├── ref_layer_normalization.cpp
    │   ├── ref_layer_normalization.hpp
    │   ├── ref_lrn.cpp
    │   ├── ref_lrn.hpp
    │   ├── ref_pooling.cpp
    │   ├── ref_pooling.hpp
    │   ├── ref_prelu.cpp
    │   ├── ref_prelu.hpp
    │   ├── ref_reduction.cpp
    │   ├── ref_reduction.hpp
    │   ├── ref_resampling.cpp
    │   ├── ref_resampling.hpp
    │   ├── ref_shuffle.cpp
    │   ├── ref_shuffle.hpp
    │   ├── ref_softmax.cpp
    │   ├── ref_softmax.hpp
    │   ├── ref_sum.hpp
    │   ├── reorder
    │   │   ├── cpu_reorder.cpp
    │   │   ├── cpu_reorder.hpp
    │   │   ├── cpu_reorder_comp_bf16_s8.cpp
    │   │   ├── cpu_reorder_comp_f32_s8.cpp
    │   │   ├── cpu_reorder_comp_s8_s8.cpp
    │   │   ├── cpu_reorder_pd.hpp
    │   │   ├── cpu_reorder_regular_bf16.cpp
    │   │   ├── cpu_reorder_regular_f16.cpp
    │   │   ├── cpu_reorder_regular_f32_bf16.cpp
    │   │   ├── cpu_reorder_regular_f32_f16.cpp
    │   │   ├── cpu_reorder_regular_f32_f32.cpp
    │   │   ├── cpu_reorder_regular_f32_fp8.cpp
    │   │   ├── cpu_reorder_regular_f32_s32.cpp
    │   │   ├── cpu_reorder_regular_f32_s8.cpp
    │   │   ├── cpu_reorder_regular_f32_u8.cpp
    │   │   ├── cpu_reorder_regular_fp4.cpp
    │   │   ├── cpu_reorder_regular_fp8.cpp
    │   │   ├── cpu_reorder_regular_s32.cpp
    │   │   ├── cpu_reorder_regular_s4.cpp
    │   │   ├── cpu_reorder_regular_s8.cpp
    │   │   ├── cpu_reorder_regular_u4.cpp
    │   │   ├── cpu_reorder_regular_u8.cpp
    │   │   ├── simple_reorder.hpp
    │   │   └── simple_sparse_reorder.hpp
    │   ├── resampling_utils.hpp
    │   ├── rnn
    │   │   ├── brgemm_cell_common.cpp
    │   │   ├── cell_common.cpp
    │   │   ├── cell_gru.cpp
    │   │   ├── cell_gru_lbr.cpp
    │   │   ├── cpu_rnn_pd.hpp
    │   │   ├── postgemm_dispatcher.hpp
    │   │   ├── ref_postgemm_gru.cpp
    │   │   ├── ref_postgemm_gru_lbr.cpp
    │   │   ├── ref_postgemm_lstm.cpp
    │   │   ├── ref_postgemm_lstm_projection.cpp
    │   │   ├── ref_postgemm_rnn.cpp
    │   │   ├── ref_rnn.cpp
    │   │   ├── ref_rnn.hpp
    │   │   ├── rnn_reorders.hpp
    │   │   ├── rnn_utils.cpp
    │   │   └── rnn_utils.hpp
    │   ├── rv64
    │   │   ├── CMakeLists.txt
    │   │   ├── rvv_nchw_pooling.cpp
    │   │   └── rvv_nchw_pooling.hpp
    │   ├── s390x
    │   │   ├── CMakeLists.txt
    │   │   ├── gemm.h
    │   │   ├── gemmu16.cpp
    │   │   ├── helpers.h
    │   │   ├── kernel_s16s16s32.hpp
    │   │   └── pack.hpp
    │   ├── scale_utils.cpp
    │   ├── scale_utils.hpp
    │   ├── simple_concat.cpp
    │   ├── simple_concat.hpp
    │   ├── simple_layer_normalization.cpp
    │   ├── simple_layer_normalization.hpp
    │   ├── simple_q10n.hpp
    │   ├── simple_resampling.cpp
    │   ├── simple_resampling.hpp
    │   ├── simple_sum.cpp
    │   ├── simple_sum.hpp
    │   ├── sycl
    │   │   ├── CMakeLists.txt
    │   │   ├── engine.cpp
    │   │   ├── engine.hpp
    │   │   ├── stream.cpp
    │   │   ├── stream.hpp
    │   │   ├── stream_cpu_thunk.cpp
    │   │   ├── stream_cpu_thunk.hpp
    │   │   ├── stream_submit_cpu_primitive.cpp
    │   │   └── stream_submit_cpu_primitive.hpp
    │   ├── ukernel
    │   │   ├── attr_params.cpp
    │   │   ├── brgemm.cpp
    │   │   ├── c_types_map.hpp
    │   │   └── transform.cpp
    │   ├── x64
    │   │   ├── CMakeLists.txt
    │   │   ├── amx_tile_configure.cpp
    │   │   ├── amx_tile_configure.hpp
    │   │   ├── brgemm
    │   │   │   ├── brgemm.cpp
    │   │   │   ├── brgemm.hpp
    │   │   │   ├── brgemm_containers.cpp
    │   │   │   ├── brgemm_containers.hpp
    │   │   │   ├── brgemm_types.hpp
    │   │   │   ├── brgemm_utils.cpp
    │   │   │   ├── brgemm_utils.hpp
    │   │   │   ├── jit_brdgmm_kernel.cpp
    │   │   │   ├── jit_brdgmm_kernel.hpp
    │   │   │   ├── jit_brgemm_amx_uker.cpp
    │   │   │   └── jit_brgemm_kernel.cpp
    │   │   ├── cpu_barrier.cpp
    │   │   ├── cpu_barrier.hpp
    │   │   ├── cpu_isa_traits.cpp
    │   │   ├── cpu_isa_traits.hpp
    │   │   ├── cpu_reducer.cpp
    │   │   ├── cpu_reducer.hpp
    │   │   ├── gemm
    │   │   │   ├── amx
    │   │   │   │   ├── jit_avx512_core_amx_copy_kern.cpp
    │   │   │   │   ├── jit_avx512_core_amx_copy_kern.hpp
    │   │   │   │   ├── jit_avx512_core_amx_gemm_kern.cpp
    │   │   │   │   └── jit_avx512_core_amx_gemm_kern.hpp
    │   │   │   ├── bf16
    │   │   │   │   ├── common_s16.hpp
    │   │   │   │   ├── jit_avx512_core_gemm_bf16bf16f32_kern.cpp
    │   │   │   │   ├── jit_avx512_core_gemm_bf16bf16f32_kern.hpp
    │   │   │   │   ├── jit_avx512_core_gemv_bf16bf16f32_kern.cpp
    │   │   │   │   ├── jit_avx512_core_gemv_bf16bf16f32_kern.hpp
    │   │   │   │   ├── jit_avx512_core_s16_24x8_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_s16_24x8_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_s16_24x8_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_s16_24x8_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_s16_48x8_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_s16_48x8_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_s16_48x8_copy_bn_kern_autogen.cpp
    │   │   │   │   └── jit_avx512_core_s16_48x8_copy_bt_kern_autogen.cpp
    │   │   │   ├── f32
    │   │   │   │   ├── common_f32.hpp
    │   │   │   │   ├── jit_avx2_f32_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_f32_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_f32_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_f32_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_kernel_sgemm_kern.cpp
    │   │   │   │   ├── jit_avx2_kernel_sgemm_kern.hpp
    │   │   │   │   ├── jit_avx512_common_gemm_f32.cpp
    │   │   │   │   ├── jit_avx512_common_gemm_f32.hpp
    │   │   │   │   ├── jit_avx512_core_f32_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_f32_copy_at_kern_autogen.hpp
    │   │   │   │   ├── jit_avx512_core_f32_copy_at_kern_part1_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_f32_copy_at_kern_part2_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_f32_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_f32_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_gemm_smalln_tn_f32_kern.cpp
    │   │   │   │   ├── jit_avx512_core_gemm_smalln_tn_f32_kern.hpp
    │   │   │   │   ├── jit_avx_f32_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_f32_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_f32_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_f32_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_gemm_f32.cpp
    │   │   │   │   ├── jit_avx_gemm_f32.hpp
    │   │   │   │   ├── jit_avx_gemv_t_f32_kern.cpp
    │   │   │   │   ├── jit_avx_gemv_t_f32_kern.hpp
    │   │   │   │   ├── jit_avx_kernel_b0_sgemm_kern_autogen.hpp
    │   │   │   │   ├── jit_avx_kernel_b0_sgemm_kern_part1_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_b0_sgemm_kern_part2_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_sgemm_kern_autogen.hpp
    │   │   │   │   ├── jit_avx_kernel_sgemm_kern_part1_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_sgemm_kern_part2_autogen.cpp
    │   │   │   │   ├── jit_sse41_f32_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_f32_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_f32_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_f32_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_gemv_n_f32_kern.cpp
    │   │   │   │   ├── jit_sse41_gemv_n_f32_kern.hpp
    │   │   │   │   ├── jit_sse41_gemv_t_f32_kern.cpp
    │   │   │   │   ├── jit_sse41_gemv_t_f32_kern.hpp
    │   │   │   │   ├── jit_sse41_kernel_b0_sgemm_kern_autogen.cpp
    │   │   │   │   └── jit_sse41_kernel_sgemm_kern_autogen.cpp
    │   │   │   ├── gemm_driver.cpp
    │   │   │   ├── gemm_driver.hpp
    │   │   │   ├── gemm_info.cpp
    │   │   │   ├── gemm_info.hpp
    │   │   │   ├── gemm_pack.cpp
    │   │   │   ├── gemm_pack.hpp
    │   │   │   ├── gemm_pack_storage.hpp
    │   │   │   ├── gemm_partition.hpp
    │   │   │   ├── gemm_threading.hpp
    │   │   │   ├── gemm_utils.hpp
    │   │   │   ├── gemv_driver.cpp
    │   │   │   ├── gemv_driver.hpp
    │   │   │   └── s8x8s32
    │   │   │   │   ├── common_u8.hpp
    │   │   │   │   ├── jit_avx2_gemm_s8u8s32_kern.cpp
    │   │   │   │   ├── jit_avx2_gemm_s8u8s32_kern.hpp
    │   │   │   │   ├── jit_avx2_u8_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_u8_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_u8_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_u8_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_u8_copy_sum_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_u8_copy_sum_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_u8_copy_sum_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_u8_copy_sum_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_sum_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_sum_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_sum_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx2_vnni_u8_copy_sum_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_gemm_s8u8s32_kern.cpp
    │   │   │   │   ├── jit_avx512_core_gemm_s8u8s32_kern.hpp
    │   │   │   │   ├── jit_avx512_core_gemv_s8x8s32.cpp
    │   │   │   │   ├── jit_avx512_core_gemv_s8x8s32.hpp
    │   │   │   │   ├── jit_avx512_core_kernel_gemv_s8x8s32_kern.cpp
    │   │   │   │   ├── jit_avx512_core_kernel_gemv_s8x8s32_kern.hpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_sum_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_sum_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_sum_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx512_core_u8_copy_sum_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_b0_b_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_b0_c_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_b0_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_b0_r_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_b_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_c_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_kernel_r_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_sum_an_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_sum_at_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_sum_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_avx_u8_copy_sum_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_b0_b_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_b0_c_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_b0_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_b0_r_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_b_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_c_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_kernel_r_gemm_s8u8s32_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_u8_copy_an_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_u8_copy_at_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_u8_copy_bn_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_u8_copy_bt_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_u8_copy_sum_an_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_u8_copy_sum_at_kern_autogen.cpp
    │   │   │   │   ├── jit_sse41_u8_copy_sum_bn_kern_autogen.cpp
    │   │   │   │   └── jit_sse41_u8_copy_sum_bt_kern_autogen.cpp
    │   │   ├── gemm_bf16_convolution.cpp
    │   │   ├── gemm_bf16_convolution.hpp
    │   │   ├── gemm_bf16_inner_product.cpp
    │   │   ├── gemm_bf16_inner_product.hpp
    │   │   ├── injectors
    │   │   │   ├── injector_utils.cpp
    │   │   │   ├── injector_utils.hpp
    │   │   │   ├── jit_uni_binary_injector.cpp
    │   │   │   ├── jit_uni_binary_injector.hpp
    │   │   │   ├── jit_uni_eltwise_injector.cpp
    │   │   │   ├── jit_uni_eltwise_injector.hpp
    │   │   │   ├── jit_uni_postops_injector.cpp
    │   │   │   └── jit_uni_postops_injector.hpp
    │   │   ├── ip_convolution.cpp
    │   │   ├── ip_convolution.hpp
    │   │   ├── jit_avx2_1x1_conv_kernel_f32.cpp
    │   │   ├── jit_avx2_1x1_conv_kernel_f32.hpp
    │   │   ├── jit_avx2_1x1_convolution.cpp
    │   │   ├── jit_avx2_1x1_convolution.hpp
    │   │   ├── jit_avx2_conv_kernel_f32.cpp
    │   │   ├── jit_avx2_conv_kernel_f32.hpp
    │   │   ├── jit_avx2_convolution.cpp
    │   │   ├── jit_avx2_convolution.hpp
    │   │   ├── jit_avx512_common_1x1_conv_kernel.cpp
    │   │   ├── jit_avx512_common_1x1_conv_kernel.hpp
    │   │   ├── jit_avx512_common_1x1_convolution.cpp
    │   │   ├── jit_avx512_common_1x1_convolution.hpp
    │   │   ├── jit_avx512_common_conv_kernel.cpp
    │   │   ├── jit_avx512_common_conv_kernel.hpp
    │   │   ├── jit_avx512_common_convolution.cpp
    │   │   ├── jit_avx512_common_convolution.hpp
    │   │   ├── jit_avx512_core_amx_1x1_conv_kernel.cpp
    │   │   ├── jit_avx512_core_amx_1x1_conv_kernel.hpp
    │   │   ├── jit_avx512_core_amx_1x1_convolution.cpp
    │   │   ├── jit_avx512_core_amx_1x1_convolution.hpp
    │   │   ├── jit_avx512_core_amx_conv_kernel.cpp
    │   │   ├── jit_avx512_core_amx_conv_kernel.hpp
    │   │   ├── jit_avx512_core_amx_conv_utils.hpp
    │   │   ├── jit_avx512_core_amx_convolution.cpp
    │   │   ├── jit_avx512_core_amx_convolution.hpp
    │   │   ├── jit_avx512_core_amx_deconvolution.cpp
    │   │   ├── jit_avx512_core_amx_deconvolution.hpp
    │   │   ├── jit_avx512_core_bf16_1x1_conv_kernel.cpp
    │   │   ├── jit_avx512_core_bf16_1x1_conv_kernel.hpp
    │   │   ├── jit_avx512_core_bf16_1x1_convolution.cpp
    │   │   ├── jit_avx512_core_bf16_1x1_convolution.hpp
    │   │   ├── jit_avx512_core_bf16_conv_kernel.cpp
    │   │   ├── jit_avx512_core_bf16_conv_kernel.hpp
    │   │   ├── jit_avx512_core_bf16_convolution.cpp
    │   │   ├── jit_avx512_core_bf16_convolution.hpp
    │   │   ├── jit_avx512_core_bf16_dw_conv_kernel.cpp
    │   │   ├── jit_avx512_core_bf16_dw_conv_kernel.hpp
    │   │   ├── jit_avx512_core_bf16cvt.hpp
    │   │   ├── jit_avx512_core_f16_dw_conv_kernel.cpp
    │   │   ├── jit_avx512_core_f16_dw_conv_kernel.hpp
    │   │   ├── jit_avx512_core_fp16cvt.cpp
    │   │   ├── jit_avx512_core_fp16cvt.hpp
    │   │   ├── jit_avx512_core_fp8cvt.cpp
    │   │   ├── jit_avx512_core_fp8cvt.hpp
    │   │   ├── jit_avx512_core_resampling.cpp
    │   │   ├── jit_avx512_core_resampling.hpp
    │   │   ├── jit_avx512_core_scale_precompute.cpp
    │   │   ├── jit_avx512_core_scale_precompute.hpp
    │   │   ├── jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp
    │   │   ├── jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp
    │   │   ├── jit_avx512_core_x8s8s32x_1x1_convolution.cpp
    │   │   ├── jit_avx512_core_x8s8s32x_1x1_convolution.hpp
    │   │   ├── jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp
    │   │   ├── jit_avx512_core_x8s8s32x_conv_kernel.cpp
    │   │   ├── jit_avx512_core_x8s8s32x_conv_kernel.hpp
    │   │   ├── jit_avx512_core_x8s8s32x_convolution.cpp
    │   │   ├── jit_avx512_core_x8s8s32x_convolution.hpp
    │   │   ├── jit_avx512_core_x8s8s32x_deconvolution.cpp
    │   │   ├── jit_avx512_core_x8s8s32x_deconvolution.hpp
    │   │   ├── jit_avx512_sparse_decompress_kernel.cpp
    │   │   ├── jit_avx512_sparse_decompress_kernel.hpp
    │   │   ├── jit_brdgmm_dw_conv.cpp
    │   │   ├── jit_brdgmm_dw_conv.hpp
    │   │   ├── jit_brgemm_1x1_conv.cpp
    │   │   ├── jit_brgemm_1x1_conv.hpp
    │   │   ├── jit_brgemm_conv.cpp
    │   │   ├── jit_brgemm_conv.hpp
    │   │   ├── jit_brgemm_conv_bwd.cpp
    │   │   ├── jit_brgemm_conv_bwd.hpp
    │   │   ├── jit_brgemm_conv_bwd_copy_kernel.cpp
    │   │   ├── jit_brgemm_conv_bwd_copy_kernel.hpp
    │   │   ├── jit_brgemm_conv_bwd_strided.cpp
    │   │   ├── jit_brgemm_conv_bwd_strided.hpp
    │   │   ├── jit_brgemm_conv_bwd_trans_kernel.cpp
    │   │   ├── jit_brgemm_conv_bwd_trans_kernel.hpp
    │   │   ├── jit_brgemm_conv_bwd_utils.cpp
    │   │   ├── jit_brgemm_conv_bwd_utils.hpp
    │   │   ├── jit_brgemm_conv_bwd_w.cpp
    │   │   ├── jit_brgemm_conv_bwd_w.hpp
    │   │   ├── jit_brgemm_conv_comp_pad_kernel.cpp
    │   │   ├── jit_brgemm_conv_comp_pad_kernel.hpp
    │   │   ├── jit_brgemm_conv_trans_kernel.cpp
    │   │   ├── jit_brgemm_conv_trans_kernel.hpp
    │   │   ├── jit_brgemm_conv_utils.cpp
    │   │   ├── jit_brgemm_conv_utils.hpp
    │   │   ├── jit_brgemm_deconv.cpp
    │   │   ├── jit_brgemm_deconv.hpp
    │   │   ├── jit_brgemm_inner_product.cpp
    │   │   ├── jit_brgemm_inner_product.hpp
    │   │   ├── jit_brgemm_inner_product_utils.cpp
    │   │   ├── jit_brgemm_inner_product_utils.hpp
    │   │   ├── jit_brgemm_post_ops.cpp
    │   │   ├── jit_brgemm_post_ops.hpp
    │   │   ├── jit_brgemm_primitive_conf.cpp
    │   │   ├── jit_brgemm_primitive_conf.hpp
    │   │   ├── jit_brgemm_transpose_utils.cpp
    │   │   ├── jit_brgemm_transpose_utils.hpp
    │   │   ├── jit_gemm_inner_product_utils.cpp
    │   │   ├── jit_gemm_inner_product_utils.hpp
    │   │   ├── jit_gemm_x8s8s32x_conv_zp_src_pad_comp.cpp
    │   │   ├── jit_gemm_x8s8s32x_conv_zp_src_pad_comp.hpp
    │   │   ├── jit_gemm_x8s8s32x_convolution_utils.cpp
    │   │   ├── jit_gemm_x8s8s32x_convolution_utils.hpp
    │   │   ├── jit_generator.cpp
    │   │   ├── jit_generator.hpp
    │   │   ├── jit_primitive_conf.hpp
    │   │   ├── jit_sse41_1x1_conv_kernel_f32.cpp
    │   │   ├── jit_sse41_1x1_conv_kernel_f32.hpp
    │   │   ├── jit_sse41_1x1_convolution.cpp
    │   │   ├── jit_sse41_1x1_convolution.hpp
    │   │   ├── jit_sse41_conv_kernel_f32.cpp
    │   │   ├── jit_sse41_conv_kernel_f32.hpp
    │   │   ├── jit_sse41_convolution.cpp
    │   │   ├── jit_sse41_convolution.hpp
    │   │   ├── jit_transpose_utils.cpp
    │   │   ├── jit_transpose_utils.hpp
    │   │   ├── jit_uni_1x1_conv_utils.hpp
    │   │   ├── jit_uni_batch_normalization.cpp
    │   │   ├── jit_uni_batch_normalization.hpp
    │   │   ├── jit_uni_batch_normalization_s8.cpp
    │   │   ├── jit_uni_batch_normalization_s8.hpp
    │   │   ├── jit_uni_binary.cpp
    │   │   ├── jit_uni_binary.hpp
    │   │   ├── jit_uni_binary_kernel.cpp
    │   │   ├── jit_uni_binary_kernel.hpp
    │   │   ├── jit_uni_convert_xf16.cpp
    │   │   ├── jit_uni_convert_xf16.hpp
    │   │   ├── jit_uni_deconv_zp_pad_str_kernel.cpp
    │   │   ├── jit_uni_deconv_zp_pad_str_kernel.hpp
    │   │   ├── jit_uni_dw_conv_kernel_f32.cpp
    │   │   ├── jit_uni_dw_conv_kernel_f32.hpp
    │   │   ├── jit_uni_dw_conv_kernel_utils.cpp
    │   │   ├── jit_uni_dw_conv_kernel_utils.hpp
    │   │   ├── jit_uni_dw_convolution.cpp
    │   │   ├── jit_uni_dw_convolution.hpp
    │   │   ├── jit_uni_eltwise.cpp
    │   │   ├── jit_uni_eltwise.hpp
    │   │   ├── jit_uni_eltwise_int.cpp
    │   │   ├── jit_uni_eltwise_int.hpp
    │   │   ├── jit_uni_group_normalization.cpp
    │   │   ├── jit_uni_group_normalization.hpp
    │   │   ├── jit_uni_i8i8_pooling.cpp
    │   │   ├── jit_uni_i8i8_pooling.hpp
    │   │   ├── jit_uni_instance_normalization.cpp
    │   │   ├── jit_uni_instance_normalization.hpp
    │   │   ├── jit_uni_layer_normalization.cpp
    │   │   ├── jit_uni_layer_normalization.hpp
    │   │   ├── jit_uni_ncsp_convolution.cpp
    │   │   ├── jit_uni_ncsp_convolution.hpp
    │   │   ├── jit_uni_pool_kernel.cpp
    │   │   ├── jit_uni_pool_kernel.hpp
    │   │   ├── jit_uni_pooling.cpp
    │   │   ├── jit_uni_pooling.hpp
    │   │   ├── jit_uni_reduction.cpp
    │   │   ├── jit_uni_reduction.hpp
    │   │   ├── jit_uni_reduction_kernel.cpp
    │   │   ├── jit_uni_reduction_kernel.hpp
    │   │   ├── jit_uni_reorder.cpp
    │   │   ├── jit_uni_reorder.hpp
    │   │   ├── jit_uni_reorder_direct_copy.cpp
    │   │   ├── jit_uni_reorder_direct_copy.hpp
    │   │   ├── jit_uni_reorder_utils.cpp
    │   │   ├── jit_uni_resampling.cpp
    │   │   ├── jit_uni_resampling.hpp
    │   │   ├── jit_uni_resampling_kernel.cpp
    │   │   ├── jit_uni_resampling_kernel.hpp
    │   │   ├── jit_uni_softmax.cpp
    │   │   ├── jit_uni_softmax.hpp
    │   │   ├── jit_uni_tbb_batch_normalization.cpp
    │   │   ├── jit_uni_tbb_batch_normalization.hpp
    │   │   ├── jit_uni_x8s8s32x_1x1_conv_kernel.cpp
    │   │   ├── jit_uni_x8s8s32x_1x1_conv_kernel.hpp
    │   │   ├── jit_uni_x8s8s32x_1x1_convolution.cpp
    │   │   ├── jit_uni_x8s8s32x_1x1_convolution.hpp
    │   │   ├── jit_uni_x8s8s32x_1x1_deconvolution.hpp
    │   │   ├── jit_uni_x8s8s32x_conv_kernel.cpp
    │   │   ├── jit_uni_x8s8s32x_conv_kernel.hpp
    │   │   ├── jit_uni_x8s8s32x_convolution.cpp
    │   │   ├── jit_uni_x8s8s32x_convolution.hpp
    │   │   ├── jit_uni_x8s8s32x_deconvolution.cpp
    │   │   ├── jit_uni_x8s8s32x_deconvolution.hpp
    │   │   ├── jit_uni_xf16_sum.cpp
    │   │   ├── jit_uni_xf16_sum.hpp
    │   │   ├── lrn
    │   │   │   ├── jit_avx512_common_lrn.cpp
    │   │   │   ├── jit_avx512_common_lrn.hpp
    │   │   │   ├── jit_avx512_common_lrn_bwd_base.cpp
    │   │   │   ├── jit_avx512_common_lrn_bwd_base.hpp
    │   │   │   ├── jit_avx512_common_lrn_bwd_blocked.cpp
    │   │   │   ├── jit_avx512_common_lrn_bwd_blocked.hpp
    │   │   │   ├── jit_avx512_common_lrn_bwd_nhwc.cpp
    │   │   │   ├── jit_avx512_common_lrn_bwd_nhwc.hpp
    │   │   │   ├── jit_avx512_common_lrn_fwd_base.cpp
    │   │   │   ├── jit_avx512_common_lrn_fwd_base.hpp
    │   │   │   ├── jit_avx512_common_lrn_fwd_blocked.cpp
    │   │   │   ├── jit_avx512_common_lrn_fwd_blocked.hpp
    │   │   │   ├── jit_avx512_common_lrn_fwd_nhwc.cpp
    │   │   │   ├── jit_avx512_common_lrn_fwd_nhwc.hpp
    │   │   │   ├── jit_avx512_common_lrn_utils.hpp
    │   │   │   ├── jit_uni_lrn.cpp
    │   │   │   ├── jit_uni_lrn.hpp
    │   │   │   ├── jit_uni_lrn_kernel.cpp
    │   │   │   ├── jit_uni_lrn_kernel.hpp
    │   │   │   ├── lrn_avx512_blocked_executor.hpp
    │   │   │   ├── lrn_avx512_nhwc_executor.hpp
    │   │   │   ├── lrn_executor.hpp
    │   │   │   └── lrn_executor_factory.hpp
    │   │   ├── matmul
    │   │   │   ├── amx_blocking_heuristics.cpp
    │   │   │   ├── amx_blocking_heuristics.hpp
    │   │   │   ├── brgemm_matmul.cpp
    │   │   │   ├── brgemm_matmul.hpp
    │   │   │   ├── brgemm_matmul_copy_utils.cpp
    │   │   │   ├── brgemm_matmul_copy_utils.hpp
    │   │   │   ├── brgemm_matmul_reorders.cpp
    │   │   │   ├── brgemm_matmul_reorders.hpp
    │   │   │   ├── brgemm_matmul_utils.cpp
    │   │   │   ├── brgemm_matmul_utils.hpp
    │   │   │   ├── jit_uni_sparse_matmul.cpp
    │   │   │   └── jit_uni_sparse_matmul.hpp
    │   │   ├── matmul_inner_product.cpp
    │   │   ├── matmul_inner_product.hpp
    │   │   ├── prelu
    │   │   │   ├── jit_prelu_backward.cpp
    │   │   │   ├── jit_prelu_backward.hpp
    │   │   │   ├── jit_prelu_base_kernel.cpp
    │   │   │   ├── jit_prelu_base_kernel.hpp
    │   │   │   ├── jit_prelu_forward.cpp
    │   │   │   ├── jit_prelu_forward.hpp
    │   │   │   ├── jit_prelu_reduction_kernel.cpp
    │   │   │   ├── jit_prelu_reduction_kernel.hpp
    │   │   │   ├── jit_prelu_utils.cpp
    │   │   │   ├── jit_prelu_utils.hpp
    │   │   │   ├── jit_uni_prelu_backward_kernel.cpp
    │   │   │   ├── jit_uni_prelu_backward_kernel.hpp
    │   │   │   ├── jit_uni_prelu_forward_kernel.cpp
    │   │   │   └── jit_uni_prelu_forward_kernel.hpp
    │   │   ├── rnn
    │   │   │   ├── brgemm_cell_common_bwd.cpp
    │   │   │   ├── brgemm_cell_common_bwd.hpp
    │   │   │   ├── brgemm_cell_common_fwd.cpp
    │   │   │   ├── brgemm_cell_common_fwd.hpp
    │   │   │   ├── brgemm_cell_common_reorders.cpp
    │   │   │   ├── brgemm_cell_common_reorders.hpp
    │   │   │   ├── brgemm_cell_common_utils.cpp
    │   │   │   ├── brgemm_cell_common_utils.hpp
    │   │   │   ├── jit_brgemm_transpose_single_row.cpp
    │   │   │   ├── jit_brgemm_transpose_single_row.hpp
    │   │   │   ├── jit_diff_weights_peephole.cpp
    │   │   │   ├── jit_diff_weights_peephole.hpp
    │   │   │   ├── jit_gates_reduction.cpp
    │   │   │   ├── jit_gates_reduction.hpp
    │   │   │   ├── jit_uni_gru_cell_postgemm_1_bwd.hpp
    │   │   │   ├── jit_uni_gru_cell_postgemm_1_fwd.hpp
    │   │   │   ├── jit_uni_gru_cell_postgemm_2_bwd.hpp
    │   │   │   ├── jit_uni_gru_cell_postgemm_2_fwd.hpp
    │   │   │   ├── jit_uni_gru_lbr_cell_postgemm_bwd.hpp
    │   │   │   ├── jit_uni_gru_lbr_cell_postgemm_fwd.hpp
    │   │   │   ├── jit_uni_lstm_cell_postgemm.hpp
    │   │   │   ├── jit_uni_lstm_cell_postgemm_bwd.hpp
    │   │   │   ├── jit_uni_lstm_cell_postgemm_fwd.hpp
    │   │   │   ├── jit_uni_lstm_cell_projection_postgemm_fwd.hpp
    │   │   │   ├── jit_uni_rnn_cell_postgemm_bwd.hpp
    │   │   │   ├── jit_uni_rnn_cell_postgemm_fwd.hpp
    │   │   │   ├── jit_uni_rnn_common_postgemm.hpp
    │   │   │   ├── rnn_brgemm_utils.cpp
    │   │   │   └── rnn_brgemm_utils.hpp
    │   │   ├── shuffle
    │   │   │   ├── jit_uni_shuffle.cpp
    │   │   │   ├── jit_uni_shuffle.hpp
    │   │   │   ├── jit_uni_shuffle_kernel.cpp
    │   │   │   └── jit_uni_shuffle_kernel.hpp
    │   │   ├── ukernel
    │   │   │   ├── attr_params.cpp
    │   │   │   ├── attr_params.hpp
    │   │   │   ├── brgemm.cpp
    │   │   │   ├── brgemm.hpp
    │   │   │   ├── transform.cpp
    │   │   │   └── transform.hpp
    │   │   └── utils
    │   │   │   ├── jit_io_helper.cpp
    │   │   │   └── jit_io_helper.hpp
    │   ├── zero_point_utils.cpp
    │   └── zero_point_utils.hpp
    ├── gpu
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── amd
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   ├── engine.cpp
    │   │   ├── engine.hpp
    │   │   ├── miopen_batch_normalization.cpp
    │   │   ├── miopen_batch_normalization.hpp
    │   │   ├── miopen_batch_normalization_executor.hpp
    │   │   ├── miopen_batch_normalization_impl.hpp
    │   │   ├── miopen_binary.cpp
    │   │   ├── miopen_binary.hpp
    │   │   ├── miopen_binary_impl.hpp
    │   │   ├── miopen_conv_filter_adjustment_base.hpp
    │   │   ├── miopen_convolution.cpp
    │   │   ├── miopen_convolution.hpp
    │   │   ├── miopen_convolution_impl.hpp
    │   │   ├── miopen_convolution_pd.hpp
    │   │   ├── miopen_deconvolution.cpp
    │   │   ├── miopen_deconvolution.hpp
    │   │   ├── miopen_deconvolution_impl.hpp
    │   │   ├── miopen_eltwise.cpp
    │   │   ├── miopen_eltwise.hpp
    │   │   ├── miopen_eltwise_impl.hpp
    │   │   ├── miopen_gemm_inner_product.hpp
    │   │   ├── miopen_gemm_inner_product_impl.hpp
    │   │   ├── miopen_inner_product.cpp
    │   │   ├── miopen_inner_product.hpp
    │   │   ├── miopen_inner_product_impl.hpp
    │   │   ├── miopen_lrn.cpp
    │   │   ├── miopen_lrn.hpp
    │   │   ├── miopen_lrn_impl.hpp
    │   │   ├── miopen_matmul.cpp
    │   │   ├── miopen_matmul.hpp
    │   │   ├── miopen_matmul_executor.hpp
    │   │   ├── miopen_matmul_impl.hpp
    │   │   ├── miopen_pooling.cpp
    │   │   ├── miopen_pooling.hpp
    │   │   ├── miopen_pooling_impl.hpp
    │   │   ├── miopen_reduction.cpp
    │   │   ├── miopen_reduction.hpp
    │   │   ├── miopen_reduction_impl.hpp
    │   │   ├── miopen_reorder.cpp
    │   │   ├── miopen_reorder.hpp
    │   │   ├── miopen_reorder_impl.hpp
    │   │   ├── miopen_softmax.cpp
    │   │   ├── miopen_softmax.hpp
    │   │   ├── miopen_softmax_impl.hpp
    │   │   ├── stream.cpp
    │   │   ├── stream.hpp
    │   │   ├── sycl_hip_compat.cpp
    │   │   ├── sycl_hip_compat.hpp
    │   │   ├── sycl_hip_scoped_context.cpp
    │   │   ├── sycl_hip_scoped_context.hpp
    │   │   ├── sycl_hip_utils.cpp
    │   │   └── sycl_hip_utils.hpp
    │   ├── generic
    │   │   ├── CMakeLists.txt
    │   │   ├── convolution_deconvolution.hpp
    │   │   ├── cross_engine_reorder.cpp
    │   │   ├── cross_engine_reorder.hpp
    │   │   ├── direct_copy.hpp
    │   │   ├── ref_concat.hpp
    │   │   ├── ref_sum.hpp
    │   │   └── sycl
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── README.md
    │   │   │   ├── batch_normalizations_kernels.hpp
    │   │   │   ├── binary_kernels.hpp
    │   │   │   ├── convolution_kernels.hpp
    │   │   │   ├── eltwise_kernels.hpp
    │   │   │   ├── engine.cpp
    │   │   │   ├── engine.hpp
    │   │   │   ├── group_normalization_kernel.hpp
    │   │   │   ├── layer_normalizations_kernels.hpp
    │   │   │   ├── lrn_kernels.hpp
    │   │   │   ├── matmul_kernels.hpp
    │   │   │   ├── pooling_kernels.hpp
    │   │   │   ├── prelu_kernels.hpp
    │   │   │   ├── reduction_kernels.hpp
    │   │   │   ├── ref_batch_normalization.cpp
    │   │   │   ├── ref_batch_normalization.hpp
    │   │   │   ├── ref_binary.cpp
    │   │   │   ├── ref_binary.hpp
    │   │   │   ├── ref_convolution.cpp
    │   │   │   ├── ref_convolution.hpp
    │   │   │   ├── ref_deconvolution.cpp
    │   │   │   ├── ref_deconvolution.hpp
    │   │   │   ├── ref_eltwise.cpp
    │   │   │   ├── ref_eltwise.hpp
    │   │   │   ├── ref_group_normalization.cpp
    │   │   │   ├── ref_group_normalization.hpp
    │   │   │   ├── ref_inner_product.cpp
    │   │   │   ├── ref_inner_product.hpp
    │   │   │   ├── ref_layer_normalizations.cpp
    │   │   │   ├── ref_layer_normalizations.hpp
    │   │   │   ├── ref_lrn.cpp
    │   │   │   ├── ref_lrn.hpp
    │   │   │   ├── ref_matmul.cpp
    │   │   │   ├── ref_matmul.hpp
    │   │   │   ├── ref_pooling.cpp
    │   │   │   ├── ref_pooling.hpp
    │   │   │   ├── ref_prelu.cpp
    │   │   │   ├── ref_prelu.hpp
    │   │   │   ├── ref_reduction.cpp
    │   │   │   ├── ref_reduction.hpp
    │   │   │   ├── ref_reorder.cpp
    │   │   │   ├── ref_reorder.hpp
    │   │   │   ├── ref_resampling.cpp
    │   │   │   ├── ref_resampling.hpp
    │   │   │   ├── ref_shuffle.cpp
    │   │   │   ├── ref_shuffle.hpp
    │   │   │   ├── ref_softmax.cpp
    │   │   │   ├── ref_softmax.hpp
    │   │   │   ├── ref_sum.cpp
    │   │   │   ├── ref_sum.hpp
    │   │   │   ├── ref_sum_many_inputs.cpp
    │   │   │   ├── ref_sum_many_inputs.hpp
    │   │   │   ├── reorder_kernels.hpp
    │   │   │   ├── resampling_kernels.hpp
    │   │   │   ├── resampling_utils.hpp
    │   │   │   ├── rnn
    │   │   │       ├── cell_common.cpp
    │   │   │       ├── ref_rnn.cpp
    │   │   │       ├── ref_rnn.hpp
    │   │   │       ├── rnn_kernels.hpp
    │   │   │       ├── rnn_utils.cpp
    │   │   │       └── rnn_utils.hpp
    │   │   │   ├── shuffle_kernels.hpp
    │   │   │   ├── simple_reduction.cpp
    │   │   │   ├── simple_reduction.hpp
    │   │   │   ├── simple_reduction_kernels.hpp
    │   │   │   ├── softmax_kernels.hpp
    │   │   │   ├── stream.cpp
    │   │   │   ├── stream.hpp
    │   │   │   ├── sum_kernels.hpp
    │   │   │   ├── sycl_gpu_kernel.cpp
    │   │   │   ├── sycl_gpu_kernel.hpp
    │   │   │   ├── sycl_gpu_primitive.hpp
    │   │   │   ├── sycl_io_helper.hpp
    │   │   │   ├── sycl_math_utils.hpp
    │   │   │   ├── sycl_post_ops.hpp
    │   │   │   ├── sycl_primitive_conf.hpp
    │   │   │   ├── sycl_q10n.hpp
    │   │   │   └── sycl_utils.hpp
    │   ├── gpu_batch_normalization_list.cpp
    │   ├── gpu_batch_normalization_pd.hpp
    │   ├── gpu_binary_list.cpp
    │   ├── gpu_binary_pd.hpp
    │   ├── gpu_concat_list.cpp
    │   ├── gpu_concat_pd.hpp
    │   ├── gpu_convolution_list.cpp
    │   ├── gpu_convolution_pd.hpp
    │   ├── gpu_deconvolution_list.cpp
    │   ├── gpu_deconvolution_pd.hpp
    │   ├── gpu_eltwise_list.cpp
    │   ├── gpu_eltwise_pd.hpp
    │   ├── gpu_engine.hpp
    │   ├── gpu_gemm_list.cpp
    │   ├── gpu_gemm_pd.hpp
    │   ├── gpu_group_normalization_list.cpp
    │   ├── gpu_impl_list.cpp
    │   ├── gpu_impl_list.hpp
    │   ├── gpu_inner_product_list.cpp
    │   ├── gpu_inner_product_pd.hpp
    │   ├── gpu_layer_normalization_list.cpp
    │   ├── gpu_layer_normalization_pd.hpp
    │   ├── gpu_lrn_list.cpp
    │   ├── gpu_lrn_pd.hpp
    │   ├── gpu_matmul_list.cpp
    │   ├── gpu_matmul_pd.hpp
    │   ├── gpu_pooling_list.cpp
    │   ├── gpu_pooling_pd.hpp
    │   ├── gpu_prelu_list.cpp
    │   ├── gpu_prelu_pd.hpp
    │   ├── gpu_primitive.hpp
    │   ├── gpu_reduction_list.cpp
    │   ├── gpu_reduction_pd.hpp
    │   ├── gpu_reorder_list.cpp
    │   ├── gpu_reorder_pd.cpp
    │   ├── gpu_reorder_pd.hpp
    │   ├── gpu_resampling_list.cpp
    │   ├── gpu_resampling_pd.hpp
    │   ├── gpu_resource.hpp
    │   ├── gpu_rnn_list.cpp
    │   ├── gpu_rnn_pd.hpp
    │   ├── gpu_sdpa_list.cpp
    │   ├── gpu_shuffle_list.cpp
    │   ├── gpu_shuffle_pd.hpp
    │   ├── gpu_softmax_list.cpp
    │   ├── gpu_softmax_pd.hpp
    │   ├── gpu_stream.hpp
    │   ├── gpu_sum_list.cpp
    │   ├── gpu_sum_pd.hpp
    │   ├── gpu_utils.hpp
    │   ├── gpu_zero_pad_list.cpp
    │   ├── gpu_zero_pad_pd.hpp
    │   ├── gpu_zero_points_conv.cpp
    │   ├── gpu_zero_points_conv.hpp
    │   ├── intel
    │   │   ├── CMakeLists.txt
    │   │   ├── block_structure.cpp
    │   │   ├── block_structure.hpp
    │   │   ├── compute
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── block_manipulation.cpp
    │   │   │   ├── block_manipulation.hpp
    │   │   │   ├── compute_engine.cpp
    │   │   │   ├── compute_engine.hpp
    │   │   │   ├── compute_stream.cpp
    │   │   │   ├── compute_stream.hpp
    │   │   │   ├── data_type_converter.hpp
    │   │   │   ├── device_info.cpp
    │   │   │   ├── device_info.hpp
    │   │   │   ├── dispatch.cpp
    │   │   │   ├── dispatch.hpp
    │   │   │   ├── dispatch_reusable.cpp
    │   │   │   ├── dispatch_reusable.hpp
    │   │   │   ├── kernel.hpp
    │   │   │   ├── kernel_arg_list.hpp
    │   │   │   ├── kernel_ctx.cpp
    │   │   │   ├── kernel_ctx.hpp
    │   │   │   ├── utils.hpp
    │   │   │   ├── zero_pool.cpp
    │   │   │   └── zero_pool.hpp
    │   │   ├── config.hpp
    │   │   ├── gemm
    │   │   │   ├── gpu_gemm.hpp
    │   │   │   └── gpu_gemm_exec_types.hpp
    │   │   ├── gpu_post_ops.hpp
    │   │   ├── gpu_primitive.hpp
    │   │   ├── gpu_primitive_attr.hpp
    │   │   ├── jit
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── README.md
    │   │   │   ├── binary_format.cpp
    │   │   │   ├── binary_format.hpp
    │   │   │   ├── codegen
    │   │   │   │   ├── bank_conflict_allocation.cpp
    │   │   │   │   ├── bank_conflict_allocation.hpp
    │   │   │   │   ├── codegen.cpp
    │   │   │   │   ├── codegen.hpp
    │   │   │   │   ├── kernel.hpp
    │   │   │   │   ├── ngen_helpers.hpp
    │   │   │   │   ├── operand.cpp
    │   │   │   │   ├── operand.hpp
    │   │   │   │   ├── reduce.hpp
    │   │   │   │   ├── reg_buf.hpp
    │   │   │   │   ├── register_allocator.hpp
    │   │   │   │   ├── register_scope.hpp
    │   │   │   │   ├── reorder.hpp
    │   │   │   │   └── send.hpp
    │   │   │   ├── config
    │   │   │   │   ├── gemmstone_config.cpp
    │   │   │   │   ├── gemmstone_config.hpp
    │   │   │   │   └── ngen_config.hpp
    │   │   │   ├── conv
    │   │   │   │   ├── README.md
    │   │   │   │   ├── config.cpp
    │   │   │   │   ├── config.hpp
    │   │   │   │   ├── conv_kernel.hpp
    │   │   │   │   ├── gen_convolution.cpp
    │   │   │   │   ├── gen_convolution.hpp
    │   │   │   │   ├── grf_usage.cpp
    │   │   │   │   ├── grf_usage.hpp
    │   │   │   │   ├── ir_builder.cpp
    │   │   │   │   ├── ir_builder.hpp
    │   │   │   │   ├── key.cpp
    │   │   │   │   ├── key.hpp
    │   │   │   │   ├── lookup_table.cpp
    │   │   │   │   ├── lookup_table.hpp
    │   │   │   │   ├── lookup_table_data.cpp
    │   │   │   │   ├── message_patterns.hpp
    │   │   │   │   ├── model.hpp
    │   │   │   │   ├── model_bridge.cpp
    │   │   │   │   ├── model_bridge.hpp
    │   │   │   │   ├── model_data.hpp
    │   │   │   │   ├── model_xehpc_common_data.cpp
    │   │   │   │   ├── model_xehpc_dw_data.cpp
    │   │   │   │   ├── model_xehpg_common_data.cpp
    │   │   │   │   ├── model_xehpg_dw_data.cpp
    │   │   │   │   ├── normalization.cpp
    │   │   │   │   ├── normalization.hpp
    │   │   │   │   ├── pipeline.cpp
    │   │   │   │   ├── pipeline.hpp
    │   │   │   │   ├── plan.cpp
    │   │   │   │   ├── plan.hpp
    │   │   │   │   ├── plan_utils.hpp
    │   │   │   │   ├── problem.cpp
    │   │   │   │   ├── problem.hpp
    │   │   │   │   ├── tiler.cpp
    │   │   │   │   ├── tiler.hpp
    │   │   │   │   ├── zero_out.cpp
    │   │   │   │   ├── zero_out.hpp
    │   │   │   │   ├── zp_plan.cpp
    │   │   │   │   └── zp_plan.hpp
    │   │   │   ├── eltwise_injector.cpp
    │   │   │   ├── eltwise_injector.hpp
    │   │   │   ├── emulated_generator.cpp
    │   │   │   ├── emulated_generator.hpp
    │   │   │   ├── gemm
    │   │   │   │   ├── .clang-tidy
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── gemm_walk_orders.hpp
    │   │   │   │   ├── gen_gemm.cpp
    │   │   │   │   ├── gen_gemm.hpp
    │   │   │   │   ├── gen_gemm_kernel.cpp
    │   │   │   │   ├── gen_gemm_kernel.hpp
    │   │   │   │   ├── gen_gemm_kernel_db.cpp
    │   │   │   │   ├── gen_gemm_kernel_db.hpp
    │   │   │   │   ├── generator
    │   │   │   │   │   ├── .clang-format
    │   │   │   │   │   ├── generator.cpp
    │   │   │   │   │   ├── microkernel_provider.cpp
    │   │   │   │   │   ├── pieces
    │   │   │   │   │   │   ├── address_setup.cxx
    │   │   │   │   │   │   ├── alloc_utils.cpp
    │   │   │   │   │   │   ├── alloc_utils.hpp
    │   │   │   │   │   │   ├── allocators.cpp
    │   │   │   │   │   │   ├── allocators.hpp
    │   │   │   │   │   │   ├── asm_helpers.cxx
    │   │   │   │   │   │   ├── atomic_fusions.cxx
    │   │   │   │   │   │   ├── atomic_fusions.hpp
    │   │   │   │   │   │   ├── c_update.cxx
    │   │   │   │   │   │   ├── common.cxx
    │   │   │   │   │   │   ├── compute_utils.hpp
    │   │   │   │   │   │   ├── cooperative_split.cpp
    │   │   │   │   │   │   ├── cooperative_split.hpp
    │   │   │   │   │   │   ├── copy.cxx
    │   │   │   │   │   │   ├── copy_plan.cpp
    │   │   │   │   │   │   ├── copy_plan.hpp
    │   │   │   │   │   │   ├── driver_info.cxx
    │   │   │   │   │   │   ├── emulation.cxx
    │   │   │   │   │   │   ├── gemm.cxx
    │   │   │   │   │   │   ├── gemm_microkernel.cxx
    │   │   │   │   │   │   ├── gemm_setup.cxx
    │   │   │   │   │   │   ├── grf_multirange.hpp
    │   │   │   │   │   │   ├── hw_template_instantiations.cxx
    │   │   │   │   │   │   ├── hw_utils.hpp
    │   │   │   │   │   │   ├── invert.hpp
    │   │   │   │   │   │   ├── k_loop.cxx
    │   │   │   │   │   │   ├── k_loop_setup.cxx
    │   │   │   │   │   │   ├── kernel_queries.cpp
    │   │   │   │   │   │   ├── kernel_queries.hpp
    │   │   │   │   │   │   ├── l3_prefetch.cxx
    │   │   │   │   │   │   ├── layout_setup.cxx
    │   │   │   │   │   │   ├── layout_utils.cpp
    │   │   │   │   │   │   ├── layout_utils.hpp
    │   │   │   │   │   │   ├── loop_sequencer.cpp
    │   │   │   │   │   │   ├── loop_sequencer.hpp
    │   │   │   │   │   │   ├── map.hpp
    │   │   │   │   │   │   ├── masks.cxx
    │   │   │   │   │   │   ├── math_helpers.cxx
    │   │   │   │   │   │   ├── matrix_access.cxx
    │   │   │   │   │   │   ├── matrix_multiply.cxx
    │   │   │   │   │   │   ├── monolithic_k_loop_dpasw.cxx
    │   │   │   │   │   │   ├── ngen_object_helpers.cpp
    │   │   │   │   │   │   ├── ngen_object_helpers.hpp
    │   │   │   │   │   │   ├── post_ops.cxx
    │   │   │   │   │   │   ├── problem_utils.cpp
    │   │   │   │   │   │   ├── quantization.cpp
    │   │   │   │   │   │   ├── quantization.cxx
    │   │   │   │   │   │   ├── quantization.hpp
    │   │   │   │   │   │   ├── register_allocation.cxx
    │   │   │   │   │   │   ├── register_layout.cpp
    │   │   │   │   │   │   ├── register_layout.hpp
    │   │   │   │   │   │   ├── remask.cpp
    │   │   │   │   │   │   ├── remask.cxx
    │   │   │   │   │   │   ├── remask.hpp
    │   │   │   │   │   │   ├── row_column_sums.cxx
    │   │   │   │   │   │   ├── state.cpp
    │   │   │   │   │   │   ├── state.hpp
    │   │   │   │   │   │   ├── state_debug.cpp
    │   │   │   │   │   │   ├── state_utils.cpp
    │   │   │   │   │   │   ├── state_utils.cxx
    │   │   │   │   │   │   ├── state_utils.hpp
    │   │   │   │   │   │   ├── stream_k.cxx
    │   │   │   │   │   │   ├── tlb_warmup.cxx
    │   │   │   │   │   │   ├── token_alloc_utils.cpp
    │   │   │   │   │   │   ├── token_alloc_utils.hpp
    │   │   │   │   │   │   └── walk_orders.cxx
    │   │   │   │   │   ├── strategy.cpp
    │   │   │   │   │   └── strategy_parser.cpp
    │   │   │   │   ├── include
    │   │   │   │   │   ├── .clang-format
    │   │   │   │   │   ├── gemmstone
    │   │   │   │   │   │   ├── config.hpp
    │   │   │   │   │   │   ├── driver_info.hpp
    │   │   │   │   │   │   ├── generator.hpp
    │   │   │   │   │   │   ├── kernel_catalog.hpp
    │   │   │   │   │   │   ├── kernel_evaluator.hpp
    │   │   │   │   │   │   ├── kernel_selector.hpp
    │   │   │   │   │   │   ├── microkernel_provider.hpp
    │   │   │   │   │   │   ├── problem.hpp
    │   │   │   │   │   │   ├── strategy.hpp
    │   │   │   │   │   │   ├── strategy_parser.hpp
    │   │   │   │   │   │   └── type.hpp
    │   │   │   │   │   └── internal
    │   │   │   │   │   │   ├── generator_inline.hxx
    │   │   │   │   │   │   ├── namespace_end.hxx
    │   │   │   │   │   │   ├── namespace_start.hxx
    │   │   │   │   │   │   ├── ngen_includes.hpp
    │   │   │   │   │   │   └── utils.hpp
    │   │   │   │   ├── jit_gemm_pd.cpp
    │   │   │   │   ├── jit_gemm_pd.hpp
    │   │   │   │   ├── selector
    │   │   │   │   │   ├── .clang-format
    │   │   │   │   │   ├── db
    │   │   │   │   │   │   ├── kernel.db
    │   │   │   │   │   │   ├── ukernel_lmr.db
    │   │   │   │   │   │   ├── ukernel_mlr.db
    │   │   │   │   │   │   └── ukernel_mmr.db
    │   │   │   │   │   ├── kernel_evaluator.cpp
    │   │   │   │   │   └── kernel_selector.cpp
    │   │   │   │   ├── xe_hp_systolic_gemm.cpp
    │   │   │   │   └── xe_hp_systolic_gemm.hpp
    │   │   │   ├── generator.cpp
    │   │   │   ├── generator.hpp
    │   │   │   ├── generator_base.hpp
    │   │   │   ├── ir
    │   │   │   │   ├── README.md
    │   │   │   │   ├── block_2d_utils.hpp
    │   │   │   │   ├── blocking.cpp
    │   │   │   │   ├── blocking.hpp
    │   │   │   │   ├── config.hpp
    │   │   │   │   ├── core.cpp
    │   │   │   │   ├── core.hpp
    │   │   │   │   ├── eltwise.hpp
    │   │   │   │   ├── epilogue.cpp
    │   │   │   │   ├── epilogue.hpp
    │   │   │   │   ├── fma.cpp
    │   │   │   │   ├── fma.hpp
    │   │   │   │   ├── gemm_schedule.cpp
    │   │   │   │   ├── gemm_schedule.hpp
    │   │   │   │   ├── grf_permutation.hpp
    │   │   │   │   ├── hw.hpp
    │   │   │   │   ├── ir.cpp
    │   │   │   │   ├── ir.hpp
    │   │   │   │   ├── ir_builder.cpp
    │   │   │   │   ├── ir_builder.hpp
    │   │   │   │   ├── kernel_desc.hpp
    │   │   │   │   ├── kernel_info.hpp
    │   │   │   │   ├── linear_expr.cpp
    │   │   │   │   ├── linear_expr.hpp
    │   │   │   │   ├── message.cpp
    │   │   │   │   ├── message.hpp
    │   │   │   │   ├── message_patterns.hpp
    │   │   │   │   ├── post_ops.cpp
    │   │   │   │   ├── post_ops.hpp
    │   │   │   │   ├── primitive_plan.cpp
    │   │   │   │   ├── primitive_plan.hpp
    │   │   │   │   ├── problem.cpp
    │   │   │   │   ├── problem.hpp
    │   │   │   │   ├── reduce.cpp
    │   │   │   │   ├── reduce.hpp
    │   │   │   │   ├── reorder.hpp
    │   │   │   │   ├── send_plan.cpp
    │   │   │   │   ├── send_plan.hpp
    │   │   │   │   ├── slm_reduce_builder.cpp
    │   │   │   │   ├── slm_reduce_builder.hpp
    │   │   │   │   ├── tensor.cpp
    │   │   │   │   ├── tensor.hpp
    │   │   │   │   ├── tensor_config.cpp
    │   │   │   │   ├── tensor_config.hpp
    │   │   │   │   └── walk_order.hpp
    │   │   │   ├── pass
    │   │   │   │   ├── alloc.cpp
    │   │   │   │   ├── alloc.hpp
    │   │   │   │   ├── bank_conflict.cpp
    │   │   │   │   ├── bank_conflict.hpp
    │   │   │   │   ├── barrier.cpp
    │   │   │   │   ├── barrier.hpp
    │   │   │   │   ├── cse.cpp
    │   │   │   │   ├── cse.hpp
    │   │   │   │   ├── dp4a.cpp
    │   │   │   │   ├── dp4a.hpp
    │   │   │   │   ├── dpas.cpp
    │   │   │   │   ├── dpas.hpp
    │   │   │   │   ├── dpasw.cpp
    │   │   │   │   ├── dpasw.hpp
    │   │   │   │   ├── expr_scalarizer.hpp
    │   │   │   │   ├── hoist.cpp
    │   │   │   │   ├── hoist.hpp
    │   │   │   │   ├── overflow.cpp
    │   │   │   │   ├── overflow.hpp
    │   │   │   │   ├── pass.cpp
    │   │   │   │   ├── pass.hpp
    │   │   │   │   ├── peephole.cpp
    │   │   │   │   ├── peephole.hpp
    │   │   │   │   ├── send.cpp
    │   │   │   │   ├── send.hpp
    │   │   │   │   ├── shuffle_splitter.cpp
    │   │   │   │   ├── shuffle_splitter.hpp
    │   │   │   │   ├── simplify.cpp
    │   │   │   │   ├── simplify.hpp
    │   │   │   │   ├── slm.cpp
    │   │   │   │   ├── slm.hpp
    │   │   │   │   ├── strength_reduce.cpp
    │   │   │   │   ├── strength_reduce.hpp
    │   │   │   │   ├── unroll.cpp
    │   │   │   │   └── unroll.hpp
    │   │   │   ├── pooling
    │   │   │   │   ├── config.hpp
    │   │   │   │   ├── gen_pooling.cpp
    │   │   │   │   ├── gen_pooling.hpp
    │   │   │   │   ├── ir_builder.cpp
    │   │   │   │   ├── ir_builder.hpp
    │   │   │   │   └── pooling_kernel.hpp
    │   │   │   ├── post_op_injector.cpp
    │   │   │   ├── post_op_injector.hpp
    │   │   │   ├── reduction.cpp
    │   │   │   ├── reduction.hpp
    │   │   │   ├── reduction_generator.hpp
    │   │   │   ├── reduction_injector.cpp
    │   │   │   ├── reduction_injector.hpp
    │   │   │   ├── reorder
    │   │   │   │   ├── config.cpp
    │   │   │   │   ├── config.hpp
    │   │   │   │   ├── gen_reorder.cpp
    │   │   │   │   ├── gen_reorder.hpp
    │   │   │   │   ├── ir_builder.cpp
    │   │   │   │   ├── ir_builder.hpp
    │   │   │   │   ├── normalization.cpp
    │   │   │   │   ├── normalization.hpp
    │   │   │   │   ├── reorder_kernel.hpp
    │   │   │   │   ├── tiler.cpp
    │   │   │   │   └── tiler.hpp
    │   │   │   ├── utils
    │   │   │   │   ├── iterator.hpp
    │   │   │   │   ├── ngen_type_bridge.hpp
    │   │   │   │   ├── range.hpp
    │   │   │   │   ├── trace.cpp
    │   │   │   │   ├── trace.hpp
    │   │   │   │   ├── utils.cpp
    │   │   │   │   └── utils.hpp
    │   │   │   └── v2
    │   │   │   │   ├── conv
    │   │   │   │       ├── README.md
    │   │   │   │       ├── bench_data.cpp
    │   │   │   │       ├── bench_data.hpp
    │   │   │   │       ├── bridge.hpp
    │   │   │   │       ├── builder.cpp
    │   │   │   │       ├── builder.hpp
    │   │   │   │       ├── debug.cpp
    │   │   │   │       ├── debug.hpp
    │   │   │   │       ├── gen_convolution.cpp
    │   │   │   │       ├── gen_convolution.hpp
    │   │   │   │       ├── kernel.hpp
    │   │   │   │       ├── kernel_desc.cpp
    │   │   │   │       ├── kernel_desc.hpp
    │   │   │   │       ├── kernel_desc_2d_reqs.cpp
    │   │   │   │       ├── model.cpp
    │   │   │   │       ├── model.hpp
    │   │   │   │       ├── plan.cpp
    │   │   │   │       ├── plan.hpp
    │   │   │   │       ├── plan_registry.cpp
    │   │   │   │       ├── plan_registry.hpp
    │   │   │   │       ├── plan_registry_data.cpp
    │   │   │   │       ├── planner
    │   │   │   │       │   ├── CMakeLists.txt
    │   │   │   │       │   ├── bench.cpp
    │   │   │   │       │   ├── bench.hpp
    │   │   │   │       │   ├── model_fit.cpp
    │   │   │   │       │   ├── model_fit.hpp
    │   │   │   │       │   ├── planner.cpp
    │   │   │   │       │   ├── planner.hpp
    │   │   │   │       │   ├── planner_main.cpp
    │   │   │   │       │   ├── search.cpp
    │   │   │   │       │   └── search.hpp
    │   │   │   │       ├── problem.cpp
    │   │   │   │       ├── problem.hpp
    │   │   │   │       ├── tensor_utils.cpp
    │   │   │   │       └── tensor_utils.hpp
    │   │   │   │   └── ir
    │   │   │   │       ├── bridge.hpp
    │   │   │   │       ├── builder.cpp
    │   │   │   │       ├── builder.hpp
    │   │   │   │       ├── plan.hpp
    │   │   │   │       ├── plan_utils.hpp
    │   │   │   │       ├── reqs.cpp
    │   │   │   │       ├── reqs.hpp
    │   │   │   │       ├── send.cpp
    │   │   │   │       ├── send.hpp
    │   │   │   │       ├── tensor.cpp
    │   │   │   │       └── tensor.hpp
    │   │   ├── kernel_cache.cpp
    │   │   ├── kernel_cache.hpp
    │   │   ├── logging.hpp
    │   │   ├── microkernels
    │   │   │   ├── .clang-tidy
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── elf.hpp
    │   │   │   ├── entrance_agent.cpp
    │   │   │   ├── entrance_agent.hpp
    │   │   │   ├── fuser.cpp
    │   │   │   ├── fuser.hpp
    │   │   │   ├── internal_utilities.hpp
    │   │   │   ├── package.hpp
    │   │   │   ├── protocol.cpp
    │   │   │   ├── protocol.hpp
    │   │   │   ├── shim.cpp
    │   │   │   └── shim.hpp
    │   │   ├── ocl
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── binary_common.h
    │   │   │   ├── bnorm
    │   │   │   │   ├── lookup_table.cpp
    │   │   │   │   ├── lookup_table.hpp
    │   │   │   │   ├── model.cpp
    │   │   │   │   ├── model.hpp
    │   │   │   │   ├── nhwc_batch_normalization.cpp
    │   │   │   │   ├── nhwc_batch_normalization.hpp
    │   │   │   │   ├── nhwc_reusable.cl
    │   │   │   │   ├── nhwc_reusable.cpp
    │   │   │   │   ├── nhwc_reusable.h
    │   │   │   │   ├── nhwc_reusable.hpp
    │   │   │   │   ├── ref_batch_normalization.cpp
    │   │   │   │   ├── ref_batch_normalization.hpp
    │   │   │   │   ├── ref_bnorm.cl
    │   │   │   │   ├── reusable_bnorm.cl
    │   │   │   │   ├── reusable_bnorm.cpp
    │   │   │   │   ├── reusable_bnorm.hpp
    │   │   │   │   ├── simple_bnorm.cl
    │   │   │   │   ├── simple_bnorm.cpp
    │   │   │   │   ├── simple_bnorm.hpp
    │   │   │   │   ├── utils.cpp
    │   │   │   │   ├── utils.hpp
    │   │   │   │   ├── xe_batch_normalization.cpp
    │   │   │   │   ├── xe_batch_normalization.hpp
    │   │   │   │   ├── xe_bnorm.h
    │   │   │   │   ├── xe_bnorm_bwd.cl
    │   │   │   │   ├── xe_bnorm_fwd.cl
    │   │   │   │   ├── xe_bnorm_nhwc_bwd.cl
    │   │   │   │   ├── xe_bnorm_nhwc_fwd.cl
    │   │   │   │   └── xe_bnorm_reduce.h
    │   │   │   ├── concat_common.h
    │   │   │   ├── concat_utils.hpp
    │   │   │   ├── convolution_deconvolution.hpp
    │   │   │   ├── convolution_inner_product.cpp
    │   │   │   ├── convolution_inner_product.hpp
    │   │   │   ├── custom_reorder.cl
    │   │   │   ├── custom_reorder.cpp
    │   │   │   ├── custom_reorder.hpp
    │   │   │   ├── deconv_backward_bias.cl
    │   │   │   ├── device_info.cpp
    │   │   │   ├── device_info.hpp
    │   │   │   ├── dispatch.h
    │   │   │   ├── engine.cpp
    │   │   │   ├── engine.hpp
    │   │   │   ├── gemm
    │   │   │   │   ├── conv_gemm.hpp
    │   │   │   │   ├── gemm_with_post_ops.cl
    │   │   │   │   ├── gemm_with_post_ops.cpp
    │   │   │   │   ├── gemm_with_post_ops.hpp
    │   │   │   │   ├── ocl_gemm_attrs.h
    │   │   │   │   ├── ref_gemm.cl
    │   │   │   │   ├── ref_gemm.cpp
    │   │   │   │   ├── ref_gemm.hpp
    │   │   │   │   ├── xe_hp_systolic_gemm_copy.cl
    │   │   │   │   ├── xe_hpc_systolic_gemm_copy.cl
    │   │   │   │   └── xe_systolic_gemm_copy_kernel.hpp
    │   │   │   ├── gemm_inner_product.cpp
    │   │   │   ├── gemm_inner_product.hpp
    │   │   │   ├── gemm_matmul.cpp
    │   │   │   ├── gemm_matmul.hpp
    │   │   │   ├── generic_reorder.cl
    │   │   │   ├── generic_reorder.cpp
    │   │   │   ├── generic_reorder.hpp
    │   │   │   ├── graph
    │   │   │   │   └── gen_index.cl
    │   │   │   ├── hw_info.cpp
    │   │   │   ├── hw_info.hpp
    │   │   │   ├── kernel.cpp
    │   │   │   ├── kernel.hpp
    │   │   │   ├── layer_norm_common.h
    │   │   │   ├── lnorm_utils.hpp
    │   │   │   ├── many_inputs_sum.cl
    │   │   │   ├── many_inputs_sum.cpp
    │   │   │   ├── many_inputs_sum.hpp
    │   │   │   ├── mdapi_utils.cpp
    │   │   │   ├── mdapi_utils.hpp
    │   │   │   ├── micro_sdpa.cl
    │   │   │   ├── micro_sdpa.cpp
    │   │   │   ├── micro_sdpa.hpp
    │   │   │   ├── micro_sdpa_configs.cpp
    │   │   │   ├── micro_sdpa_configs.hpp
    │   │   │   ├── multi_concat.hpp
    │   │   │   ├── multi_po_reorder_binary.hpp
    │   │   │   ├── multi_po_reorder_sum.hpp
    │   │   │   ├── ocl_conversion.h
    │   │   │   ├── ocl_custom_types.h
    │   │   │   ├── ocl_eltwise.h
    │   │   │   ├── ocl_generic_vector_ops.h
    │   │   │   ├── ocl_io.h
    │   │   │   ├── ocl_kernel_list.cpp.in
    │   │   │   ├── ocl_math_utils.h
    │   │   │   ├── ocl_overrides.md
    │   │   │   ├── ocl_philox.h
    │   │   │   ├── ocl_post_ops.h
    │   │   │   ├── ocl_scales.h
    │   │   │   ├── ocl_types.h
    │   │   │   ├── ocl_types_specific.h
    │   │   │   ├── ocl_utils.h
    │   │   │   ├── offsets.h
    │   │   │   ├── reduction
    │   │   │   │   ├── atomic_reduction.cl
    │   │   │   │   ├── atomic_reduction.cpp
    │   │   │   │   ├── atomic_reduction.hpp
    │   │   │   │   ├── combined_reduction.cl
    │   │   │   │   ├── combined_reduction.cpp
    │   │   │   │   ├── combined_reduction.hpp
    │   │   │   │   ├── ocl_reduction.h
    │   │   │   │   ├── ref_reduction.cl
    │   │   │   │   ├── ref_reduction.cpp
    │   │   │   │   ├── ref_reduction.hpp
    │   │   │   │   ├── reusable_ref_reduction.cl
    │   │   │   │   ├── reusable_ref_reduction.cpp
    │   │   │   │   ├── reusable_ref_reduction.hpp
    │   │   │   │   ├── utils.cpp
    │   │   │   │   └── utils.hpp
    │   │   │   ├── ref_convolution.cl
    │   │   │   ├── ref_convolution.cpp
    │   │   │   ├── ref_convolution.hpp
    │   │   │   ├── ref_eltwise.cl
    │   │   │   ├── ref_eltwise.cpp
    │   │   │   ├── ref_eltwise.hpp
    │   │   │   ├── ref_group_normalization.cl
    │   │   │   ├── ref_group_normalization.cpp
    │   │   │   ├── ref_group_normalization.hpp
    │   │   │   ├── ref_inner_product.cl
    │   │   │   ├── ref_inner_product.cpp
    │   │   │   ├── ref_inner_product.hpp
    │   │   │   ├── ref_layer_normalization.cl
    │   │   │   ├── ref_layer_normalization.cpp
    │   │   │   ├── ref_layer_normalization.hpp
    │   │   │   ├── ref_lrn.cl
    │   │   │   ├── ref_lrn.cpp
    │   │   │   ├── ref_lrn.hpp
    │   │   │   ├── ref_matmul.cl
    │   │   │   ├── ref_matmul.cpp
    │   │   │   ├── ref_matmul.hpp
    │   │   │   ├── ref_pooling.cl
    │   │   │   ├── ref_pooling.cpp
    │   │   │   ├── ref_pooling.hpp
    │   │   │   ├── ref_prelu.cl
    │   │   │   ├── ref_prelu.cpp
    │   │   │   ├── ref_prelu.hpp
    │   │   │   ├── ref_reorder.cl
    │   │   │   ├── ref_reorder.cpp
    │   │   │   ├── ref_reorder.hpp
    │   │   │   ├── ref_resampling.cl
    │   │   │   ├── ref_resampling.cpp
    │   │   │   ├── ref_resampling.hpp
    │   │   │   ├── ref_sdpa.cl
    │   │   │   ├── ref_sdpa.cpp
    │   │   │   ├── ref_sdpa.hpp
    │   │   │   ├── ref_shuffle.cl
    │   │   │   ├── ref_shuffle.cpp
    │   │   │   ├── ref_shuffle.hpp
    │   │   │   ├── ref_sparse_matmul.cl
    │   │   │   ├── ref_sparse_matmul.cpp
    │   │   │   ├── ref_sparse_matmul.hpp
    │   │   │   ├── reorder_common.h
    │   │   │   ├── reusable_lnorm.cl
    │   │   │   ├── reusable_lnorm.cpp
    │   │   │   ├── reusable_lnorm.hpp
    │   │   │   ├── reusable_simple_concat.cl
    │   │   │   ├── reusable_simple_concat.cpp
    │   │   │   ├── reusable_simple_concat.hpp
    │   │   │   ├── reusable_softmax.cl
    │   │   │   ├── reusable_softmax.cpp
    │   │   │   ├── reusable_softmax.hpp
    │   │   │   ├── reusable_vectorized_lnorm.cl
    │   │   │   ├── reusable_vectorized_lnorm.cpp
    │   │   │   ├── reusable_vectorized_lnorm.hpp
    │   │   │   ├── rnn
    │   │   │   │   ├── cell_common.cpp
    │   │   │   │   ├── cell_compute.h
    │   │   │   │   ├── cell_gru.cpp
    │   │   │   │   ├── cell_gru_lbr.cpp
    │   │   │   │   ├── cell_kind_utility.h
    │   │   │   │   ├── common.h
    │   │   │   │   ├── grid.cl
    │   │   │   │   ├── grid.cpp
    │   │   │   │   ├── grid.hpp
    │   │   │   │   ├── reorders.cpp
    │   │   │   │   ├── reorders.hpp
    │   │   │   │   ├── rnn_reorder.cl
    │   │   │   │   ├── simple_cell_fusion.cpp
    │   │   │   │   ├── simple_cell_fusion.hpp
    │   │   │   │   ├── simple_postgemm.cpp
    │   │   │   │   ├── utils.cpp
    │   │   │   │   └── utils.hpp
    │   │   │   ├── sdpa_utils.h
    │   │   │   ├── shuffle_by_reorder.hpp
    │   │   │   ├── simple_binary.cl
    │   │   │   ├── simple_binary.cpp
    │   │   │   ├── simple_binary.hpp
    │   │   │   ├── simple_layer_normalization.cl
    │   │   │   ├── simple_layer_normalization.cpp
    │   │   │   ├── simple_layer_normalization.hpp
    │   │   │   ├── simple_softmax.cl
    │   │   │   ├── simple_softmax.cpp
    │   │   │   ├── simple_softmax.h
    │   │   │   ├── simple_softmax.hpp
    │   │   │   ├── simple_sum.cl
    │   │   │   ├── simple_sum.cpp
    │   │   │   ├── simple_sum.hpp
    │   │   │   ├── simple_zero_pad.cl
    │   │   │   ├── simple_zero_pad.cpp
    │   │   │   ├── simple_zero_pad.hpp
    │   │   │   ├── stream.cpp
    │   │   │   ├── stream.hpp
    │   │   │   ├── subbyte_pack.cl
    │   │   │   ├── tile_ops.h
    │   │   │   ├── types_interop.h
    │   │   │   ├── types_interop.hpp
    │   │   │   ├── usm_utils.cpp
    │   │   │   ├── usm_utils.hpp
    │   │   │   ├── utils.cpp
    │   │   │   ├── utils.hpp
    │   │   │   ├── vectorized_lnorm.cl
    │   │   │   ├── vectorized_lnorm.cpp
    │   │   │   ├── vectorized_lnorm.hpp
    │   │   │   ├── vectorized_lnorm_fused.cl
    │   │   │   ├── vectorized_resampling.cl
    │   │   │   ├── vectorized_resampling.cpp
    │   │   │   ├── vectorized_resampling.hpp
    │   │   │   ├── xe_binary.cl
    │   │   │   ├── xe_binary.cpp
    │   │   │   ├── xe_binary.hpp
    │   │   │   ├── xe_concat.cl
    │   │   │   ├── xe_concat.cpp
    │   │   │   ├── xe_concat.hpp
    │   │   │   ├── xe_eltwise.cl
    │   │   │   ├── xe_eltwise.cpp
    │   │   │   ├── xe_eltwise.hpp
    │   │   │   ├── xe_global_pooling.cl
    │   │   │   ├── xe_global_pooling.cpp
    │   │   │   ├── xe_global_pooling.hpp
    │   │   │   ├── xe_pooling.cl
    │   │   │   ├── xe_pooling.cpp
    │   │   │   ├── xe_pooling.hpp
    │   │   │   ├── xe_softmax.cl
    │   │   │   ├── xe_softmax.cpp
    │   │   │   ├── xe_softmax.hpp
    │   │   │   ├── xe_sum.cl
    │   │   │   ├── xe_sum.cpp
    │   │   │   ├── xe_sum.hpp
    │   │   │   ├── xe_wino_conv_fwd_data_2x3.cl
    │   │   │   ├── xe_wino_conv_fwd_data_fused.cl
    │   │   │   ├── xe_wino_convolution.cpp
    │   │   │   └── xe_wino_convolution.hpp
    │   │   ├── primitive_conf.cpp
    │   │   ├── primitive_conf.hpp
    │   │   ├── sycl
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── compat.cpp
    │   │   │   ├── compat.hpp
    │   │   │   ├── device_info.cpp
    │   │   │   ├── device_info.hpp
    │   │   │   ├── engine.cpp
    │   │   │   ├── engine.hpp
    │   │   │   ├── interop_kernel.cpp
    │   │   │   ├── interop_kernel.hpp
    │   │   │   ├── l0
    │   │   │   │   ├── utils.cpp
    │   │   │   │   └── utils.hpp
    │   │   │   ├── stream.cpp
    │   │   │   ├── stream.hpp
    │   │   │   ├── utils.cpp
    │   │   │   └── utils.hpp
    │   │   ├── utils.cpp
    │   │   └── utils.hpp
    │   └── nvidia
    │   │   ├── CMakeLists.txt
    │   │   ├── README.md
    │   │   ├── cudnn_batch_normalization.cpp
    │   │   ├── cudnn_batch_normalization.hpp
    │   │   ├── cudnn_batch_normalization_executor.hpp
    │   │   ├── cudnn_batch_normalization_impl.hpp
    │   │   ├── cudnn_binary.cpp
    │   │   ├── cudnn_binary.hpp
    │   │   ├── cudnn_binary_impl.hpp
    │   │   ├── cudnn_conv_filter_adjustment_base.hpp
    │   │   ├── cudnn_conv_inner_product.hpp
    │   │   ├── cudnn_conv_inner_product_impl.hpp
    │   │   ├── cudnn_convolution.cpp
    │   │   ├── cudnn_convolution.hpp
    │   │   ├── cudnn_convolution_impl.hpp
    │   │   ├── cudnn_convolution_pd.hpp
    │   │   ├── cudnn_deconvolution.cpp
    │   │   ├── cudnn_deconvolution.hpp
    │   │   ├── cudnn_deconvolution_impl.hpp
    │   │   ├── cudnn_eltwise.cpp
    │   │   ├── cudnn_eltwise.hpp
    │   │   ├── cudnn_eltwise_impl.hpp
    │   │   ├── cudnn_gemm_inner_product.hpp
    │   │   ├── cudnn_gemm_inner_product_impl.hpp
    │   │   ├── cudnn_inner_product.cpp
    │   │   ├── cudnn_inner_product.hpp
    │   │   ├── cudnn_inner_product_impl.hpp
    │   │   ├── cudnn_lrn.cpp
    │   │   ├── cudnn_lrn.hpp
    │   │   ├── cudnn_lrn_impl.hpp
    │   │   ├── cudnn_matmul.cpp
    │   │   ├── cudnn_matmul.hpp
    │   │   ├── cudnn_matmul_base_impl.hpp
    │   │   ├── cudnn_matmul_executor.hpp
    │   │   ├── cudnn_matmul_impl.hpp
    │   │   ├── cudnn_matmul_lt.hpp
    │   │   ├── cudnn_matmul_lt_impl.hpp
    │   │   ├── cudnn_pooling.cpp
    │   │   ├── cudnn_pooling.hpp
    │   │   ├── cudnn_pooling_impl.hpp
    │   │   ├── cudnn_reduction.cpp
    │   │   ├── cudnn_reduction.hpp
    │   │   ├── cudnn_reduction_impl.hpp
    │   │   ├── cudnn_reorder.cpp
    │   │   ├── cudnn_reorder.hpp
    │   │   ├── cudnn_reorder_impl.hpp
    │   │   ├── cudnn_reorder_lt.cpp
    │   │   ├── cudnn_reorder_lt.hpp
    │   │   ├── cudnn_reorder_lt_impl.hpp
    │   │   ├── cudnn_softmax.cpp
    │   │   ├── cudnn_softmax.hpp
    │   │   ├── cudnn_softmax_impl.hpp
    │   │   ├── cudnn_sum.hpp
    │   │   ├── engine.cpp
    │   │   ├── engine.hpp
    │   │   ├── stream.cpp
    │   │   ├── stream.hpp
    │   │   ├── sycl_cuda_compat.cpp
    │   │   ├── sycl_cuda_compat.hpp
    │   │   ├── sycl_cuda_scoped_context.cpp
    │   │   ├── sycl_cuda_scoped_context.hpp
    │   │   ├── sycl_cuda_stream_utils.hpp
    │   │   ├── sycl_cuda_utils.cpp
    │   │   └── sycl_cuda_utils.hpp
    ├── graph
    │   ├── CMakeLists.txt
    │   ├── backend
    │   │   ├── CMakeLists.txt
    │   │   ├── dnnl
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── common.cpp
    │   │   │   ├── common.hpp
    │   │   │   ├── dnnl_backend.cpp
    │   │   │   ├── dnnl_backend.hpp
    │   │   │   ├── dnnl_constant_tensor_cache.hpp
    │   │   │   ├── dnnl_op_def.hpp
    │   │   │   ├── dnnl_opset.hpp
    │   │   │   ├── dnnl_partition_impl.cpp
    │   │   │   ├── dnnl_partition_impl.hpp
    │   │   │   ├── dnnl_shape_infer.cpp
    │   │   │   ├── dnnl_shape_infer.hpp
    │   │   │   ├── fusion_info.cpp
    │   │   │   ├── fusion_info.hpp
    │   │   │   ├── internal_attrs.hpp
    │   │   │   ├── internal_ops.hpp
    │   │   │   ├── kernels
    │   │   │   │   ├── batch_norm.cpp
    │   │   │   │   ├── batch_norm.hpp
    │   │   │   │   ├── binary.cpp
    │   │   │   │   ├── binary.hpp
    │   │   │   │   ├── concat.cpp
    │   │   │   │   ├── concat.hpp
    │   │   │   │   ├── conv.cpp
    │   │   │   │   ├── conv.hpp
    │   │   │   │   ├── conv_base.cpp
    │   │   │   │   ├── conv_base.hpp
    │   │   │   │   ├── conv_transpose.cpp
    │   │   │   │   ├── conv_transpose.hpp
    │   │   │   │   ├── dummy.cpp
    │   │   │   │   ├── dummy.hpp
    │   │   │   │   ├── eltwise.cpp
    │   │   │   │   ├── eltwise.hpp
    │   │   │   │   ├── gen_index.cpp
    │   │   │   │   ├── gen_index.hpp
    │   │   │   │   ├── group_norm.cpp
    │   │   │   │   ├── group_norm.hpp
    │   │   │   │   ├── kernel_base.cpp
    │   │   │   │   ├── kernel_base.hpp
    │   │   │   │   ├── kernels.hpp
    │   │   │   │   ├── large_partition.cpp
    │   │   │   │   ├── large_partition.hpp
    │   │   │   │   ├── layer_norm.cpp
    │   │   │   │   ├── layer_norm.hpp
    │   │   │   │   ├── log_softmax.cpp
    │   │   │   │   ├── log_softmax.hpp
    │   │   │   │   ├── matmul.cpp
    │   │   │   │   ├── matmul.hpp
    │   │   │   │   ├── mqa.hpp
    │   │   │   │   ├── mqa_decomp.cpp
    │   │   │   │   ├── mqa_decomp.hpp
    │   │   │   │   ├── mqa_decomp_config.cpp
    │   │   │   │   ├── mqa_decomp_config.hpp
    │   │   │   │   ├── pool.cpp
    │   │   │   │   ├── pool.hpp
    │   │   │   │   ├── prelu.cpp
    │   │   │   │   ├── prelu.hpp
    │   │   │   │   ├── quantize.cpp
    │   │   │   │   ├── quantize.hpp
    │   │   │   │   ├── reduction.cpp
    │   │   │   │   ├── reduction.hpp
    │   │   │   │   ├── reorder.cpp
    │   │   │   │   ├── reorder.hpp
    │   │   │   │   ├── resampling.cpp
    │   │   │   │   ├── resampling.hpp
    │   │   │   │   ├── sdp.hpp
    │   │   │   │   ├── sdp_decomp.cpp
    │   │   │   │   ├── sdp_decomp.hpp
    │   │   │   │   ├── sdp_decomp_config.cpp
    │   │   │   │   ├── sdp_decomp_config.hpp
    │   │   │   │   ├── sdp_primitive.cpp
    │   │   │   │   ├── sdp_primitive.hpp
    │   │   │   │   ├── sdp_primitive_config.cpp
    │   │   │   │   ├── sdp_primitive_config.hpp
    │   │   │   │   ├── sdp_primitive_v1.cpp
    │   │   │   │   ├── sdp_primitive_v1.hpp
    │   │   │   │   ├── shuffle.cpp
    │   │   │   │   ├── shuffle.hpp
    │   │   │   │   ├── softmax.cpp
    │   │   │   │   ├── softmax.hpp
    │   │   │   │   ├── sum.cpp
    │   │   │   │   └── sum.hpp
    │   │   │   ├── layout_id_mgr.cpp
    │   │   │   ├── layout_id_mgr.hpp
    │   │   │   ├── layout_propagator.cpp
    │   │   │   ├── layout_propagator.hpp
    │   │   │   ├── op_executable.cpp
    │   │   │   ├── op_executable.hpp
    │   │   │   ├── passes
    │   │   │   │   ├── compile_ops.cpp
    │   │   │   │   ├── compile_ops.hpp
    │   │   │   │   ├── constant_propagation.cpp
    │   │   │   │   ├── constant_propagation.hpp
    │   │   │   │   ├── insert_ops.cpp
    │   │   │   │   ├── insert_ops.hpp
    │   │   │   │   ├── layout_propagation.cpp
    │   │   │   │   ├── layout_propagation.hpp
    │   │   │   │   ├── lower.cpp
    │   │   │   │   ├── lower.hpp
    │   │   │   │   ├── memory_planning.cpp
    │   │   │   │   ├── memory_planning.hpp
    │   │   │   │   ├── transform.cpp
    │   │   │   │   ├── transform.hpp
    │   │   │   │   ├── utils.cpp
    │   │   │   │   └── utils.hpp
    │   │   │   ├── patterns
    │   │   │   │   ├── binary_fusion.cpp
    │   │   │   │   ├── bn_fusion.cpp
    │   │   │   │   ├── concat_fusion.cpp
    │   │   │   │   ├── conv_block_fusion.cpp
    │   │   │   │   ├── conv_post_ops.cpp
    │   │   │   │   ├── convtranspose_fusion.cpp
    │   │   │   │   ├── data_type_check_pass.hpp
    │   │   │   │   ├── eltwise_fusion.cpp
    │   │   │   │   ├── fusions.hpp
    │   │   │   │   ├── groupnorm_fusion.cpp
    │   │   │   │   ├── interpolate_fusion.cpp
    │   │   │   │   ├── layernorm_fusion.cpp
    │   │   │   │   ├── matmul_post_ops.cpp
    │   │   │   │   ├── mlp.cpp
    │   │   │   │   ├── pattern_matcher_pass.hpp
    │   │   │   │   ├── pool_post_ops.cpp
    │   │   │   │   ├── quantize_fusion.cpp
    │   │   │   │   ├── reduction_fusion.cpp
    │   │   │   │   ├── reorder_fusion.cpp
    │   │   │   │   ├── sdp.cpp
    │   │   │   │   ├── shuffle_fusion.cpp
    │   │   │   │   ├── single_op_pattern.cpp
    │   │   │   │   ├── softmax_post_ops.cpp
    │   │   │   │   ├── sum_fusion.cpp
    │   │   │   │   └── utils.hpp
    │   │   │   ├── platform.cpp
    │   │   │   ├── platform.hpp
    │   │   │   ├── scratchpad.hpp
    │   │   │   ├── subgraph.cpp
    │   │   │   ├── subgraph.hpp
    │   │   │   ├── thread_local_cache.hpp
    │   │   │   └── utils.hpp
    │   │   └── fake
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── fake_backend.cpp
    │   │   │   ├── fake_backend.hpp
    │   │   │   ├── fake_partition_impl.hpp
    │   │   │   ├── pattern_utils.hpp
    │   │   │   ├── single_op_pass.hpp
    │   │   │   └── transformation_pass.hpp
    │   ├── interface
    │   │   ├── CMakeLists.txt
    │   │   ├── allocator.cpp
    │   │   ├── allocator.hpp
    │   │   ├── backend.cpp
    │   │   ├── backend.hpp
    │   │   ├── c_types_map.hpp
    │   │   ├── constant_tensor_cache.cpp
    │   │   ├── constant_tensor_cache.hpp
    │   │   ├── graph.cpp
    │   │   ├── graph.hpp
    │   │   ├── graph_attr.hpp
    │   │   ├── logical_tensor.cpp
    │   │   ├── logical_tensor.hpp
    │   │   ├── op.cpp
    │   │   ├── op.hpp
    │   │   ├── op_def.hpp
    │   │   ├── op_def_constraint.cpp
    │   │   ├── op_def_constraint.hpp
    │   │   ├── op_schema.cpp
    │   │   ├── op_schema.hpp
    │   │   ├── opset.hpp
    │   │   ├── partition.cpp
    │   │   ├── partition.hpp
    │   │   ├── partition_cache.cpp
    │   │   ├── partition_cache.hpp
    │   │   ├── partition_hashing.cpp
    │   │   ├── partition_hashing.hpp
    │   │   ├── partition_impl.cpp
    │   │   ├── partition_impl.hpp
    │   │   ├── shape_infer.cpp
    │   │   ├── shape_infer.hpp
    │   │   ├── tensor.cpp
    │   │   ├── tensor.hpp
    │   │   ├── value.cpp
    │   │   └── value.hpp
    │   └── utils
    │   │   ├── CMakeLists.txt
    │   │   ├── alloc.cpp
    │   │   ├── alloc.hpp
    │   │   ├── any.hpp
    │   │   ├── attribute_value.hpp
    │   │   ├── debug.cpp
    │   │   ├── debug.hpp
    │   │   ├── id.cpp
    │   │   ├── id.hpp
    │   │   ├── json.hpp
    │   │   ├── ocl_check.hpp
    │   │   ├── ocl_usm_utils.cpp
    │   │   ├── ocl_usm_utils.hpp
    │   │   ├── pm
    │   │       ├── dag_check_pass.hpp
    │   │       ├── nested_matcher.cpp
    │   │       ├── nested_matcher.hpp
    │   │       ├── op_depth_check_pass.hpp
    │   │       ├── pass_base.cpp
    │   │       ├── pass_base.hpp
    │   │       ├── pass_manager.cpp
    │   │       ├── pass_manager.hpp
    │   │       ├── pbuilder.cpp
    │   │       └── pbuilder.hpp
    │   │   ├── utils.cpp
    │   │   ├── utils.hpp
    │   │   ├── verbose.cpp
    │   │   └── verbose.hpp
    └── xpu
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── context.hpp
    │   ├── ocl
    │       ├── CMakeLists.txt
    │       ├── buffer_memory_storage.cpp
    │       ├── buffer_memory_storage.hpp
    │       ├── c_types_map.hpp
    │       ├── capi
    │       │   ├── engine.cpp
    │       │   ├── memory.cpp
    │       │   ├── primitive.cpp
    │       │   └── stream.cpp
    │       ├── context.hpp
    │       ├── engine_factory.hpp
    │       ├── engine_id.hpp
    │       ├── engine_impl.cpp
    │       ├── engine_impl.hpp
    │       ├── memory_storage.hpp
    │       ├── memory_storage_base.hpp
    │       ├── stream_impl.cpp
    │       ├── stream_impl.hpp
    │       ├── stream_profiler.cpp
    │       ├── stream_profiler.hpp
    │       ├── usm_memory_storage.cpp
    │       ├── usm_memory_storage.hpp
    │       ├── usm_utils.cpp
    │       ├── usm_utils.hpp
    │       ├── utils.cpp
    │       ├── utils.hpp
    │       └── verbose.hpp
    │   ├── stream_profiler.hpp
    │   ├── sycl
    │       ├── CMakeLists.txt
    │       ├── buffer_memory_storage.cpp
    │       ├── buffer_memory_storage.hpp
    │       ├── c_types_map.hpp
    │       ├── capi
    │       │   ├── capi_engine.cpp
    │       │   ├── capi_memory.cpp
    │       │   ├── capi_primitive.cpp
    │       │   └── capi_stream.cpp
    │       ├── compat.cpp
    │       ├── compat.hpp
    │       ├── context.hpp
    │       ├── engine_factory.cpp
    │       ├── engine_factory.hpp
    │       ├── engine_id.hpp
    │       ├── engine_impl.cpp
    │       ├── engine_impl.hpp
    │       ├── memory_storage.hpp
    │       ├── memory_storage_base.cpp
    │       ├── memory_storage_base.hpp
    │       ├── memory_storage_helper.hpp
    │       ├── stream_impl.cpp
    │       ├── stream_impl.hpp
    │       ├── stream_profiler.cpp
    │       ├── stream_profiler.hpp
    │       ├── types.hpp
    │       ├── usm_memory_storage.cpp
    │       ├── usm_memory_storage.hpp
    │       ├── utils.cpp
    │       ├── utils.hpp
    │       └── verbose.hpp
    │   ├── utils.cpp
    │   └── utils.hpp
├── tests
    ├── CMakeLists.txt
    ├── api.c
    ├── benchdnn
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── benchdnn.cpp
    │   ├── binary
    │   │   ├── bench_binary.cpp
    │   │   ├── binary.cpp
    │   │   ├── binary.hpp
    │   │   ├── binary_aux.cpp
    │   │   └── ref_binary.cpp
    │   ├── bnorm
    │   │   ├── bench_bnorm.cpp
    │   │   ├── bnorm.cpp
    │   │   ├── bnorm.hpp
    │   │   ├── bnorm_aux.cpp
    │   │   └── ref_bnorm.cpp
    │   ├── brgemm
    │   │   ├── bench_brgemm.cpp
    │   │   ├── brgemm.cpp
    │   │   ├── brgemm.hpp
    │   │   ├── brgemm_aux.cpp
    │   │   ├── cfg.cpp
    │   │   └── ref_brgemm.cpp
    │   ├── common.cpp
    │   ├── common.hpp
    │   ├── concat
    │   │   ├── bench_concat.cpp
    │   │   ├── concat.cpp
    │   │   ├── concat.hpp
    │   │   ├── concat_aux.cpp
    │   │   └── ref_concat.cpp
    │   ├── conv
    │   │   ├── bench_conv.cpp
    │   │   ├── cfg.cpp
    │   │   ├── conv.cpp
    │   │   ├── conv.hpp
    │   │   ├── conv_aux.cpp
    │   │   ├── conv_dw_fusion.cpp
    │   │   ├── conv_dw_fusion.hpp
    │   │   ├── ref_conv.cpp
    │   │   ├── ref_conv.hpp
    │   │   └── ref_wino.cpp
    │   ├── deconv
    │   │   ├── bench_deconv.cpp
    │   │   ├── cfg.cpp
    │   │   ├── deconv.cpp
    │   │   ├── deconv.hpp
    │   │   ├── deconv_aux.cpp
    │   │   ├── ref_deconv.cpp
    │   │   ├── ref_deconv.hpp
    │   │   └── ref_wino.cpp
    │   ├── dnn_types.cpp
    │   ├── dnn_types.hpp
    │   ├── dnnl_common.cpp
    │   ├── dnnl_common.hpp
    │   ├── dnnl_debug.hpp
    │   ├── dnnl_debug_autogenerated.cpp
    │   ├── dnnl_memory.cpp
    │   ├── dnnl_memory.hpp
    │   ├── doc
    │   │   ├── benchdnn_general_info.md
    │   │   ├── benchdnn_input_files_naming_convention.md
    │   │   ├── driver_binary.md
    │   │   ├── driver_bnorm.md
    │   │   ├── driver_brgemm.md
    │   │   ├── driver_concat.md
    │   │   ├── driver_conv.md
    │   │   ├── driver_eltwise.md
    │   │   ├── driver_gnorm.md
    │   │   ├── driver_graph.md
    │   │   ├── driver_ip.md
    │   │   ├── driver_lnorm.md
    │   │   ├── driver_lrn.md
    │   │   ├── driver_matmul.md
    │   │   ├── driver_pool.md
    │   │   ├── driver_prelu.md
    │   │   ├── driver_reduction.md
    │   │   ├── driver_reorder.md
    │   │   ├── driver_resampling.md
    │   │   ├── driver_rnn.md
    │   │   ├── driver_shuffle.md
    │   │   ├── driver_softmax.md
    │   │   ├── driver_sum.md
    │   │   ├── driver_zeropad.md
    │   │   ├── knob_cold_cache.md
    │   │   ├── knob_impl_filter.md
    │   │   ├── knob_strides.md
    │   │   ├── knob_summary.md
    │   │   ├── knob_use_fast_ref.md
    │   │   ├── knobs_attr.md
    │   │   ├── knobs_common.md
    │   │   ├── knobs_desc.md
    │   │   ├── knobs_dir.md
    │   │   ├── knobs_dt.md
    │   │   ├── knobs_encoding.md
    │   │   ├── knobs_perf_report.md
    │   │   ├── knobs_tag.md
    │   │   └── knobs_verbose.md
    │   ├── eltwise
    │   │   ├── bench_eltwise.cpp
    │   │   ├── eltwise.cpp
    │   │   ├── eltwise.hpp
    │   │   ├── eltwise_aux.cpp
    │   │   └── ref_eltwise.cpp
    │   ├── gnorm
    │   │   ├── bench_gnorm.cpp
    │   │   ├── gnorm.cpp
    │   │   ├── gnorm.hpp
    │   │   ├── gnorm_aux.cpp
    │   │   └── ref_gnorm.cpp
    │   ├── graph
    │   │   ├── allocator.cpp
    │   │   ├── allocator.hpp
    │   │   ├── bench_graph.cpp
    │   │   ├── custom_driver.cpp
    │   │   ├── custom_driver.hpp
    │   │   ├── deserialize.cpp
    │   │   ├── deserialize.hpp
    │   │   ├── flex_rewrite.cpp
    │   │   ├── flex_rewrite.hpp
    │   │   ├── graph.cpp
    │   │   ├── graph.hpp
    │   │   ├── graph_memory.cpp
    │   │   ├── graph_memory.hpp
    │   │   ├── input_displacer.cpp
    │   │   ├── input_displacer.hpp
    │   │   ├── memory_pool.hpp
    │   │   ├── parser.cpp
    │   │   ├── parser.hpp
    │   │   ├── ref_partition.cpp
    │   │   ├── ref_partition.hpp
    │   │   ├── ref_primitive.cpp
    │   │   ├── ref_primitive.hpp
    │   │   ├── setting_handler.cpp
    │   │   ├── setting_handler.hpp
    │   │   ├── utils.cpp
    │   │   └── utils.hpp
    │   ├── inputs
    │   │   ├── binary
    │   │   │   ├── harness_binary_bf16
    │   │   │   ├── harness_binary_different_dt
    │   │   │   ├── harness_binary_f16
    │   │   │   ├── harness_binary_f32
    │   │   │   ├── harness_binary_i8
    │   │   │   ├── harness_binary_regression
    │   │   │   ├── option_set_all
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── option_set_minimal
    │   │   │   ├── option_set_src0_bcast
    │   │   │   ├── perf_binary_gpu
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_perf_1st_conv
    │   │   │   ├── shapes_perf_scaleshift
    │   │   │   ├── test_binary_all
    │   │   │   ├── test_binary_bfloat16
    │   │   │   ├── test_binary_ci
    │   │   │   ├── test_binary_different_dt_ci
    │   │   │   ├── test_binary_float16
    │   │   │   ├── test_binary_gpu
    │   │   │   └── test_binary_smoke
    │   │   ├── bnorm
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── perf_bnorm_gpu
    │   │   │   ├── set_nd
    │   │   │   ├── shapes_1d
    │   │   │   ├── shapes_2d
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_densenet_121
    │   │   │   ├── shapes_googlenet_v2
    │   │   │   ├── shapes_googlenet_v3
    │   │   │   ├── shapes_large
    │   │   │   ├── shapes_regressions
    │   │   │   ├── shapes_resnet_50
    │   │   │   ├── shapes_topologies_small
    │   │   │   ├── test_bnorm_all_blocked
    │   │   │   ├── test_bnorm_all_plain
    │   │   │   ├── test_bnorm_bfloat16_blocked
    │   │   │   ├── test_bnorm_bfloat16_plain
    │   │   │   ├── test_bnorm_ci
    │   │   │   ├── test_bnorm_float16_plain
    │   │   │   ├── test_bnorm_gpu
    │   │   │   ├── test_bnorm_regressions
    │   │   │   ├── test_bnorm_regressions_large
    │   │   │   └── test_bnorm_smoke
    │   │   ├── brgemm
    │   │   │   ├── harness_brgemm_f32
    │   │   │   ├── harness_brgemm_f8
    │   │   │   ├── harness_brgemm_fpmath
    │   │   │   ├── option_set_bf16
    │   │   │   ├── option_set_f32
    │   │   │   ├── option_set_int8
    │   │   │   ├── shapes_2d_big_k_bf16
    │   │   │   ├── shapes_2d_big_k_f32
    │   │   │   ├── shapes_2d_big_k_int8
    │   │   │   ├── shapes_2d_big_k_tail_n_bf16
    │   │   │   ├── shapes_2d_big_k_tail_n_f32
    │   │   │   ├── shapes_2d_big_k_tail_n_int8
    │   │   │   ├── shapes_2d_no_tail_bf16
    │   │   │   ├── shapes_2d_no_tail_f32
    │   │   │   ├── shapes_2d_no_tail_int8
    │   │   │   ├── shapes_2d_tail_k_bf16
    │   │   │   ├── shapes_2d_tail_k_f32
    │   │   │   ├── shapes_2d_tail_k_int8
    │   │   │   ├── shapes_2d_tail_k_tail_n_bf16
    │   │   │   ├── shapes_2d_tail_k_tail_n_f32
    │   │   │   ├── shapes_2d_tail_k_tail_n_int8
    │   │   │   ├── shapes_2d_tail_n_bf16
    │   │   │   ├── shapes_2d_tail_n_f32
    │   │   │   ├── shapes_2d_tail_n_int8
    │   │   │   ├── test_brgemm_all
    │   │   │   ├── test_brgemm_bf16
    │   │   │   ├── test_brgemm_ci
    │   │   │   ├── test_brgemm_f16
    │   │   │   ├── test_brgemm_f32
    │   │   │   ├── test_brgemm_f8
    │   │   │   ├── test_brgemm_int8
    │   │   │   ├── test_brgemm_regression
    │   │   │   └── test_brgemm_smoke
    │   │   ├── concat
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── option_set_xe_gpu
    │   │   │   ├── test_concat_all
    │   │   │   ├── test_concat_bfloat16
    │   │   │   ├── test_concat_ci
    │   │   │   ├── test_concat_float16
    │   │   │   ├── test_concat_gpu
    │   │   │   ├── test_concat_large_gpu
    │   │   │   └── test_concat_smoke
    │   │   ├── conv
    │   │   │   ├── harness_conv_arbitrary_dst
    │   │   │   ├── harness_conv_attrs_gpu
    │   │   │   ├── harness_conv_attrs_int8
    │   │   │   ├── harness_conv_attrs_int8_asymmetric
    │   │   │   ├── harness_conv_auto
    │   │   │   ├── harness_conv_deepbench
    │   │   │   ├── harness_conv_depthwise_int8
    │   │   │   ├── harness_conv_dilated_3d
    │   │   │   ├── harness_conv_dilated_int8
    │   │   │   ├── harness_conv_dw_bfloat16
    │   │   │   ├── harness_conv_dw_bfloat16_nxc
    │   │   │   ├── harness_conv_dw_float16_nxc
    │   │   │   ├── harness_conv_dw_fp8_nxc
    │   │   │   ├── harness_conv_f32
    │   │   │   ├── harness_conv_f32_plain
    │   │   │   ├── harness_conv_fused_depthwise
    │   │   │   ├── harness_conv_int8
    │   │   │   ├── harness_conv_output_striding
    │   │   │   ├── harness_conv_regression_general
    │   │   │   ├── harness_conv_saturation_int8
    │   │   │   ├── harness_conv_smoke_ref
    │   │   │   ├── harness_conv_tags
    │   │   │   ├── harness_conv_zero_points
    │   │   │   ├── option_gpu_ci
    │   │   │   ├── option_set_all_eltwise_postops
    │   │   │   ├── option_set_combined_postops
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── option_set_v2
    │   │   │   ├── perf_conv_bdw_1sock
    │   │   │   ├── perf_conv_clx_1sock
    │   │   │   ├── perf_conv_skx_1sock
    │   │   │   ├── perf_conv_xe
    │   │   │   ├── perf_conv_xe_hp
    │   │   │   ├── perf_conv_xe_lp
    │   │   │   ├── set_all_topologies
    │   │   │   ├── set_conv_3d
    │   │   │   ├── set_conv_all
    │   │   │   ├── set_conv_dw
    │   │   │   ├── set_dilated-conv
    │   │   │   ├── set_dilated-conv_1st
    │   │   │   ├── set_dilated-conv_3d
    │   │   │   ├── set_fastrcnn
    │   │   │   ├── set_gpu
    │   │   │   ├── set_maskrcnn
    │   │   │   ├── set_perf_cpu_all_mb
    │   │   │   ├── set_perf_cpu_inference_only
    │   │   │   ├── set_perf_cpu_large_mb
    │   │   │   ├── set_perf_cpu_small_mb
    │   │   │   ├── set_perf_gpu_all_mb
    │   │   │   ├── set_perf_gpu_large_mb
    │   │   │   ├── set_perf_gpu_small_mb
    │   │   │   ├── set_topologies_inference_only
    │   │   │   ├── shapes_1d
    │   │   │   ├── shapes_1d_wavenet
    │   │   │   ├── shapes_1x1
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_3d_1st_strided_padding
    │   │   │   ├── shapes_3d_1x1_strided_no-padding
    │   │   │   ├── shapes_3d_1x1_strided_padding
    │   │   │   ├── shapes_3d_1x1_unit-stride_no-padding
    │   │   │   ├── shapes_3d_1x1_unit-stride_padding
    │   │   │   ├── shapes_3d_2d_strided_padding
    │   │   │   ├── shapes_3d_gpu
    │   │   │   ├── shapes_3d_i3d
    │   │   │   ├── shapes_3d_resnext101
    │   │   │   ├── shapes_3d_strided_no-padding
    │   │   │   ├── shapes_3d_strided_padding
    │   │   │   ├── shapes_3d_unet
    │   │   │   ├── shapes_3d_unit-stride_no-padding
    │   │   │   ├── shapes_3d_unit-stride_padding
    │   │   │   ├── shapes_4bit
    │   │   │   ├── shapes_a3c
    │   │   │   ├── shapes_alexnet
    │   │   │   ├── shapes_auto
    │   │   │   ├── shapes_basic
    │   │   │   ├── shapes_ci_gpu
    │   │   │   ├── shapes_cosmictagger
    │   │   │   ├── shapes_deepbench_inference_device
    │   │   │   ├── shapes_deepbench_inference_server
    │   │   │   ├── shapes_deepbench_training
    │   │   │   ├── shapes_densnet
    │   │   │   ├── shapes_dilated
    │   │   │   ├── shapes_dilated_1d_1st_strided_padding
    │   │   │   ├── shapes_dilated_1d_strided_no-padding
    │   │   │   ├── shapes_dilated_1d_strided_padding
    │   │   │   ├── shapes_dilated_1d_unit-stride_no-padding
    │   │   │   ├── shapes_dilated_1d_unit-stride_padding
    │   │   │   ├── shapes_dilated_2d_1st_strided_padding
    │   │   │   ├── shapes_dilated_2d_strided_no-padding
    │   │   │   ├── shapes_dilated_2d_strided_padding
    │   │   │   ├── shapes_dilated_2d_unit-stride_no-padding
    │   │   │   ├── shapes_dilated_2d_unit-stride_padding
    │   │   │   ├── shapes_dilated_3d_strided_no-padding
    │   │   │   ├── shapes_dilated_3d_strided_padding
    │   │   │   ├── shapes_dilated_3d_unit-stride_no-padding
    │   │   │   ├── shapes_dilated_3d_unit-stride_padding
    │   │   │   ├── shapes_dilated_rfcn
    │   │   │   ├── shapes_dw_1d_stride_no-padding
    │   │   │   ├── shapes_dw_1d_unit-stride_no-padding
    │   │   │   ├── shapes_dw_1d_unit-stride_padding
    │   │   │   ├── shapes_dw_2d_1d_strided_padding
    │   │   │   ├── shapes_dw_2d_strided_no-padding
    │   │   │   ├── shapes_dw_2d_strided_padding
    │   │   │   ├── shapes_dw_2d_unit-stride_no-padding
    │   │   │   ├── shapes_dw_2d_unit-stride_padding
    │   │   │   ├── shapes_dw_3d_strided_no-padding
    │   │   │   ├── shapes_dw_3d_strided_padding
    │   │   │   ├── shapes_dw_3d_unit-stride_no-padding
    │   │   │   ├── shapes_dw_3d_unit-stride_padding
    │   │   │   ├── shapes_dw_minibatch_2d-spatial
    │   │   │   ├── shapes_dw_minibatch_channel_2d-spatial
    │   │   │   ├── shapes_efficientdet
    │   │   │   ├── shapes_fastrcnn_p1
    │   │   │   ├── shapes_fastrcnn_p2
    │   │   │   ├── shapes_fastrcnn_p3
    │   │   │   ├── shapes_ffn
    │   │   │   ├── shapes_fused_large_src
    │   │   │   ├── shapes_fused_mobilenet_stride_1
    │   │   │   ├── shapes_fused_mobilenet_stride_2
    │   │   │   ├── shapes_gemm
    │   │   │   ├── shapes_googlenet_v1
    │   │   │   ├── shapes_googlenet_v2
    │   │   │   ├── shapes_googlenet_v3
    │   │   │   ├── shapes_large_conv
    │   │   │   ├── shapes_large_padding
    │   │   │   ├── shapes_maskrcnn_p1
    │   │   │   ├── shapes_maskrcnn_p2
    │   │   │   ├── shapes_mem_strided
    │   │   │   ├── shapes_mobilenet
    │   │   │   ├── shapes_mobilenet_dw
    │   │   │   ├── shapes_movinet_dw
    │   │   │   ├── shapes_pointnet
    │   │   │   ├── shapes_regression_1x1
    │   │   │   ├── shapes_regression_dw
    │   │   │   ├── shapes_regression_gemm
    │   │   │   ├── shapes_regression_padding
    │   │   │   ├── shapes_regression_small_spatial
    │   │   │   ├── shapes_resnet_50
    │   │   │   ├── shapes_resnet_50_sparse
    │   │   │   ├── shapes_resnet_50_v1_5
    │   │   │   ├── shapes_resnext_101
    │   │   │   ├── shapes_segnet
    │   │   │   ├── shapes_src-transpose_padding
    │   │   │   ├── shapes_ssd_300_voc0712
    │   │   │   ├── shapes_ssd_mobilenet
    │   │   │   ├── shapes_ssd_resnet34_inference
    │   │   │   ├── shapes_ssd_resnet34_training
    │   │   │   ├── shapes_tails
    │   │   │   ├── shapes_tails_gpu
    │   │   │   ├── shapes_unet
    │   │   │   ├── shapes_vgg_11
    │   │   │   ├── shapes_vgg_19
    │   │   │   ├── shapes_x3d_dw
    │   │   │   ├── shapes_xception
    │   │   │   ├── shapes_yolov2
    │   │   │   ├── test_conv_3d
    │   │   │   ├── test_conv_3d_f32_plain
    │   │   │   ├── test_conv_all
    │   │   │   ├── test_conv_all_topologies
    │   │   │   ├── test_conv_all_topologies_f32_plain
    │   │   │   ├── test_conv_attrs
    │   │   │   ├── test_conv_attrs_f32_plain
    │   │   │   ├── test_conv_bfloat16
    │   │   │   ├── test_conv_bfloat16_nxc
    │   │   │   ├── test_conv_bfloat16_ymm
    │   │   │   ├── test_conv_ci
    │   │   │   ├── test_conv_depthwise
    │   │   │   ├── test_conv_dilated
    │   │   │   ├── test_conv_dilated_f32_plain
    │   │   │   ├── test_conv_dt
    │   │   │   ├── test_conv_dt_plain
    │   │   │   ├── test_conv_float16_nxc
    │   │   │   ├── test_conv_fp4
    │   │   │   ├── test_conv_fp8_nxc
    │   │   │   ├── test_conv_function
    │   │   │   ├── test_conv_gemm_bfloat16
    │   │   │   ├── test_conv_gemm_bfloat16_nxc
    │   │   │   ├── test_conv_gemm_dt
    │   │   │   ├── test_conv_gemm_dt_nxc
    │   │   │   ├── test_conv_gemm_int8
    │   │   │   ├── test_conv_gpu
    │   │   │   ├── test_conv_gpu_ci
    │   │   │   ├── test_conv_int8
    │   │   │   ├── test_conv_large_gpu
    │   │   │   ├── test_conv_regression
    │   │   │   ├── test_conv_regression_gpu
    │   │   │   ├── test_conv_smoke
    │   │   │   ├── test_conv_wino_f32
    │   │   │   └── test_conv_wino_gpu
    │   │   ├── deconv
    │   │   │   ├── harness_deconv_attrs_int8
    │   │   │   ├── harness_deconv_attrs_int8_asymmetric
    │   │   │   ├── harness_deconv_regression_general_f32
    │   │   │   ├── harness_deconv_regression_general_int8
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── set_all
    │   │   │   ├── shapes_1d
    │   │   │   ├── shapes_1x1
    │   │   │   ├── shapes_2d
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_dilated
    │   │   │   ├── test_deconv_all
    │   │   │   ├── test_deconv_all_f32_nxc
    │   │   │   ├── test_deconv_bfloat16
    │   │   │   ├── test_deconv_bfloat16_nxc
    │   │   │   ├── test_deconv_bfloat16_ymm
    │   │   │   ├── test_deconv_ci
    │   │   │   ├── test_deconv_float16_nxc
    │   │   │   ├── test_deconv_fp8_nxc
    │   │   │   ├── test_deconv_gpu
    │   │   │   ├── test_deconv_int8
    │   │   │   └── test_deconv_smoke
    │   │   ├── eltwise
    │   │   │   ├── harness_eltwise_large_buffer
    │   │   │   ├── harness_eltwise_regression
    │   │   │   ├── harness_eltwise_saturation
    │   │   │   ├── option_set_all_algs
    │   │   │   ├── option_set_all_algs_ci
    │   │   │   ├── option_set_all_algs_int8
    │   │   │   ├── option_set_all_algs_int8_ci
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_eltwise
    │   │   │   ├── shapes_large_buffer
    │   │   │   ├── test_eltwise_all
    │   │   │   ├── test_eltwise_bfloat16
    │   │   │   ├── test_eltwise_ci
    │   │   │   ├── test_eltwise_float16
    │   │   │   ├── test_eltwise_float8
    │   │   │   ├── test_eltwise_gpu
    │   │   │   └── test_eltwise_smoke
    │   │   ├── gnorm
    │   │   │   ├── shapes_all
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_sd
    │   │   │   ├── test_gnorm_all
    │   │   │   └── test_gnorm_ci
    │   │   ├── graph
    │   │   │   ├── complex_fusion
    │   │   │   │   ├── harness_mha_all
    │   │   │   │   ├── harness_mha_ci
    │   │   │   │   ├── harness_mlp_all
    │   │   │   │   ├── harness_mlp_ci
    │   │   │   │   ├── mha
    │   │   │   │   │   ├── GQA-fp16-v2.json
    │   │   │   │   │   ├── GQA-fp16.json
    │   │   │   │   │   ├── JAX-MHA-inf-fp32.json
    │   │   │   │   │   ├── JAX-MQA-inf-fp32.json
    │   │   │   │   │   ├── MHA-GPT-inf-fp32-bs1.json
    │   │   │   │   │   ├── MHA-GPT-inf-int8-bs1.json
    │   │   │   │   │   ├── MHA-bert_large-inf-fp32-bs1.json
    │   │   │   │   │   ├── MHA-bert_large-inf-int8-bs1.json
    │   │   │   │   │   ├── MHA-distill_bert-inf-fp32-bs1.json
    │   │   │   │   │   ├── MHA-distill_bert-inf-int8-bs1.json
    │   │   │   │   │   ├── MHA-stable_diffusion-inf-fp32-bs1.json
    │   │   │   │   │   ├── codegemma-bf16-f32.json
    │   │   │   │   │   ├── gemma2-bf16-f32.json
    │   │   │   │   │   ├── gqa-plain-bottom-right-implicit-causal-mask-f16-f32.json
    │   │   │   │   │   ├── gqa-plain-implicit-causal-mask-fp32-bs1.json
    │   │   │   │   │   ├── sdpa-compressed-k-int8-gs32.json
    │   │   │   │   │   ├── sdpa-compressed-kv-implicit-causal-mask-int8-gs128.json
    │   │   │   │   │   ├── sdpa-compressed-kv-int4-gs32.json
    │   │   │   │   │   ├── sdpa-compressed-v-int8-gs32.json
    │   │   │   │   │   ├── sdpa-plain-bottom-right-implicit-causal-mask-f16-f32.json
    │   │   │   │   │   ├── sdpa-plain-implicit-causal-mask-fp32-bs1.json
    │   │   │   │   │   ├── sdpa-plain-simplified-f16-f32.json
    │   │   │   │   │   ├── sdpa-plain-simplified-f16.json
    │   │   │   │   │   ├── sdpa-plain-wo-mask-f16.json
    │   │   │   │   │   ├── sdpa-plain-wo-scale-f16-bs1.json
    │   │   │   │   │   └── sdpa-plain-wo-scale-int8-bs1.json
    │   │   │   │   └── mlp
    │   │   │   │   │   ├── gated-mlp-f32.json
    │   │   │   │   │   └── gated-mlp-int4.json
    │   │   │   ├── op
    │   │   │   │   ├── bf16
    │   │   │   │   │   ├── dynamicdq_s4.json
    │   │   │   │   │   ├── dynamicdq_u4.json
    │   │   │   │   │   └── typecast.json
    │   │   │   │   ├── f16
    │   │   │   │   │   ├── dynamicdq_s4.json
    │   │   │   │   │   ├── dynamicdq_u4.json
    │   │   │   │   │   └── typecast.json
    │   │   │   │   ├── f32
    │   │   │   │   │   ├── abs.json
    │   │   │   │   │   ├── abs_bwd.json
    │   │   │   │   │   ├── add.json
    │   │   │   │   │   ├── avgpool.json
    │   │   │   │   │   ├── avgpool_bwd.json
    │   │   │   │   │   ├── biasadd.json
    │   │   │   │   │   ├── biasadd_bwd.json
    │   │   │   │   │   ├── bnorm.json
    │   │   │   │   │   ├── bnorm_bwd.json
    │   │   │   │   │   ├── bnorm_fwd_d.json
    │   │   │   │   │   ├── clamp.json
    │   │   │   │   │   ├── clamp_bwd.json
    │   │   │   │   │   ├── concat.json
    │   │   │   │   │   ├── concat_2.json
    │   │   │   │   │   ├── concat_3.json
    │   │   │   │   │   ├── conv_2d.json
    │   │   │   │   │   ├── conv_3d.json
    │   │   │   │   │   ├── conv_bwd_d_2d.json
    │   │   │   │   │   ├── conv_bwd_d_3d.json
    │   │   │   │   │   ├── conv_bwd_w_2d.json
    │   │   │   │   │   ├── deconv.json
    │   │   │   │   │   ├── deconv_bwd_d.json
    │   │   │   │   │   ├── deconv_bwd_w.json
    │   │   │   │   │   ├── dequantize_f8_e4m3.json
    │   │   │   │   │   ├── dequantize_f8_e5m2.json
    │   │   │   │   │   ├── dequantize_s8.json
    │   │   │   │   │   ├── dequantize_u8.json
    │   │   │   │   │   ├── dynamicdq.json
    │   │   │   │   │   ├── dynamicdq_s4.json
    │   │   │   │   │   ├── dynamicdq_u4.json
    │   │   │   │   │   ├── dynamicq.json
    │   │   │   │   │   ├── dynamicq_s4.json
    │   │   │   │   │   ├── elu.json
    │   │   │   │   │   ├── elu_bwd.json
    │   │   │   │   │   ├── gelu.json
    │   │   │   │   │   ├── genindex.json
    │   │   │   │   │   ├── gnorm.json
    │   │   │   │   │   ├── greaterequal.json
    │   │   │   │   │   ├── hardsigmoid.json
    │   │   │   │   │   ├── hardsigmoid_bwd.json
    │   │   │   │   │   ├── interpolate.json
    │   │   │   │   │   ├── interpolate_3d.json
    │   │   │   │   │   ├── interpolate_bwd.json
    │   │   │   │   │   ├── interpolate_bwd_2d.json
    │   │   │   │   │   ├── lnorm.json
    │   │   │   │   │   ├── lnorm_3d.json
    │   │   │   │   │   ├── lnorm_3d_bwd.json
    │   │   │   │   │   ├── lnorm_bwd.json
    │   │   │   │   │   ├── lnorm_ks.json
    │   │   │   │   │   ├── logsoftmax.json
    │   │   │   │   │   ├── logsoftmax_bwd.json
    │   │   │   │   │   ├── matmul_2d_4d.json
    │   │   │   │   │   ├── maxpool.json
    │   │   │   │   │   ├── maxpool_bwd.json
    │   │   │   │   │   ├── prelu.json
    │   │   │   │   │   ├── prelu_bwd.json
    │   │   │   │   │   ├── prelu_bwd_dw_5d.json
    │   │   │   │   │   ├── quantize.json
    │   │   │   │   │   ├── quantize_f8_e4m3.json
    │   │   │   │   │   ├── quantize_f8_e5m2.json
    │   │   │   │   │   ├── reciprocal.json
    │   │   │   │   │   ├── reducel1.json
    │   │   │   │   │   ├── reducel2.json
    │   │   │   │   │   ├── reducemax.json
    │   │   │   │   │   ├── reducemean.json
    │   │   │   │   │   ├── reducemin.json
    │   │   │   │   │   ├── reduceprod.json
    │   │   │   │   │   ├── reducesum.json
    │   │   │   │   │   ├── relu.json
    │   │   │   │   │   ├── relu_bwd.json
    │   │   │   │   │   ├── reorder.json
    │   │   │   │   │   ├── select.json
    │   │   │   │   │   ├── softmax.json
    │   │   │   │   │   ├── softmax_bwd.json
    │   │   │   │   │   ├── softmax_bwd_d_3d.json
    │   │   │   │   │   ├── softplus_bwd.json
    │   │   │   │   │   ├── static_reshape.json
    │   │   │   │   │   ├── static_transpose.json
    │   │   │   │   │   └── typecast.json
    │   │   │   │   ├── harness_bf16_all
    │   │   │   │   ├── harness_bf16_ci
    │   │   │   │   ├── harness_f16_all
    │   │   │   │   ├── harness_f16_ci
    │   │   │   │   ├── harness_f32_all
    │   │   │   │   └── harness_f32_ci
    │   │   │   ├── pattern
    │   │   │   │   ├── f32
    │   │   │   │   │   ├── avgpool_3d_chain_fusion.json
    │   │   │   │   │   ├── binary_2d_post_ops_relu_fusion.json
    │   │   │   │   │   ├── binary_2d_post_ops_sum_fusion.json
    │   │   │   │   │   ├── binary_3d_post_ops_add_fusion.json
    │   │   │   │   │   ├── binary_4d_post_ops_relu_fusion.json
    │   │   │   │   │   ├── binary_4d_post_ops_sum_fusion.json
    │   │   │   │   │   ├── binary_post_ops_chain_fusion.json
    │   │   │   │   │   ├── binary_post_ops_fusion.json
    │   │   │   │   │   ├── binary_post_ops_logistic_fusion.json
    │   │   │   │   │   ├── bn_bwd_relu_bwd_fusion.json
    │   │   │   │   │   ├── bn_relu_fusion.json
    │   │   │   │   │   ├── conv_add_sigmoid_multiply_relu_fusion.json
    │   │   │   │   │   ├── conv_add_swish_relu_fusion.json
    │   │   │   │   │   ├── conv_bias_add_fusion.json
    │   │   │   │   │   ├── conv_bias_mul_mul_depthwise_bias_swish_fusion_cpu.json
    │   │   │   │   │   ├── conv_bias_post_ops_chain_fusion.json
    │   │   │   │   │   ├── conv_bias_post_ops_fusion.json
    │   │   │   │   │   ├── conv_bias_relu_depthwise_bias_relu_fusion_cpu.json
    │   │   │   │   │   ├── conv_bias_sum_fusion.json
    │   │   │   │   │   ├── conv_bias_sum_fusion_2.json
    │   │   │   │   │   ├── conv_bias_swish_fusion.json
    │   │   │   │   │   ├── conv_depthwise_fusion_cpu.json
    │   │   │   │   │   ├── conv_post_ops_fusion.json
    │   │   │   │   │   ├── convtranspose_post_ops_fusion.json
    │   │   │   │   │   ├── interpolate_post_ops_chain_fusion.json
    │   │   │   │   │   ├── interpolate_post_ops_chain_fusion_2.json
    │   │   │   │   │   ├── interpolate_post_ops_chain_fusion_3.json
    │   │   │   │   │   ├── interpolate_post_ops_chain_fusion_4.json
    │   │   │   │   │   ├── lnorm_gelu.json
    │   │   │   │   │   ├── matmul_bias_post_ops_chain_fusion.json
    │   │   │   │   │   ├── matmul_bias_post_ops_clip_fusion.json
    │   │   │   │   │   ├── matmul_bias_post_ops_elu_fusion.json
    │   │   │   │   │   ├── matmul_post_ops_add_add_fusion.json
    │   │   │   │   │   ├── matmul_post_ops_chain_fusion.json
    │   │   │   │   │   ├── matmul_post_ops_clip_fusion.json
    │   │   │   │   │   ├── matmul_post_ops_relu_add_fusion.json
    │   │   │   │   │   ├── matmul_post_ops_sum_logistic_fusion.json
    │   │   │   │   │   ├── matmul_post_ops_sum_relu_fusion.json
    │   │   │   │   │   ├── matmul_post_ops_swish_fusion.json
    │   │   │   │   │   ├── matmul_select.json
    │   │   │   │   │   ├── maxpool_chain_fusion.json
    │   │   │   │   │   ├── maxpool_sum_relu_fusion.json
    │   │   │   │   │   ├── reciprocal_multiply_fusion.json
    │   │   │   │   │   ├── reduction_post_ops_l1_chain_fusion.json
    │   │   │   │   │   ├── reduction_post_ops_l2_fusion.json
    │   │   │   │   │   ├── reduction_post_ops_max_chain_fusion.json
    │   │   │   │   │   ├── reduction_post_ops_mean_fusion.json
    │   │   │   │   │   ├── reduction_post_ops_min_chain_fusion.json
    │   │   │   │   │   ├── reduction_post_ops_prod_chain_fusion.json
    │   │   │   │   │   ├── reduction_post_ops_sum_chain_fusion.json
    │   │   │   │   │   ├── shuffle_fusion.json
    │   │   │   │   │   ├── softmax_post_ops_binary_fusion.json
    │   │   │   │   │   ├── softmax_post_ops_unary_fusion.json
    │   │   │   │   │   ├── unary_post_ops_elu_fusion.json
    │   │   │   │   │   ├── unary_post_ops_gelu_fusion.json
    │   │   │   │   │   ├── unary_post_ops_hardswish_fusion.json
    │   │   │   │   │   ├── unary_post_ops_log_fusion.json
    │   │   │   │   │   ├── unary_post_ops_round_fusion.json
    │   │   │   │   │   ├── unary_post_ops_sqrt_fusion.json
    │   │   │   │   │   ├── unary_post_ops_square_fusion.json
    │   │   │   │   │   └── unary_post_ops_tanh_fusion.json
    │   │   │   │   ├── f8
    │   │   │   │   │   ├── f8_bf16_matmul_add_fusion.json
    │   │   │   │   │   ├── f8_bf16_matmul_sum_add_mul_relu.json
    │   │   │   │   │   ├── f8_conv_add_add_fusion.json
    │   │   │   │   │   ├── f8_conv_bias_relu_fusion.json
    │   │   │   │   │   ├── f8_conv_fwd.json
    │   │   │   │   │   ├── f8_conv_post_ops_fusion.json
    │   │   │   │   │   ├── f8_conv_post_ops_int8_add_fusion.json
    │   │   │   │   │   ├── f8_f32_matmul_mul_add_fusion.json
    │   │   │   │   │   ├── f8_matmul.json
    │   │   │   │   │   └── f8_matmul_sum_add_mul_relu.json
    │   │   │   │   ├── harness_bf16_all
    │   │   │   │   ├── harness_bf16_ci
    │   │   │   │   ├── harness_f16_all
    │   │   │   │   ├── harness_f16_ci
    │   │   │   │   ├── harness_f32_all
    │   │   │   │   ├── harness_f32_ci
    │   │   │   │   ├── harness_f8_all
    │   │   │   │   ├── harness_f8_ci
    │   │   │   │   ├── harness_int8_all
    │   │   │   │   ├── harness_int8_ci
    │   │   │   │   └── int8
    │   │   │   │   │   ├── int8_avgpool_reshape_fusion.json
    │   │   │   │   │   ├── int8_avgpool_transpose_fusion.json
    │   │   │   │   │   ├── int8_bf16_conv_add_fusion.json
    │   │   │   │   │   ├── int8_bf16_conv_add_relu_mul.json
    │   │   │   │   │   ├── int8_bf16_conv_binary_add_fusion.json
    │   │   │   │   │   ├── int8_bf16_conv_binary_add_fusion_2.json
    │   │   │   │   │   ├── int8_bf16_gnorm_add_fusion.json
    │   │   │   │   │   ├── int8_bf16_gnorm_relu_fusion.json
    │   │   │   │   │   ├── int8_bf16_matmul.json
    │   │   │   │   │   ├── int8_bf16_matmul_add_fusion.json
    │   │   │   │   │   ├── int8_bf16_matmul_add_mul_relu.json
    │   │   │   │   │   ├── int8_bf16_matmul_mul_add_fusion.json
    │   │   │   │   │   ├── int8_bf16_matmul_mul_add_fusion_2.json
    │   │   │   │   │   ├── int8_bf16_matmul_mul_w_smooth_quant_fusion.json
    │   │   │   │   │   ├── int8_bf16_matmul_post_ops_fusion.json
    │   │   │   │   │   ├── int8_bf16_matmul_relu_w_smooth_quant_fusion.json
    │   │   │   │   │   ├── int8_bf16_matmul_sum_add_mul_relu.json
    │   │   │   │   │   ├── int8_bf16_matmul_tc_add_fusion.json
    │   │   │   │   │   ├── int8_bf16_matmul_tc_add_quant_fusion.json
    │   │   │   │   │   ├── int8_bnorm_relu_fusion.json
    │   │   │   │   │   ├── int8_concat_fusion.json
    │   │   │   │   │   ├── int8_concat_fusion_3.json
    │   │   │   │   │   ├── int8_conv_2d_fusion.json
    │   │   │   │   │   ├── int8_conv_2d_fusion_2.json
    │   │   │   │   │   ├── int8_conv_2d_fwd_i_fusion.json
    │   │   │   │   │   ├── int8_conv_add_add_fusion.json
    │   │   │   │   │   ├── int8_conv_add_mul_fusion.json
    │   │   │   │   │   ├── int8_conv_bias_fusion.json
    │   │   │   │   │   ├── int8_conv_bias_mish_fusion.json
    │   │   │   │   │   ├── int8_conv_bias_relu_fusion.json
    │   │   │   │   │   ├── int8_conv_bias_relu_fusion_2.json
    │   │   │   │   │   ├── int8_conv_bias_relu_fusion_3.json
    │   │   │   │   │   ├── int8_conv_post_ops_fusion.json
    │   │   │   │   │   ├── int8_conv_post_ops_int8_add_fusion.json
    │   │   │   │   │   ├── int8_conv_relu_fusion.json
    │   │   │   │   │   ├── int8_convtranspose_post_ops_add_fusion.json
    │   │   │   │   │   ├── int8_convtranspose_post_ops_chain_fusion.json
    │   │   │   │   │   ├── int8_convtranspose_post_ops_fusion.json
    │   │   │   │   │   ├── int8_convtranspose_post_ops_square_fusion.json
    │   │   │   │   │   ├── int8_convtranspose_post_ops_sum_fusion.json
    │   │   │   │   │   ├── int8_convtranspose_post_ops_sum_fusion_2.json
    │   │   │   │   │   ├── int8_depthwise_conv_fusion.json
    │   │   │   │   │   ├── int8_f32_matmul_mul_add_fusion.json
    │   │   │   │   │   ├── int8_f32_matmul_mul_add_fusion_2.json
    │   │   │   │   │   ├── int8_lnorm_gelu_quantize.json
    │   │   │   │   │   ├── int8_lnorm_multiply_quantize.json
    │   │   │   │   │   ├── int8_lnorm_tc_multiply_quantize.json
    │   │   │   │   │   ├── int8_matmul_add_mul_fusion.json
    │   │   │   │   │   ├── int8_matmul_add_mul_relu.json
    │   │   │   │   │   ├── int8_matmul_bia_relu_fusion.json
    │   │   │   │   │   ├── int8_matmul_bias_sum_fusion.json
    │   │   │   │   │   ├── int8_matmul_logistic_fusion.json
    │   │   │   │   │   ├── int8_matmul_mul_add_mul_fusion.json
    │   │   │   │   │   ├── int8_matmul_post_ops_fusion.json
    │   │   │   │   │   ├── int8_matmul_sum_add_mul_relu.json
    │   │   │   │   │   ├── int8_maxpool_add_mul_fusion.json
    │   │   │   │   │   ├── int8_reorder_fusion.json
    │   │   │   │   │   ├── int8_reorder_fusion_2.json
    │   │   │   │   │   ├── int8_reorder_fusion_3.json
    │   │   │   │   │   └── int8_softmax_add.json
    │   │   │   ├── test_graph_all
    │   │   │   ├── test_graph_bf16
    │   │   │   ├── test_graph_ci
    │   │   │   ├── test_graph_f16
    │   │   │   ├── test_graph_f32
    │   │   │   ├── test_graph_f8
    │   │   │   ├── test_graph_fusions
    │   │   │   ├── test_graph_fusions_gpu
    │   │   │   ├── test_graph_int8
    │   │   │   ├── test_graph_op_gpu
    │   │   │   └── test_graph_pattern_gpu
    │   │   ├── ip
    │   │   │   ├── harness_ip_gpt-j_2016-32_inf_lb_bfloat16
    │   │   │   ├── harness_ip_gpt-j_2016-32_inf_lb_f32
    │   │   │   ├── harness_ip_gpt-j_2016-32_inf_lb_float16
    │   │   │   ├── harness_ip_gpt-j_2016-32_inf_sb_bfloat16
    │   │   │   ├── harness_ip_gpt-j_2016-32_inf_sb_f32
    │   │   │   ├── harness_ip_gpt-j_2016-32_inf_sb_float16
    │   │   │   ├── harness_ip_gpt-j_32-32_inf_lb_bfloat16
    │   │   │   ├── harness_ip_gpt-j_32-32_inf_lb_f32
    │   │   │   ├── harness_ip_gpt-j_32-32_inf_lb_float16
    │   │   │   ├── harness_ip_gpt-j_32-32_inf_sb_bfloat16
    │   │   │   ├── harness_ip_gpt-j_32-32_inf_sb_f32
    │   │   │   ├── harness_ip_gpt-j_32-32_inf_sb_float16
    │   │   │   ├── harness_ip_regression
    │   │   │   ├── harness_ip_sanitizers
    │   │   │   ├── harness_ip_saturation
    │   │   │   ├── harness_ip_smoke_ref
    │   │   │   ├── harness_ip_tag
    │   │   │   ├── harness_ip_tag_gpu
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── option_set_fwks_key_perf_gpu
    │   │   │   ├── option_set_fwks_llm_gpu
    │   │   │   ├── perf_ip_cpu
    │   │   │   ├── perf_ip_inference_lb
    │   │   │   ├── perf_ip_inference_sb
    │   │   │   ├── perf_ip_knx
    │   │   │   ├── perf_ip_training
    │   │   │   ├── perf_ip_xe
    │   │   │   ├── perf_ip_xe_hp
    │   │   │   ├── perf_ip_xe_lp
    │   │   │   ├── set_all
    │   │   │   ├── set_gpu
    │   │   │   ├── set_topologies
    │   │   │   ├── shapes_0d
    │   │   │   ├── shapes_0d_gpu
    │   │   │   ├── shapes_1d
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_alexnet
    │   │   │   ├── shapes_bert
    │   │   │   ├── shapes_bert_large
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_dien_sb
    │   │   │   ├── shapes_dlrm
    │   │   │   ├── shapes_gnmt
    │   │   │   ├── shapes_googlenet_v1
    │   │   │   ├── shapes_googlenet_v3
    │   │   │   ├── shapes_maskrcnn
    │   │   │   ├── shapes_ncf
    │   │   │   ├── shapes_regression
    │   │   │   ├── shapes_resnet_50
    │   │   │   ├── shapes_resnet_50_sparse
    │   │   │   ├── shapes_rnn_t
    │   │   │   ├── shapes_transformer_lt
    │   │   │   ├── shapes_vgg16
    │   │   │   ├── shapes_wd
    │   │   │   ├── test_ip_acl
    │   │   │   ├── test_ip_all
    │   │   │   ├── test_ip_bf32_bfloat16
    │   │   │   ├── test_ip_bfloat16
    │   │   │   ├── test_ip_bfloat16_ymm
    │   │   │   ├── test_ip_ci
    │   │   │   ├── test_ip_float16
    │   │   │   ├── test_ip_fp8
    │   │   │   ├── test_ip_gpu
    │   │   │   ├── test_ip_int8
    │   │   │   ├── test_ip_large_gpu
    │   │   │   └── test_ip_smoke
    │   │   ├── lnorm
    │   │   │   ├── option_set_all
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── shapes_ci
    │   │   │   ├── test_lnorm_all
    │   │   │   ├── test_lnorm_bfloat16
    │   │   │   ├── test_lnorm_ci
    │   │   │   ├── test_lnorm_float16
    │   │   │   ├── test_lnorm_gpu
    │   │   │   ├── test_lnorm_int8
    │   │   │   └── test_lnorm_smoke
    │   │   ├── lrn
    │   │   │   ├── set_all
    │   │   │   ├── shapes_0d
    │   │   │   ├── shapes_2d
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_topologies
    │   │   │   ├── test_lrn_all
    │   │   │   ├── test_lrn_bfloat16
    │   │   │   ├── test_lrn_ci
    │   │   │   ├── test_lrn_float16
    │   │   │   ├── test_lrn_gpu
    │   │   │   └── test_lrn_smoke
    │   │   ├── matmul
    │   │   │   ├── harness_matmul_3d_bcast
    │   │   │   ├── harness_matmul_bert_inf_lb_bfloat16
    │   │   │   ├── harness_matmul_bert_inf_lb_int8
    │   │   │   ├── harness_matmul_bert_inf_sb_bfloat16
    │   │   │   ├── harness_matmul_bert_inf_sb_int8
    │   │   │   ├── harness_matmul_bert_tr_bfloat16
    │   │   │   ├── harness_matmul_bert_tr_float16
    │   │   │   ├── harness_matmul_data_tags
    │   │   │   ├── harness_matmul_decompression
    │   │   │   ├── harness_matmul_dropout
    │   │   │   ├── harness_matmul_generated_ci
    │   │   │   ├── harness_matmul_gpt-j_2016-32_inf_lb_bfloat16
    │   │   │   ├── harness_matmul_gpt-j_2016-32_inf_lb_f32
    │   │   │   ├── harness_matmul_gpt-j_2016-32_inf_lb_float16
    │   │   │   ├── harness_matmul_gpt-j_2016-32_inf_sb_bfloat16
    │   │   │   ├── harness_matmul_gpt-j_2016-32_inf_sb_f32
    │   │   │   ├── harness_matmul_gpt-j_2016-32_inf_sb_float16
    │   │   │   ├── harness_matmul_gpt-j_32-32_inf_lb_bfloat16
    │   │   │   ├── harness_matmul_gpt-j_32-32_inf_lb_f32
    │   │   │   ├── harness_matmul_gpt-j_32-32_inf_lb_float16
    │   │   │   ├── harness_matmul_gpt-j_32-32_inf_sb_bfloat16
    │   │   │   ├── harness_matmul_gpt-j_32-32_inf_sb_f32
    │   │   │   ├── harness_matmul_gpt-j_32-32_inf_sb_float16
    │   │   │   ├── harness_matmul_regression_bf16
    │   │   │   ├── harness_matmul_regression_f32
    │   │   │   ├── harness_matmul_regression_float16
    │   │   │   ├── harness_matmul_regression_int8
    │   │   │   ├── harness_matmul_runtime_f32
    │   │   │   ├── harness_matmul_runtime_int8
    │   │   │   ├── harness_matmul_smoke_ref
    │   │   │   ├── harness_matmul_strides
    │   │   │   ├── harness_matmul_transformer_lt_inf_lb_bfloat16
    │   │   │   ├── harness_matmul_transformer_lt_inf_lb_int8
    │   │   │   ├── harness_matmul_transformer_lt_inf_sb_bfloat16
    │   │   │   ├── harness_matmul_transformer_lt_inf_sb_int8
    │   │   │   ├── harness_matmul_transformer_lt_tr_bfloat16
    │   │   │   ├── option_set_fp8_mixed
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── option_set_fwks_key_gpu_tf32
    │   │   │   ├── option_set_fwks_key_perf_gpu
    │   │   │   ├── option_set_fwks_llm_gpu
    │   │   │   ├── perf_matmul_inference_batched
    │   │   │   ├── perf_matmul_inference_lb
    │   │   │   ├── perf_matmul_training
    │   │   │   ├── shapes_2d
    │   │   │   ├── shapes_2d_ci
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_4bit
    │   │   │   ├── shapes_4d
    │   │   │   ├── shapes_bert
    │   │   │   ├── shapes_bert_large
    │   │   │   ├── shapes_converted_ip_inf_lb_alexnet
    │   │   │   ├── shapes_converted_ip_inf_lb_dlrm
    │   │   │   ├── shapes_converted_ip_inf_lb_gmnt
    │   │   │   ├── shapes_converted_ip_inf_lb_googlenet
    │   │   │   ├── shapes_converted_ip_inf_lb_maskrcnn
    │   │   │   ├── shapes_converted_ip_inf_lb_ncf
    │   │   │   ├── shapes_converted_ip_inf_lb_resnet
    │   │   │   ├── shapes_converted_ip_inf_lb_rnn_t
    │   │   │   ├── shapes_converted_ip_inf_lb_vgg16
    │   │   │   ├── shapes_converted_ip_inf_lb_wd
    │   │   │   ├── shapes_converted_ip_inf_sb_dien
    │   │   │   ├── shapes_converted_ip_tr_alexnet_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_alexnet_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_alexnet_fwd
    │   │   │   ├── shapes_converted_ip_tr_dlrm_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_dlrm_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_dlrm_fwd
    │   │   │   ├── shapes_converted_ip_tr_gmnt_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_gmnt_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_gmnt_fwd
    │   │   │   ├── shapes_converted_ip_tr_googlenet_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_googlenet_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_googlenet_fwd
    │   │   │   ├── shapes_converted_ip_tr_maskrcnn_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_maskrcnn_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_maskrcnn_fwd
    │   │   │   ├── shapes_converted_ip_tr_ncf_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_ncf_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_ncf_fwd
    │   │   │   ├── shapes_converted_ip_tr_resnet_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_resnet_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_resnet_fwd
    │   │   │   ├── shapes_converted_ip_tr_rnn_t_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_rnn_t_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_rnn_t_fwd
    │   │   │   ├── shapes_converted_ip_tr_vgg16_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_vgg16_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_vgg16_fwd
    │   │   │   ├── shapes_converted_ip_tr_wd_bwd_d
    │   │   │   ├── shapes_converted_ip_tr_wd_bwd_w
    │   │   │   ├── shapes_converted_ip_tr_wd_fwd
    │   │   │   ├── shapes_mem_strided
    │   │   │   ├── shapes_multidim
    │   │   │   ├── shapes_sparse
    │   │   │   ├── shapes_sparse_packed
    │   │   │   ├── shapes_transformer
    │   │   │   ├── test_matmul_all
    │   │   │   ├── test_matmul_bf32_bf16
    │   │   │   ├── test_matmul_bfloat16
    │   │   │   ├── test_matmul_bfloat16_ymm
    │   │   │   ├── test_matmul_ci
    │   │   │   ├── test_matmul_float16
    │   │   │   ├── test_matmul_fp4
    │   │   │   ├── test_matmul_fp8
    │   │   │   ├── test_matmul_gpu
    │   │   │   ├── test_matmul_int8
    │   │   │   ├── test_matmul_large_gpu
    │   │   │   ├── test_matmul_llm_gpu
    │   │   │   ├── test_matmul_multidims
    │   │   │   ├── test_matmul_smoke
    │   │   │   ├── test_matmul_sparse
    │   │   │   ├── test_matmul_sparse_ci
    │   │   │   └── test_matmul_sparse_gpu
    │   │   ├── pool
    │   │   │   ├── harness_pool_regression
    │   │   │   ├── harness_pool_smoke_ref
    │   │   │   ├── harness_pooling_different_dt
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── perf_pool_gpu
    │   │   │   ├── set_all
    │   │   │   ├── set_all_small
    │   │   │   ├── set_topologies
    │   │   │   ├── set_topologies_gpu
    │   │   │   ├── shapes_1d
    │   │   │   ├── shapes_2d
    │   │   │   ├── shapes_2d_small
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_3d_small
    │   │   │   ├── shapes_3d_unet
    │   │   │   ├── shapes_alexnet
    │   │   │   ├── shapes_basic
    │   │   │   ├── shapes_global_pooling
    │   │   │   ├── shapes_googlenet_v1
    │   │   │   ├── shapes_googlenet_v3
    │   │   │   ├── shapes_i3d_resnet50_v1
    │   │   │   ├── shapes_large_pool
    │   │   │   ├── shapes_resnet_50
    │   │   │   ├── test_pool_all
    │   │   │   ├── test_pool_bfloat16
    │   │   │   ├── test_pool_ci
    │   │   │   ├── test_pool_float16
    │   │   │   ├── test_pool_fp8
    │   │   │   ├── test_pool_gpu
    │   │   │   ├── test_pool_large_gpu
    │   │   │   └── test_pool_smoke
    │   │   ├── prelu
    │   │   │   ├── option_set_all
    │   │   │   ├── shapes_all
    │   │   │   ├── shapes_ci
    │   │   │   ├── test_prelu_all
    │   │   │   ├── test_prelu_bfloat16
    │   │   │   ├── test_prelu_ci
    │   │   │   ├── test_prelu_float16
    │   │   │   ├── test_prelu_gpu
    │   │   │   └── test_prelu_smoke
    │   │   ├── reduction
    │   │   │   ├── harness_reduction_bf16
    │   │   │   ├── harness_reduction_f16
    │   │   │   ├── harness_reduction_f32
    │   │   │   ├── harness_reduction_i8
    │   │   │   ├── option_set_all
    │   │   │   ├── option_set_all_algs
    │   │   │   ├── option_set_all_algs_ci
    │   │   │   ├── option_set_all_algs_int8
    │   │   │   ├── option_set_all_algs_int8_ci
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── perf_reduction_gpu
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_gpu_all
    │   │   │   ├── shapes_nested_gpu
    │   │   │   ├── test_reduction_all
    │   │   │   ├── test_reduction_bfloat16
    │   │   │   ├── test_reduction_ci
    │   │   │   ├── test_reduction_float16
    │   │   │   ├── test_reduction_gpu
    │   │   │   └── test_reduction_smoke
    │   │   ├── reorder
    │   │   │   ├── harness_conv_reorders_gpu
    │   │   │   ├── harness_reorder_amx
    │   │   │   ├── harness_reorder_compensation
    │   │   │   ├── harness_reorder_cross_engine_gpu
    │   │   │   ├── harness_reorder_decompression
    │   │   │   ├── harness_reorder_large
    │   │   │   ├── harness_reorder_regression
    │   │   │   ├── harness_reorder_runtime
    │   │   │   ├── harness_reorder_saturation
    │   │   │   ├── harness_reorder_scales
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── test_reorder_all
    │   │   │   ├── test_reorder_bfloat16
    │   │   │   ├── test_reorder_ci
    │   │   │   ├── test_reorder_float16
    │   │   │   ├── test_reorder_float8
    │   │   │   ├── test_reorder_fp4
    │   │   │   ├── test_reorder_gpu
    │   │   │   ├── test_reorder_int4
    │   │   │   └── test_reorder_smoke
    │   │   ├── resampling
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── set_all
    │   │   │   ├── shapes_1d
    │   │   │   ├── shapes_2d
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_maskrcnn
    │   │   │   ├── test_resampling_all
    │   │   │   ├── test_resampling_bfloat16
    │   │   │   ├── test_resampling_ci
    │   │   │   ├── test_resampling_float16
    │   │   │   ├── test_resampling_gpu
    │   │   │   └── test_resampling_smoke
    │   │   ├── rnn
    │   │   │   ├── harness_augru_bf32
    │   │   │   ├── harness_augru_bfloat16
    │   │   │   ├── harness_augru_float16
    │   │   │   ├── harness_gru_bf32
    │   │   │   ├── harness_gru_bfloat16
    │   │   │   ├── harness_gru_f32
    │   │   │   ├── harness_gru_float16
    │   │   │   ├── harness_gru_int8
    │   │   │   ├── harness_gru_regression
    │   │   │   ├── harness_lstm_bf32
    │   │   │   ├── harness_lstm_bfloat16
    │   │   │   ├── harness_lstm_f32
    │   │   │   ├── harness_lstm_float16
    │   │   │   ├── harness_lstm_int8
    │   │   │   ├── harness_rnn_bf32
    │   │   │   ├── harness_rnn_bfloat16
    │   │   │   ├── harness_rnn_f32
    │   │   │   ├── harness_rnn_float16
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── option_set_gnmt_decoder
    │   │   │   ├── option_set_gnmt_encoder
    │   │   │   ├── option_set_large
    │   │   │   ├── option_set_lstmp_large
    │   │   │   ├── option_set_lstmp_small
    │   │   │   ├── option_set_perf_inference_lb
    │   │   │   ├── option_set_perf_inference_sb
    │   │   │   ├── option_set_perf_training
    │   │   │   ├── option_set_rnnt
    │   │   │   ├── option_set_small
    │   │   │   ├── perf_rnn_cpu
    │   │   │   ├── perf_rnn_inference_lb
    │   │   │   ├── perf_rnn_inference_sb
    │   │   │   ├── perf_rnn_knx
    │   │   │   ├── perf_rnn_training
    │   │   │   ├── perf_rnn_xe
    │   │   │   ├── perf_rnn_xe_hp
    │   │   │   ├── perf_rnn_xe_lp
    │   │   │   ├── shapes_deepspeech_2
    │   │   │   ├── shapes_inference
    │   │   │   ├── shapes_large
    │   │   │   ├── shapes_large_gru
    │   │   │   ├── shapes_lstmp_large
    │   │   │   ├── shapes_lstmp_small
    │   │   │   ├── shapes_rnn_t
    │   │   │   ├── shapes_small
    │   │   │   ├── shapes_small_gru
    │   │   │   ├── shapes_training
    │   │   │   ├── test_augru_all
    │   │   │   ├── test_augru_bf32_bfloat16
    │   │   │   ├── test_augru_bfloat16
    │   │   │   ├── test_augru_ci
    │   │   │   ├── test_augru_float16
    │   │   │   ├── test_gru_all
    │   │   │   ├── test_gru_bf32_bfloat16
    │   │   │   ├── test_gru_bfloat16
    │   │   │   ├── test_gru_ci
    │   │   │   ├── test_gru_float16
    │   │   │   ├── test_gru_int8
    │   │   │   ├── test_lstm_all
    │   │   │   ├── test_lstm_bf32_bfloat16
    │   │   │   ├── test_lstm_bfloat16
    │   │   │   ├── test_lstm_bfloat16_ymm
    │   │   │   ├── test_lstm_ci
    │   │   │   ├── test_lstm_f32
    │   │   │   ├── test_lstm_float16
    │   │   │   ├── test_lstm_int8
    │   │   │   ├── test_rnn_all
    │   │   │   ├── test_rnn_bf32_bfloat16
    │   │   │   ├── test_rnn_bfloat16
    │   │   │   ├── test_rnn_ci
    │   │   │   ├── test_rnn_float16
    │   │   │   └── test_rnn_gpu
    │   │   ├── self
    │   │   │   ├── test_self_ci
    │   │   │   ├── test_self_f32
    │   │   │   └── test_self_smoke
    │   │   ├── shuffle
    │   │   │   ├── option_set_all
    │   │   │   ├── option_set_min
    │   │   │   ├── option_set_perf
    │   │   │   ├── perf_shuffle_cpu
    │   │   │   ├── test_shuffle_all
    │   │   │   ├── test_shuffle_bfloat16
    │   │   │   ├── test_shuffle_ci
    │   │   │   ├── test_shuffle_float16
    │   │   │   ├── test_shuffle_gpu
    │   │   │   └── test_shuffle_smoke
    │   │   ├── softmax
    │   │   │   ├── harness_softmax_regression
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── set_0d
    │   │   │   ├── shapes_0d
    │   │   │   ├── shapes_2d
    │   │   │   ├── shapes_3d
    │   │   │   ├── shapes_ci
    │   │   │   ├── shapes_large
    │   │   │   ├── shapes_large_axis
    │   │   │   ├── shapes_nlp
    │   │   │   ├── test_softmax_acl
    │   │   │   ├── test_softmax_all
    │   │   │   ├── test_softmax_bfloat16
    │   │   │   ├── test_softmax_ci
    │   │   │   ├── test_softmax_float16
    │   │   │   ├── test_softmax_gpu
    │   │   │   └── test_softmax_smoke
    │   │   ├── sum
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── test_sum_all
    │   │   │   ├── test_sum_bfloat16
    │   │   │   ├── test_sum_ci
    │   │   │   ├── test_sum_float16
    │   │   │   ├── test_sum_gpu
    │   │   │   └── test_sum_smoke
    │   │   └── zeropad
    │   │   │   ├── option_set_fwks_ext_gpu
    │   │   │   ├── option_set_fwks_key_gpu
    │   │   │   ├── set_dim1_block_3d
    │   │   │   ├── set_dim1dim2_block_2d
    │   │   │   ├── set_dim1dim2_block_3d
    │   │   │   ├── set_dim2_block_3d
    │   │   │   ├── set_dim2dim3_block_4d
    │   │   │   ├── shapes_dim1_block_3d
    │   │   │   ├── shapes_dim1dim2_block_2d
    │   │   │   ├── shapes_dim1dim2_block_3d
    │   │   │   ├── shapes_dim2_block_3d
    │   │   │   ├── shapes_dim2dim3_block_4d
    │   │   │   ├── test_zeropad_ci
    │   │   │   └── test_zeropad_gpu
    │   ├── ip
    │   │   ├── bench_ip.cpp
    │   │   ├── cfg.cpp
    │   │   ├── ip.cpp
    │   │   ├── ip.hpp
    │   │   ├── ip_aux.cpp
    │   │   └── ref_ip.cpp
    │   ├── lnorm
    │   │   ├── bench_lnorm.cpp
    │   │   ├── lnorm.cpp
    │   │   ├── lnorm.hpp
    │   │   ├── lnorm_aux.cpp
    │   │   └── ref_lnorm.cpp
    │   ├── lrn
    │   │   ├── bench_lrn.cpp
    │   │   ├── lrn.cpp
    │   │   ├── lrn.hpp
    │   │   ├── lrn_aux.cpp
    │   │   └── ref_lrn.cpp
    │   ├── matmul
    │   │   ├── bench_matmul.cpp
    │   │   ├── cfg.cpp
    │   │   ├── matmul.cpp
    │   │   ├── matmul.hpp
    │   │   ├── matmul_aux.cpp
    │   │   └── ref_matmul.cpp
    │   ├── pool
    │   │   ├── bench_pool.cpp
    │   │   ├── cfg.cpp
    │   │   ├── pool.cpp
    │   │   ├── pool.hpp
    │   │   ├── pool_aux.cpp
    │   │   └── ref_pool.cpp
    │   ├── prelu
    │   │   ├── bench_prelu.cpp
    │   │   ├── prelu.cpp
    │   │   ├── prelu.hpp
    │   │   ├── prelu_aux.cpp
    │   │   └── ref_prelu.cpp
    │   ├── reduction
    │   │   ├── bench_reduction.cpp
    │   │   ├── reduction.cpp
    │   │   ├── reduction.hpp
    │   │   ├── reduction_aux.cpp
    │   │   └── ref_reduction.cpp
    │   ├── reorder
    │   │   ├── bench_reorder.cpp
    │   │   ├── cfg.cpp
    │   │   ├── ref_reorder.cpp
    │   │   ├── reorder.cpp
    │   │   ├── reorder.hpp
    │   │   └── reorder_aux.cpp
    │   ├── resampling
    │   │   ├── bench_resampling.cpp
    │   │   ├── ref_resampling.cpp
    │   │   ├── resampling.cpp
    │   │   ├── resampling.hpp
    │   │   └── resampling_aux.cpp
    │   ├── rnn
    │   │   ├── bench_rnn.cpp
    │   │   ├── cells.hpp
    │   │   ├── cfg.cpp
    │   │   ├── gru_cell.cpp
    │   │   ├── lbr_gru_cell.cpp
    │   │   ├── lstm_cell.cpp
    │   │   ├── ref_rnn_bwd.cpp
    │   │   ├── ref_rnn_fwd.cpp
    │   │   ├── rnn.cpp
    │   │   ├── rnn.hpp
    │   │   ├── rnn_aux.cpp
    │   │   ├── rnn_aux.hpp
    │   │   ├── rnn_cell.cpp
    │   │   ├── rnn_task.hpp
    │   │   ├── rnn_task_executor.hpp
    │   │   └── rnn_utils.cpp
    │   ├── self
    │   │   ├── bnorm.cpp
    │   │   ├── common.cpp
    │   │   ├── compare.cpp
    │   │   ├── conv.cpp
    │   │   ├── graph_example.cpp
    │   │   ├── memory.cpp
    │   │   ├── norm.cpp
    │   │   ├── res.cpp
    │   │   ├── self.cpp
    │   │   └── self.hpp
    │   ├── shuffle
    │   │   ├── bench_shuffle.cpp
    │   │   ├── ref_shuffle.cpp
    │   │   ├── shuffle.cpp
    │   │   ├── shuffle.hpp
    │   │   └── shuffle_aux.cpp
    │   ├── softmax
    │   │   ├── bench_softmax.cpp
    │   │   ├── ref_softmax.cpp
    │   │   ├── softmax.cpp
    │   │   ├── softmax.hpp
    │   │   └── softmax_aux.cpp
    │   ├── sum
    │   │   ├── bench_sum.cpp
    │   │   ├── ref_sum.cpp
    │   │   ├── sum.cpp
    │   │   ├── sum.hpp
    │   │   └── sum_aux.cpp
    │   ├── utils
    │   │   ├── bench_mode.cpp
    │   │   ├── bench_mode.hpp
    │   │   ├── cfg.hpp
    │   │   ├── cold_cache.cpp
    │   │   ├── cold_cache.hpp
    │   │   ├── compare.cpp
    │   │   ├── compare.hpp
    │   │   ├── data_kind.cpp
    │   │   ├── data_kind.hpp
    │   │   ├── dims.cpp
    │   │   ├── dims.hpp
    │   │   ├── dnnl_query.cpp
    │   │   ├── dnnl_query.hpp
    │   │   ├── fill.cpp
    │   │   ├── fill.hpp
    │   │   ├── impl_filter.cpp
    │   │   ├── impl_filter.hpp
    │   │   ├── norm.hpp
    │   │   ├── numeric.cpp
    │   │   ├── numeric.hpp
    │   │   ├── parallel.cpp
    │   │   ├── parallel.hpp
    │   │   ├── parser.cpp
    │   │   ├── parser.hpp
    │   │   ├── perf_report.cpp
    │   │   ├── perf_report.hpp
    │   │   ├── res.hpp
    │   │   ├── settings.hpp
    │   │   ├── stream_kind.cpp
    │   │   ├── stream_kind.hpp
    │   │   ├── task.hpp
    │   │   ├── task_executor.hpp
    │   │   ├── timer.cpp
    │   │   ├── timer.hpp
    │   │   └── wrapper.hpp
    │   └── zeropad
    │   │   ├── bench_zeropad.cpp
    │   │   ├── zeropad.cpp
    │   │   ├── zeropad.hpp
    │   │   └── zeropad_aux.cpp
    ├── generate_c_symbols_refs.sh
    ├── gtests
    │   ├── CMakeLists.txt
    │   ├── api
    │   │   ├── CMakeLists.txt
    │   │   ├── test_engine.cpp
    │   │   ├── test_memory.cpp
    │   │   ├── test_memory_creation.cpp
    │   │   ├── test_memory_desc.cpp
    │   │   ├── test_memory_desc_ops.cpp
    │   │   ├── test_memory_map.cpp
    │   │   ├── test_namespace.cpp
    │   │   ├── test_stream.cpp
    │   │   └── test_submemory.cpp
    │   ├── convolution_common.h
    │   ├── dnnl_test_common.hpp
    │   ├── dnnl_test_common_ocl.hpp
    │   ├── dnnl_test_macros.hpp
    │   ├── graph
    │   │   ├── CMakeLists.txt
    │   │   ├── api
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── api_test_main.cpp
    │   │   │   ├── ocl
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── test_cpp_api_compiled_partition.cpp
    │   │   │   │   ├── test_cpp_api_engine.cpp
    │   │   │   │   └── test_cpp_api_tensor.cpp
    │   │   │   ├── sycl
    │   │   │   │   ├── CMakeLists.txt
    │   │   │   │   ├── test_cpp_api_compiled_partition.cpp
    │   │   │   │   ├── test_cpp_api_engine.cpp
    │   │   │   │   └── test_cpp_api_tensor.cpp
    │   │   │   ├── test_api_common.cpp
    │   │   │   ├── test_api_common.h
    │   │   │   ├── test_api_common.hpp
    │   │   │   ├── test_c_api_add_op.cpp
    │   │   │   ├── test_c_api_compile.cpp
    │   │   │   ├── test_c_api_compile_parametrized.cpp
    │   │   │   ├── test_c_api_constant_cache.cpp
    │   │   │   ├── test_c_api_filter.cpp
    │   │   │   ├── test_c_api_graph.cpp
    │   │   │   ├── test_c_api_logical_tensor.cpp
    │   │   │   ├── test_c_api_op.cpp
    │   │   │   ├── test_cpp_api_compile.cpp
    │   │   │   ├── test_cpp_api_constant_cache.cpp
    │   │   │   ├── test_cpp_api_engine.cpp
    │   │   │   ├── test_cpp_api_graph.cpp
    │   │   │   ├── test_cpp_api_logical_tensor.cpp
    │   │   │   ├── test_cpp_api_op.cpp
    │   │   │   ├── test_cpp_api_partition.cpp
    │   │   │   └── test_cpp_api_tensor.cpp
    │   │   ├── test_allocator.cpp
    │   │   ├── test_allocator.hpp
    │   │   └── unit
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── backend
    │   │   │       ├── CMakeLists.txt
    │   │   │       ├── dnnl
    │   │   │       │   ├── CMakeLists.txt
    │   │   │       │   ├── dnnl_test_common.hpp
    │   │   │       │   ├── ref_func.hpp
    │   │   │       │   ├── test_batch_norm.cpp
    │   │   │       │   ├── test_binary_op.cpp
    │   │   │       │   ├── test_bmm.cpp
    │   │   │       │   ├── test_common.cpp
    │   │   │       │   ├── test_compiled_partition.cpp
    │   │   │       │   ├── test_concat.cpp
    │   │   │       │   ├── test_constant_cache.cpp
    │   │   │       │   ├── test_convolution.cpp
    │   │   │       │   ├── test_convtranspose.cpp
    │   │   │       │   ├── test_dequantize.cpp
    │   │   │       │   ├── test_dnnl_infer_shape_cpu.cpp
    │   │   │       │   ├── test_dnnl_utils_cpu.cpp
    │   │   │       │   ├── test_eltwise.cpp
    │   │   │       │   ├── test_fusion_info_cpu.cpp
    │   │   │       │   ├── test_graph_cpu.cpp
    │   │   │       │   ├── test_group_norm.cpp
    │   │   │       │   ├── test_insert_ops_cpu.cpp
    │   │   │       │   ├── test_internal_attrs_cpu.cpp
    │   │   │       │   ├── test_interpolate.cpp
    │   │   │       │   ├── test_large_partition.cpp
    │   │   │       │   ├── test_layer_norm.cpp
    │   │   │       │   ├── test_layout_id_cpu.cpp
    │   │   │       │   ├── test_layout_propagator_cpu.cpp
    │   │   │       │   ├── test_logical_tensor_cpu.cpp
    │   │   │       │   ├── test_matmul.cpp
    │   │   │       │   ├── test_memory_planning_cpu.cpp
    │   │   │       │   ├── test_mqa_decomp.cpp
    │   │   │       │   ├── test_op_executable.cpp
    │   │   │       │   ├── test_op_schema_cpu.cpp
    │   │   │       │   ├── test_partition_cpu.cpp
    │   │   │       │   ├── test_pass.cpp
    │   │   │       │   ├── test_pool.cpp
    │   │   │       │   ├── test_prelu.cpp
    │   │   │       │   ├── test_quantize.cpp
    │   │   │       │   ├── test_reduce.cpp
    │   │   │       │   ├── test_reorder.cpp
    │   │   │       │   ├── test_scratchpad.cpp
    │   │   │       │   ├── test_sdp_decomp.cpp
    │   │   │       │   ├── test_select.cpp
    │   │   │       │   ├── test_softmax.cpp
    │   │   │       │   ├── test_subgraph_pass.cpp
    │   │   │       │   ├── test_thread_local_cache_cpu.cpp
    │   │   │       │   └── test_typecast.cpp
    │   │   │       └── fake
    │   │   │       │   ├── CMakeLists.txt
    │   │   │       │   ├── test_compiled_partition.cpp
    │   │   │       │   ├── test_fake_backend.cpp
    │   │   │       │   ├── test_graph.cpp
    │   │   │       │   ├── test_partition.cpp
    │   │   │       │   └── test_pass.cpp
    │   │   │   ├── interface
    │   │   │       ├── CMakeLists.txt
    │   │   │       ├── sycl
    │   │   │       │   ├── CMakeLists.txt
    │   │   │       │   └── test_allocator.cpp
    │   │   │       ├── test_allocator.cpp
    │   │   │       ├── test_backend_cpu.cpp
    │   │   │       ├── test_compiled_partition.cpp
    │   │   │       ├── test_graph_cpu.cpp
    │   │   │       ├── test_logical_tensor_cpu.cpp
    │   │   │       ├── test_op_cpu.cpp
    │   │   │       ├── test_op_def_constraint_cpu.cpp
    │   │   │       ├── test_op_schema_cpu.cpp
    │   │   │       ├── test_partition_hashing.cpp
    │   │   │       ├── test_shape_infer_cpu.cpp
    │   │   │       ├── test_tensor.cpp
    │   │   │       └── test_value_cpu.cpp
    │   │   │   ├── unit_test_common.cpp
    │   │   │   ├── unit_test_common.hpp
    │   │   │   ├── unit_test_main.cpp
    │   │   │   ├── utils.hpp
    │   │   │   └── utils
    │   │   │       ├── CMakeLists.txt
    │   │   │       ├── test_allocator.cpp
    │   │   │       ├── test_attribute_value_cpu.cpp
    │   │   │       ├── test_debug_cpu.cpp
    │   │   │       ├── test_json_cpu.cpp
    │   │   │       ├── test_pattern_matcher_cpu.cpp
    │   │   │       └── test_utils_cpu.cpp
    │   ├── in
    │   │   ├── convolution_attr.h
    │   │   ├── convolution_simple.h
    │   │   ├── gemm_in.h
    │   │   └── layer_normalization.h
    │   ├── internals
    │   │   ├── CMakeLists.txt
    │   │   ├── sdpa_internal.hpp
    │   │   ├── test_bcast_strategy.cpp
    │   │   ├── test_bfloat16.cpp
    │   │   ├── test_brgemm.cpp
    │   │   ├── test_comparison_operators.cpp
    │   │   ├── test_dnnl_threading.cpp
    │   │   ├── test_env_vars_dnnl.cpp
    │   │   ├── test_env_vars_onednn.cpp
    │   │   ├── test_float8.cpp
    │   │   ├── test_nibble.cpp
    │   │   ├── test_sdpa.cpp
    │   │   ├── test_utils.cpp
    │   │   └── test_utils.hpp
    │   ├── main.cpp
    │   ├── ocl
    │   │   ├── CMakeLists.txt
    │   │   └── api
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── test_engine.cpp
    │   │   │   ├── test_memory_buffer.cpp
    │   │   │   ├── test_memory_usm.cpp
    │   │   │   └── test_stream.cpp
    │   ├── regression
    │   │   ├── CMakeLists.txt
    │   │   └── test_binary_stride.cpp
    │   ├── sycl
    │   │   ├── CMakeLists.txt
    │   │   └── api
    │   │   │   ├── CMakeLists.txt
    │   │   │   ├── test_engine.cpp
    │   │   │   ├── test_memory_buffer.cpp
    │   │   │   ├── test_memory_usm.cpp
    │   │   │   └── test_stream.cpp
    │   ├── test_batch_normalization.cpp
    │   ├── test_binary.cpp
    │   ├── test_concat.cpp
    │   ├── test_concurrency.cpp
    │   ├── test_convolution_backward_data_common.hpp
    │   ├── test_convolution_backward_data_f32.cpp
    │   ├── test_convolution_backward_weights_common.hpp
    │   ├── test_convolution_backward_weights_f32.cpp
    │   ├── test_convolution_eltwise_forward_common.hpp
    │   ├── test_convolution_eltwise_forward_f32.cpp
    │   ├── test_convolution_eltwise_forward_x8s8f32s32.cpp
    │   ├── test_convolution_format_any.cpp
    │   ├── test_convolution_forward_common.hpp
    │   ├── test_convolution_forward_f32.cpp
    │   ├── test_convolution_forward_u8s8fp.cpp
    │   ├── test_convolution_forward_u8s8s32.cpp
    │   ├── test_cross_engine_reorder.cpp
    │   ├── test_deconvolution.cpp
    │   ├── test_eltwise.cpp
    │   ├── test_gemm_bf16bf16bf16.cpp
    │   ├── test_gemm_bf16bf16f32.cpp
    │   ├── test_gemm_common.hpp
    │   ├── test_gemm_data_preparation.hpp
    │   ├── test_gemm_f16.cpp
    │   ├── test_gemm_f16f16f32.cpp
    │   ├── test_gemm_f32.cpp
    │   ├── test_gemm_params.hpp
    │   ├── test_gemm_s8s8s32.cpp
    │   ├── test_gemm_s8u8s32.cpp
    │   ├── test_gemm_u8s8s32.cpp
    │   ├── test_gemm_u8u8s32.cpp
    │   ├── test_gemm_validation.hpp
    │   ├── test_global_scratchpad.cpp
    │   ├── test_group_normalization.cpp
    │   ├── test_iface_attr.cpp
    │   ├── test_iface_attr_quantization.cpp
    │   ├── test_iface_binary_bcast.cpp
    │   ├── test_iface_gpu_only.cpp
    │   ├── test_iface_handle.cpp
    │   ├── test_iface_pd.cpp
    │   ├── test_iface_pd_iter.cpp
    │   ├── test_iface_primitive_cache.cpp
    │   ├── test_iface_runtime_dims.cpp
    │   ├── test_iface_sparse.cpp
    │   ├── test_iface_threadpool.cpp
    │   ├── test_iface_weights_format.cpp
    │   ├── test_iface_wino_convolution.cpp
    │   ├── test_inner_product_backward_data.cpp
    │   ├── test_inner_product_backward_weights.cpp
    │   ├── test_inner_product_forward.cpp
    │   ├── test_ip_formats.cpp
    │   ├── test_isa_hints.cpp
    │   ├── test_isa_iface.cpp
    │   ├── test_isa_mask.cpp
    │   ├── test_layer_normalization.cpp
    │   ├── test_lrn.cpp
    │   ├── test_malloc.cpp
    │   ├── test_malloc.hpp
    │   ├── test_matmul.cpp
    │   ├── test_persistent_cache_api.cpp
    │   ├── test_pooling_backward.cpp
    │   ├── test_pooling_forward.cpp
    │   ├── test_prelu.cpp
    │   ├── test_primitive_cache_mt.cpp
    │   ├── test_reduction.cpp
    │   ├── test_reorder.cpp
    │   ├── test_reorder_common.hpp
    │   ├── test_reorder_formats.cpp
    │   ├── test_resampling.cpp
    │   ├── test_rnn_forward.cpp
    │   ├── test_shuffle.cpp
    │   ├── test_softmax.cpp
    │   └── test_sum.cpp
    ├── noexcept
    │   ├── CMakeLists.txt
    │   └── main.cpp
    ├── other
    │   └── subproject
    │   │   ├── CMakeLists.txt
    │   │   └── main.c
    ├── test_isa_common.hpp
    ├── test_thread.cpp
    └── test_thread.hpp
└── third_party
    ├── .clang-format
    ├── .clang-tidy
    ├── gtest
        ├── CMakeLists.txt
        ├── LICENSE
        ├── gtest-death-test.h
        ├── gtest-matchers.h
        ├── gtest-message.h
        ├── gtest-param-test.h
        ├── gtest-printers.h
        ├── gtest-spi.h
        ├── gtest-test-part.h
        ├── gtest-typed-test.h
        ├── gtest.h
        ├── gtest_pred_impl.h
        ├── gtest_prod.h
        ├── internal
        │   ├── custom
        │   │   ├── README.md
        │   │   ├── gtest-port.h
        │   │   ├── gtest-printers.h
        │   │   └── gtest.h
        │   ├── gtest-death-test-internal.h
        │   ├── gtest-filepath.h
        │   ├── gtest-internal.h
        │   ├── gtest-param-util.h
        │   ├── gtest-port-arch.h
        │   ├── gtest-port.h
        │   ├── gtest-string.h
        │   └── gtest-type-util.h
        └── src
        │   ├── gtest-all.cc
        │   ├── gtest-death-test.cc
        │   ├── gtest-filepath.cc
        │   ├── gtest-internal-inl.h
        │   ├── gtest-matchers.cc
        │   ├── gtest-port.cc
        │   ├── gtest-printers.cc
        │   ├── gtest-test-part.cc
        │   ├── gtest-typed-test.cc
        │   ├── gtest.cc
        │   └── gtest_main.cc
    ├── ittnotify
        ├── LICENSE.BSD
        ├── README.md
        ├── disable_warnings.h
        ├── ittnotify.h
        ├── ittnotify_config.h
        ├── ittnotify_static.c
        ├── ittnotify_static.h
        ├── ittnotify_types.h
        ├── ittptmark64.S
        ├── ittptmark64.asm
        ├── jitprofiling.c
        ├── jitprofiling.h
        └── legacy
        │   └── ittnotify.h
    ├── level_zero
        ├── layers
        │   ├── zel_tracing_api.h
        │   ├── zel_tracing_ddi.h
        │   └── zel_tracing_register_cb.h
        ├── loader
        │   └── ze_loader.h
        ├── ze_api.h
        ├── ze_ddi.h
        ├── ze_ddi_common.h
        ├── ze_intel_gpu.h
        ├── ze_stypes.h
        ├── zes_api.h
        ├── zes_ddi.h
        ├── zet_api.h
        └── zet_ddi.h
    ├── mdapi
        └── metrics_discovery_api.h
    ├── ngen
        ├── COPYRIGHT
        ├── ngen.hpp
        ├── ngen_asm.hpp
        ├── ngen_auto_swsb.hpp
        ├── ngen_compiler_fix.hpp
        ├── ngen_config_internal.hpp
        ├── ngen_core.hpp
        ├── ngen_debuginfo.hpp
        ├── ngen_decoder.hpp
        ├── ngen_elf.hpp
        ├── ngen_emulation.hpp
        ├── ngen_gen12.hpp
        ├── ngen_gen8.hpp
        ├── ngen_interface.hpp
        ├── ngen_level_zero.hpp
        ├── ngen_opencl.hpp
        ├── ngen_pseudo.hpp
        ├── ngen_register_allocator.cpp
        ├── ngen_register_allocator.hpp
        ├── ngen_register_decl.hpp
        ├── ngen_registers.hpp
        ├── ngen_sycl.hpp
        ├── ngen_utils.hpp
        └── npack
        │   ├── elf_structs.hpp
        │   ├── hash.hpp
        │   ├── neo_packager.hpp
        │   └── neo_structs.hpp
    ├── spdlog
        ├── README.md
        ├── common-inl.h
        ├── common.h
        ├── details
        │   ├── backtracer-inl.h
        │   ├── backtracer.h
        │   ├── circular_q.h
        │   ├── console_globals.h
        │   ├── file_helper-inl.h
        │   ├── file_helper.h
        │   ├── fmt_helper.h
        │   ├── log_msg-inl.h
        │   ├── log_msg.h
        │   ├── log_msg_buffer-inl.h
        │   ├── log_msg_buffer.h
        │   ├── null_mutex.h
        │   ├── os-inl.h
        │   ├── os.h
        │   ├── periodic_worker-inl.h
        │   ├── periodic_worker.h
        │   ├── registry-inl.h
        │   ├── registry.h
        │   ├── synchronous_factory.h
        │   └── windows_include.h
        ├── fmt
        │   ├── bundled
        │   │   ├── base.h
        │   │   ├── core.h
        │   │   ├── format-inl.h
        │   │   └── format.h
        │   └── fmt.h
        ├── formatter.h
        ├── logger-inl.h
        ├── logger.h
        ├── mdc.h
        ├── pattern_formatter-inl.h
        ├── pattern_formatter.h
        ├── sinks
        │   ├── ansicolor_sink-inl.h
        │   ├── ansicolor_sink.h
        │   ├── base_sink-inl.h
        │   ├── base_sink.h
        │   ├── basic_file_sink-inl.h
        │   ├── basic_file_sink.h
        │   ├── null_sink.h
        │   ├── ostream_sink.h
        │   ├── rotating_file_sink-inl.h
        │   ├── rotating_file_sink.h
        │   ├── sink-inl.h
        │   ├── sink.h
        │   ├── wincolor_sink-inl.h
        │   └── wincolor_sink.h
        ├── spdlog-inl.h
        ├── spdlog.h
        ├── tweakme.h
        └── version.h
    ├── xbyak
        ├── COPYRIGHT
        ├── xbyak.h
        ├── xbyak_bin2hex.h
        ├── xbyak_mnemonic.h
        └── xbyak_util.h
    └── xbyak_aarch64
        ├── CMakeLists.txt
        ├── src
            ├── err_impl.h
            ├── util_impl.cpp
            ├── util_impl.h
            ├── util_impl_linux.h
            ├── util_impl_mac.h
            ├── util_impl_windows.h
            ├── xbyak_aarch64_impl.cpp
            ├── xbyak_aarch64_impl.h
            └── xbyak_aarch64_mnemonic.h
        └── xbyak_aarch64
            ├── xbyak_aarch64.h
            ├── xbyak_aarch64_adr.h
            ├── xbyak_aarch64_code_array.h
            ├── xbyak_aarch64_err.h
            ├── xbyak_aarch64_gen.h
            ├── xbyak_aarch64_inner.h
            ├── xbyak_aarch64_label.h
            ├── xbyak_aarch64_meta_mnemonic.h
            ├── xbyak_aarch64_mnemonic_def.h
            ├── xbyak_aarch64_perf.h
            ├── xbyak_aarch64_reg.h
            ├── xbyak_aarch64_util.h
            └── xbyak_aarch64_version.h


/.github/ISSUE_TEMPLATE/documentation.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Request a documentation change
 3 | about: Use this template to report a documentation issue or request documentation changes
 4 | title: ''
 5 | labels: 'documentation'
 6 | assignees: ''
 7 | ---
 8 | 
 9 | # Summary
10 | Include a short summary of the issue or request. Sections below provide
11 | guidance on what factors are considered important for a documentation
12 | issue.
13 | 
14 | # URLs
15 | Include pointers to documents that are impacted.
16 | 
17 | # Additional details
18 | Provide a detailed description of the expected changes in documentation
19 | and any suggestions you have.
20 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Ask a question
3 | about: Use this template for everything that is not a bug or a feature request
4 | title: ''
5 | labels: 'question'
6 | assignees: ''
7 | ---
8 | 


--------------------------------------------------------------------------------
/.github/automation/aarch64/ci.json:
--------------------------------------------------------------------------------
1 | {
2 |     "dependencies": {
3 |         "acl": "v52.1.0",
4 |         "gcc": "13",
5 |         "clang": "17",
6 |         "onednn-base": "v3.7"
7 |     }
8 | }
9 | 


--------------------------------------------------------------------------------
/.github/codeql-config.yml:
--------------------------------------------------------------------------------
1 | paths:
2 |   - ./
3 |   - .github/
4 | 


--------------------------------------------------------------------------------
/cmake/template.vcxproj.user:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="utf-8"?>
2 | <Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
3 |   <PropertyGroup>
4 |     <LocalDebuggerEnvironment>PATH=@CTESTCONFIG_PATH@;$(PATH)</LocalDebuggerEnvironment>
5 |     <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor>
6 |   </PropertyGroup>
7 | </Project>
8 | 


--------------------------------------------------------------------------------
/doc/advanced/design/mem_fmt_blk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_blk.png


--------------------------------------------------------------------------------
/doc/advanced/design/mem_fmt_img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_img1.png


--------------------------------------------------------------------------------
/doc/advanced/design/mem_fmt_img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_img2.png


--------------------------------------------------------------------------------
/doc/advanced/design/mem_fmt_padded_blk.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_padded_blk.png


--------------------------------------------------------------------------------
/doc/advanced/design/strides.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/strides.png


--------------------------------------------------------------------------------
/doc/build/system_requirements.md:
--------------------------------------------------------------------------------
1 | System Requirements {#dev_guide_system_requirements}
2 | ====================================================
3 | 
4 | oneDNN supports a broad list of hardware platforms, operating systems, and compilers. 
5 | For details, see [oneDNN System Requirements](https://github.com/uxlfoundation/oneDNN?tab=readme-ov-file#system-requirements).
6 | 


--------------------------------------------------------------------------------
/doc/environment.yml:
--------------------------------------------------------------------------------
 1 | name: onednn-doc
 2 | channels:
 3 |   - conda-forge
 4 |   - nodefaults
 5 | dependencies:
 6 |   - python=3.9.5
 7 |   - pip=21.1.2
 8 |   - doxyrest=2.1.2
 9 |   - doxygen=1.8.14
10 |   - graphviz=2.40.1
11 |   - sphinx=4.0.2
12 |   - sphinx-book-theme=0.0.41
13 |   - sphinx-copybutton=0.5.2
14 | 


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/binary_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/binary_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/conv_bwd_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/conv_bwd_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/conv_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/conv_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/convtranspose_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/convtranspose_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/epilogue_subgraph_conv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_conv.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/epilogue_subgraph_general_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_general_1.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/epilogue_subgraph_general_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_general_2.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/epilogue_subgraph_matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_matmul.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/f2f_conversion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2f_conversion.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/f2q_conversion_general.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_general.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/f2q_conversion_quantized_conv.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_quantized_conv.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/f2q_conversion_quantized_matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_quantized_matmul.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/f2q_conversion_softmax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_softmax.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/fp-gated-mlp.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/fp-gated-mlp.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/gated-mlp-swish.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/gated-mlp-swish.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/gqa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/gqa.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/interpolate_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/interpolate_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/matmul_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/matmul_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/norm_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/norm_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/pool_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/pool_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/q2f_conversion_quantized_conv_matmul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/q2f_conversion_quantized_conv_matmul.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/q2f_conversion_quantized_convtranspose.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/q2f_conversion_quantized_convtranspose.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/quantized_conv_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/quantized_conv_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/quantized_convtranspose_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/quantized_convtranspose_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/quantized_matmul_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/quantized_matmul_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/reduction_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/reduction_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/sdpa-mask-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-1.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/sdpa-mask-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-2.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/sdpa-mask-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-3.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/sdpa-mask-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-4.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/sdpa-reorder.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-reorder.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/sdpa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/softmax_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/softmax_pattern.png


--------------------------------------------------------------------------------
/doc/graph/fusion_patterns/images/unary_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/unary_pattern.png


--------------------------------------------------------------------------------
/doc/graph/programming_model/images/bf16_programming.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/programming_model/images/bf16_programming.jpg


--------------------------------------------------------------------------------
/doc/graph/programming_model/images/img_graph_programming_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/programming_model/images/img_graph_programming_model.png


--------------------------------------------------------------------------------
/doc/graph/programming_model/images/int8_programming.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/programming_model/images/int8_programming.jpg


--------------------------------------------------------------------------------
/doc/graph/rst/graph_programming_model.rst:
--------------------------------------------------------------------------------
1 | Programming Model
2 | #################
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 | 
7 |    dev_guide_graph_basic_concepts
8 |    dev_guide_graph_low_precision
9 |    


--------------------------------------------------------------------------------
/doc/graph/rst/images/other_pattern.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/rst/images/other_pattern.png


--------------------------------------------------------------------------------
/doc/performance_considerations/benchdnn.md:
--------------------------------------------------------------------------------
1 | Benchmarking Performance {#dev_guide_benchdnn}
2 | ==============================================
3 | 
4 | oneDNN has a built-in benchmarking program called benchdnn.
5 | 
6 | For a complete description of the available options and working examples, see
7 | the [benchdnn readme](https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/README.md#benchdnn).
8 | 


--------------------------------------------------------------------------------
/doc/performance_considerations/vtune.md:
--------------------------------------------------------------------------------
1 | Profiling with VTune(TM) Profiler {#dev_guide_vtune}
2 | ========================================================
3 | 
4 | See @ref dev_guide_profilers
5 | 


--------------------------------------------------------------------------------
/doc/primitives/images/unrolled_stack_rnn.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/primitives/images/unrolled_stack_rnn.jpg


--------------------------------------------------------------------------------
/doc/programming_model/images/img_depthwise_fusion.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_depthwise_fusion.jpg


--------------------------------------------------------------------------------
/doc/programming_model/images/img_dnnl_object_snapshot.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_dnnl_object_snapshot.jpg


--------------------------------------------------------------------------------
/doc/programming_model/images/img_dnnl_programming_flow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_dnnl_programming_flow.jpg


--------------------------------------------------------------------------------
/doc/programming_model/images/img_overview_flow.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_overview_flow.jpg


--------------------------------------------------------------------------------
/doc/programming_model/images/img_programming_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_programming_model.png


--------------------------------------------------------------------------------
/doc/rst/advanced_topics.rst:
--------------------------------------------------------------------------------
 1 | Advanced Topics
 2 | #####################
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    dev_guide_transition_to_dnnl
 8 |    dev_guide_understanding_memory_formats
 9 |    dev_guide_int8_computations
10 |    dev_guide_primitive_cache
11 |    dev_guide_persistent_cache
12 |    dev_guide_threadpool
13 |    dev_guide_sparsity
14 |    dev_guide_experimental
15 | 


--------------------------------------------------------------------------------
/doc/rst/build_and_link.rst:
--------------------------------------------------------------------------------
 1 | Build and Link oneDNN
 2 | #####################
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    dev_guide_system_requirements
 8 |    dev_guide_build
 9 |    dev_guide_build_options
10 |    dev_guide_link


--------------------------------------------------------------------------------
/doc/rst/graph_extension.rst:
--------------------------------------------------------------------------------
 1 | Graph Extension
 2 | ###############
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    graph_programming_model
 8 |    graph_supported_operations
 9 |    graph_fusion_patterns
10 |    dev_guide_graph_dump
11 |    dev_guide_constant_tensor_cache
12 | 


--------------------------------------------------------------------------------
/doc/rst/interop_with_dpcpp_and_opencl.rst:
--------------------------------------------------------------------------------
1 | Interoperability with DPC++ and OpenCL
2 | ########################################
3 | 
4 | .. toctree::
5 |    :maxdepth: 1
6 | 
7 |    dev_guide_opencl_interoperability.rst
8 |    dev_guide_dpcpp_interoperability.rst
9 | 


--------------------------------------------------------------------------------
/doc/rst/performance_profiling_and_inspection.rst:
--------------------------------------------------------------------------------
 1 | Performance Profiling and Inspection
 2 | ########################################
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    dev_guide_verbose
 8 |    dev_guide_performance_settings
 9 |    dev_guide_benchdnn
10 |    dev_guide_profilers
11 |    dev_guide_inspecting_jit
12 |    page_performance_profiling_cpp
13 |    dev_guide_cpu_dispatcher_control
14 |    dev_guide_cpu_isa_hints
15 |    dev_guide_verbose_table
16 |    


--------------------------------------------------------------------------------
/doc/rst/programming_model.rst:
--------------------------------------------------------------------------------
 1 | oneDNN Concepts
 2 | ###############
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    page_memory_format_propagation_cpp
 8 |    dev_guide_inference_and_training_aspects
 9 |    dev_guide_attributes
10 |    dev_guide_data_types
11 |    page_cross_engine_reorder_cpp
12 |    dev_guide_c_and_cpp_apis
13 |    interop_with_dpcpp_and_opencl


--------------------------------------------------------------------------------
/doc/rst/supported_primitives.rst:
--------------------------------------------------------------------------------
 1 | Supported Primitives
 2 | #####################
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    dev_guide_convolution
 8 |    dev_guide_inner_product
 9 |    dev_guide_matmul
10 |    dev_guide_rnn
11 |    dev_guide_batch_normalization
12 |    dev_guide_binary
13 |    dev_guide_concat
14 |    dev_guide_eltwise
15 |    dev_guide_group_normalization
16 |    dev_guide_layer_normalization
17 |    dev_guide_lrn
18 |    dev_guide_pooling
19 |    dev_guide_prelu
20 |    dev_guide_resampling
21 |    dev_guide_shuffle
22 |    dev_guide_softmax
23 |    dev_guide_sum
24 |    dev_guide_reorder
25 |    dev_guide_reduction
26 | 


--------------------------------------------------------------------------------
/doc/rst/ukernels.rst:
--------------------------------------------------------------------------------
 1 | Ukernels
 2 | #####################
 3 | 
 4 | .. toctree::
 5 |    :maxdepth: 1
 6 | 
 7 |    dev_guide_ukernel_basic_concepts.rst
 8 |    dev_guide_ukernel_brgemm.rst
 9 |    dev_guide_ukernel_transform.rst
10 |    page_cpu_brgemm_example_cpp.rst
11 | 


--------------------------------------------------------------------------------
/doc/sphinx/_static/favicons.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/sphinx/_static/favicons.png


--------------------------------------------------------------------------------
/doc/sphinx/_static/oneAPI-rgb-rev-100.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/sphinx/_static/oneAPI-rgb-rev-100.png


--------------------------------------------------------------------------------
/doc/usage_models/images/img_bf16_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_bf16_diagram.png


--------------------------------------------------------------------------------
/doc/usage_models/images/img_diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_diagram.png


--------------------------------------------------------------------------------
/doc/usage_models/images/img_inference_scope.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_inference_scope.jpg


--------------------------------------------------------------------------------
/doc/usage_models/images/img_multiscalar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_multiscalar.png


--------------------------------------------------------------------------------
/doc/usage_models/images/img_singlescalar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_singlescalar.png


--------------------------------------------------------------------------------
/doc/usage_models/images/img_training_inference_scope.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_training_inference_scope.jpg


--------------------------------------------------------------------------------
/doc/usage_models/training.md:
--------------------------------------------------------------------------------
 1 | Training {#dev_guide_training}
 2 | ==============================
 3 | 
 4 | NEW_CONTENT_GOES_HERE
 5 | 
 6 | ## fp32 Training
 7 | 
 8 | NEW_CONTENT_GOES_HERE
 9 | 
10 | ## bfp16 Training
11 | 
12 | NEW_CONTENT_GOES_HERE
13 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 80
3 | include = 'scripts\/.*\.pyi?$'
4 | 


--------------------------------------------------------------------------------
/src/cpu/jit_utils/linux_perf/README.md:
--------------------------------------------------------------------------------
1 | This is an implementation of the jitdump format used by Linux perf. See the
2 | [spec](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/perf/Documentation/jitdump-specification.txt) for details.
3 | 


--------------------------------------------------------------------------------
/src/gpu/intel/jit/gemm/.clang-tidy:
--------------------------------------------------------------------------------
1 | Checks: '-*,misc-definitions-in-headers'
2 | CheckOptions:
3 |   - { key: HeaderFileExtensions,          value: "x" }
4 | 


--------------------------------------------------------------------------------
/src/gpu/intel/microkernels/.clang-tidy:
--------------------------------------------------------------------------------
1 | Checks: '-*,misc-definitions-in-headers'
2 | CheckOptions:
3 |   - { key: HeaderFileExtensions,          value: "x" }
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/doc/knobs_dir.md:
--------------------------------------------------------------------------------
 1 | # Direction
 2 | 
 3 | **Benchdnn** renames the library propagation kind abstraction into "direction".
 4 | The following direction values are supported:
 5 | 
 6 | | Prop kind     | Description
 7 | | :---          | :---
 8 | | FWD_B         | dnnl_forward_training w/ bias
 9 | | FWD_D         | dnnl_forward_training w/o bias
10 | | FWD_I         | dnnl_forward_inference
11 | | BWD_D         | dnnl_backward_data
12 | | BWD_WB        | dnnl_backward_weights w/ bias
13 | | BWD_W         | dnnl_backward_weights w/o bias
14 | | BWD_DW        | dnnl_backward
15 | 
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/harness_binary_regression:
--------------------------------------------------------------------------------
 1 | # repeated sum with varying scale
 2 | --reset --attr-post-ops=sum+relu+sum:2 8x8x3x5:8x8x3x5_n"multisum"
 3 | 
 4 | # Curious edge case in GPU JIT-reorder-based binary
 5 | --reset --alg=ADD --stag=ABcd32a16b:ABcd32a16b --dtag=acdb --sdt=f16:f16 --ddt=f16 64x168x42x42:64x168x42x42
 6 | 
 7 | # Mixed src1/post-op src broadcast
 8 | --reset --attr-post-ops=add:f32:2 1x17:1x1
 9 | 
10 | # per_w broadcasting strategy
11 | --reset --attr-post-ops=mul:f32:4+add:f32:4 --alg=add 1x20x768:1x20x1
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/option_set_src0_bcast:
--------------------------------------------------------------------------------
 1 | --stag=abx:abx             1x5x1x1:3x5x6x9
 2 |                            4x1x4x4:4x4x1x4
 3 |                            1x1x1x1:16x12x2x2
 4 |                            1x12:12x1
 5 | 
 6 | --stag=aBx8b:aBx16b  2x16x1x1:2x16x5x7
 7 | 
 8 | --stag=aBx16b:axb    1x16x5x7:2x16x5x7
 9 |                      2x16x1x1:2x16x1x1
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/shapes_ci:
--------------------------------------------------------------------------------
 1 | 3x5x6x9:3x5x6x9
 2 | 5x3x2x9:1x3x2x9
 3 | 32x17x2x3:32x17
 4 | 32x17x2x3:1x17
 5 | 15x12x3x5:15
 6 | 15x12x3x5:1
 7 | 5x3x1x9:1x3x2x9
 8 | 12x12:1x12
 9 | 12x1:1x12
10 | 2x3x48:1x3x48
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/shapes_perf_1st_conv:
--------------------------------------------------------------------------------
1 | 64x3x224x224:1x3x1x1
2 | 256x3x224x224:1x3x1x1
3 | 512x3x224x224:1x3x1x1
4 | 1024x3x224x224:1x3x1x1
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/shapes_perf_scaleshift:
--------------------------------------------------------------------------------
1 | 1024x2048x10x10:1x2048x1x1
2 | 1024x1024x19x19:1x1024x1x1
3 | 1024x512x38x38:1x512x1x1
4 | 1024x256x75x75:1x256x1x1
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/test_binary_all:
--------------------------------------------------------------------------------
 1 | # all
 2 | --reset
 3 | 
 4 | --batch=harness_binary_f32
 5 | --batch=harness_binary_bf16
 6 | --batch=harness_binary_f16
 7 | --batch=harness_binary_i8
 8 | --batch=harness_binary_different_dt
 9 | --batch=harness_binary_regression
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/test_binary_bfloat16:
--------------------------------------------------------------------------------
1 | # bf16 (for legacy infra support)
2 | --reset
3 | 
4 | --batch=harness_binary_bf16
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/test_binary_different_dt_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=false # Different src and dst data types do not support in-place mode.
 4 | --ddt=s8,u8,f32,s32 --sdt=s8:u8,u8:s8,s8:f32,f32:u8,f32:f32,f32:s32,s32:f32
 5 | --alg=ADD,MUL,MAX,MIN,DIV,SUB,GE,GT,LE,LT,EQ,NE,SELECT
 6 | --stag=abx:any,axb:any
 7 | --batch=shapes_ci
 8 | 
 9 | --alg=ADD
10 | --attr-post-ops=,add:f32:per_oc+sum:2+linear:2:1:3
11 | --attr-scales=,src:common:0.5+src1:common:0.25
12 | --batch=shapes_ci
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/binary/test_binary_float16:
--------------------------------------------------------------------------------
1 | # f16
2 | --reset
3 | 
4 | --batch=harness_binary_f16
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/perf_bnorm_gpu:
--------------------------------------------------------------------------------
1 | --reset --batch=option_set_fwks_key_gpu
2 | --reset --batch=option_set_fwks_ext_gpu
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/set_nd:
--------------------------------------------------------------------------------
1 | --batch=shapes_1d
2 | --batch=shapes_2d
3 | --batch=shapes_3d
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/shapes_1d:
--------------------------------------------------------------------------------
1 | # random 1d problems
2 | 
3 | mb1ic16iw1n"bnorm1d:1"
4 | mb1ic32iw30n"bnorm1d:2"
5 | mb32ic128iw10n"bnorm1d:3"
6 | mb32ic27iw7n"bnorm1d:4"
7 | mb9ic128iw1n"bnorm1d:5"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/shapes_2d:
--------------------------------------------------------------------------------
1 | # random 2d problems
2 | 
3 | mb1ic1ih1_n"bnorm_2d:1"
4 | mb1ic3ih1_n"bnorm_2d:2"
5 | mb1ic256ih28_n"bnorm_2d:3"
6 | mb1ic257ih28_n"bnorm_2d:4"
7 | mb1ic1024ih28iw17_n"bnorm_2d:5"
8 | mb2ic128ih2_n"bnorm_2d:catch_cancel"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/shapes_3d:
--------------------------------------------------------------------------------
1 | # random 3d problems
2 | 
3 | mb1ic16id40_n"bnorm_3d:1"
4 | mb32ic128id12ih10iw8_n"bnorm_3d:2"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/shapes_ci:
--------------------------------------------------------------------------------
 1 | # random problems
 2 | 
 3 | ic23_n"bnorm_ci_0d:1"
 4 | ic32iw32_n"bnorm_ci_1d:1"
 5 | mb5ic33iw27_n"bnorm_ci_1d:2"
 6 | ic256ih28_n"bnorm_ci_2d:1" # Used in smoke validation, don't change the name
 7 | mb4ic200ih17iw16_n"bnorm_ci_2d:2" # Used in smoke validation, don't change the name
 8 | ic16id12_n"bnorm_ci_3d:1"
 9 | mb1ic19id4ih6iw6_n"bnorm_ci_3d:2"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/shapes_large:
--------------------------------------------------------------------------------
1 | # Large minibatch
2 | mb3072ic64ih112
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/shapes_regressions:
--------------------------------------------------------------------------------
 1 | # Blocking bugs
 2 | mb2ic251ih28 # 2-core system w/ 1.5MB LLC slices
 3 | mb2ic64ih224 # 2-core system w/ 2.5MB LLC slices
 4 | mb2ic64ih56 # 4-core system w/ 1.5MB LLC slices
 5 | mb8ic322ih16 # 4-core system w/ 2.5MB LLC slices
 6 | mb88ic600ih16 # 36-core system w/ 1.5MB LLC slices
 7 | mb88ic980ih16 # 44-core system w/ 2.5MB LLC slices
 8 | mb88ic800ih16 # 48-core system w/ 1.5MB LLC slices
 9 | mb88ic900ih16 # 56-core system w/ 1.5MB LLC slices
10 | 
11 | # Spatial threading bugs for bfloat16
12 | mb12ic4ih8
13 | mb1ic24ih14iw14
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/test_bnorm_regressions:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true
 4 | --tag=axb,abx,aBx8b,aBx16b
 5 | 
 6 | # training
 7 | --dir=FWD_D,BWD_DW
 8 | --dt=f32,bf16
 9 | --flags=,G,CH,R,GCHR
10 | --batch=shapes_regressions
11 | 
12 | # inference
13 | --dir=FWD_I
14 | 
15 | --dt=f32,bf16
16 | --flags=,GCH
17 | --attr-post-ops=,relu
18 | --batch=shapes_regressions
19 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/test_bnorm_regressions_large:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --skip-impl=ref
4 | 
5 | --tag=axb,abx,aBx16b --dir=FWD_D,BWD_DW --flags=CHR --batch=shapes_large
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/bnorm/test_bnorm_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*bnorm_ci_2d.* # Use 2d problems only from shapes_ci
 4 | --inplace=false
 5 | --tag=axb
 6 | 
 7 | # training
 8 | --dir=FWD_D,BWD_DW
 9 | --dt=f32,bf16,f16
10 | --flags=,G,CHR
11 | --batch=shapes_ci
12 | ## no scale or shift support for backward_data
13 | --dir=BWD_D
14 | --flags=,G,R
15 | --batch=shapes_ci
16 | 
17 | # inference
18 | --dir=FWD_I
19 | 
20 | --dt=f32,bf16,f16
21 | --flags=,G,CH
22 | --attr-post-ops=,relu
23 | --batch=shapes_ci
24 | --attr-post-ops=
25 | 
26 | --dt=f16
27 | --flags=,G,CH
28 | --batch=shapes_ci
29 | 
30 | --dt=s8
31 | --flags=G,GCH
32 | --batch=shapes_ci
33 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/harness_brgemm_f32:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dt=f32
 4 | --bia_dt=undef,f32
 5 | --beta=0,1
 6 | --attr-post-ops=,sum:2,relu
 7 | --ld=,:160:96
 8 | --batch=option_set_f32
 9 | 
10 | # Separate cases for non-default alpha
11 | --reset
12 | --dt=f32
13 | --alpha=2
14 | --batch=shapes_2d_no_tail_f32
15 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/harness_brgemm_fpmath:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --attr-fpmath=bf16
 4 | 
 5 | # f32
 6 | --dt=f32
 7 | --bia_dt=undef,f32
 8 | --beta=0,1
 9 | --batch=option_set_f32
10 | 
11 | # Separate cases for non-default alpha
12 | --reset
13 | --alpha=2
14 | --batch=shapes_2d_no_tail_f32
15 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/option_set_bf16:
--------------------------------------------------------------------------------
 1 | # Incorporates all meaningful shapes with bs included.
 2 | # bs > 1 is applicable when K is divisible by K-block size.
 3 | 
 4 | --bs=1,7,16
 5 | 
 6 | --batch=shapes_2d_no_tail_bf16
 7 | 
 8 | --batch=shapes_2d_tail_n_bf16
 9 | 
10 | --bs=1
11 | 
12 | --batch=shapes_2d_tail_k_bf16
13 | --batch=shapes_2d_big_k_bf16
14 | 
15 | --batch=shapes_2d_tail_k_tail_n_bf16
16 | --batch=shapes_2d_big_k_tail_n_bf16
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/option_set_f32:
--------------------------------------------------------------------------------
 1 | # Incorporates all meaningful shapes with bs included.
 2 | # bs > 1 is applicable when K is divisible by K-block size.
 3 | 
 4 | --bs=1,7,16
 5 | 
 6 | --batch=shapes_2d_no_tail_f32
 7 | 
 8 | --batch=shapes_2d_tail_n_f32
 9 | 
10 | --bs=1
11 | 
12 | --batch=shapes_2d_tail_k_f32
13 | --batch=shapes_2d_big_k_f32
14 | 
15 | --batch=shapes_2d_tail_k_tail_n_f32
16 | --batch=shapes_2d_big_k_tail_n_f32
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/option_set_int8:
--------------------------------------------------------------------------------
 1 | # Incorporates all meaningful shapes with bs included.
 2 | # bs > 1 is applicable when K is divisible by K-block size.
 3 | 
 4 | --bs=1,7,16
 5 | 
 6 | --batch=shapes_2d_no_tail_int8
 7 | 
 8 | --batch=shapes_2d_tail_n_int8
 9 | 
10 | --bs=1
11 | 
12 | --batch=shapes_2d_tail_k_int8
13 | --batch=shapes_2d_big_k_int8
14 | 
15 | --batch=shapes_2d_tail_k_tail_n_int8
16 | --batch=shapes_2d_big_k_tail_n_int8
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/shapes_2d_big_k_bf16:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 1x256:256x16_n"bf16:big_k:0"
 4 | 1x256:256x32_n"bf16:big_k:1"
 5 | 1x256:256x48_n"bf16:big_k:2"
 6 | 1x256:256x64_n"bf16:big_k:3"
 7 | 
 8 | 
 9 | 13x256:256x16_n"bf16:big_k:4"
10 | 13x256:256x32_n"bf16:big_k:5"
11 | 13x256:256x48_n"bf16:big_k:6"
12 | 13x256:256x64_n"bf16:big_k:7"
13 | 
14 | 
15 | 16x256:256x16_n"bf16:big_k:8"
16 | 16x256:256x32_n"bf16:big_k:9"
17 | 16x256:256x48_n"bf16:big_k:10"
18 | 16x256:256x64_n"bf16:big_k:11"
19 | 
20 | 
21 | 64x256:256x16_n"bf16:big_k:12"
22 | 64x256:256x32_n"bf16:big_k:13"
23 | 64x256:256x48_n"bf16:big_k:14"
24 | 64x256:256x64_n"bf16:big_k:15"
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/shapes_2d_big_k_f32:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 1x136:136x16_n"f32:big_k:0"
 4 | 1x136:136x32_n"f32:big_k:1"
 5 | 1x136:136x48_n"f32:big_k:2"
 6 | 1x136:136x64_n"f32:big_k:3"
 7 | 
 8 | 
 9 | 13x136:136x16_n"f32:big_k:4"
10 | 13x136:136x32_n"f32:big_k:5"
11 | 13x136:136x48_n"f32:big_k:6"
12 | 13x136:136x64_n"f32:big_k:7"
13 | 
14 | 
15 | 16x136:136x16_n"f32:big_k:8"
16 | 16x136:136x32_n"f32:big_k:9"
17 | 16x136:136x48_n"f32:big_k:10"
18 | 16x136:136x64_n"f32:big_k:11"
19 | 
20 | 
21 | 64x136:136x16_n"f32:big_k:12"
22 | 64x136:136x32_n"f32:big_k:13"
23 | 64x136:136x48_n"f32:big_k:14"
24 | 64x136:136x64_n"f32:big_k:15"
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/shapes_2d_big_k_int8:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 1x512:512x16_n"int8:big_k:0"
 4 | 1x512:512x32_n"int8:big_k:1"
 5 | 1x512:512x48_n"int8:big_k:2"
 6 | 1x512:512x64_n"int8:big_k:3"
 7 | 
 8 | 
 9 | 13x512:512x16_n"int8:big_k:4"
10 | 13x512:512x32_n"int8:big_k:5"
11 | 13x512:512x48_n"int8:big_k:6"
12 | 13x512:512x64_n"int8:big_k:7"
13 | 
14 | 
15 | 16x512:512x16_n"int8:big_k:8"
16 | 16x512:512x32_n"int8:big_k:9"
17 | 16x512:512x48_n"int8:big_k:10"
18 | 16x512:512x64_n"int8:big_k:11"
19 | 
20 | 
21 | 64x512:512x16_n"int8:big_k:12"
22 | 64x512:512x32_n"int8:big_k:13"
23 | 64x512:512x48_n"int8:big_k:14"
24 | 64x512:512x64_n"int8:big_k:15"
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/test_brgemm_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --batch=test_brgemm_ci
 4 | 
 5 | --batch=test_brgemm_f32
 6 | 
 7 | --batch=test_brgemm_bf16
 8 | 
 9 | --batch=test_brgemm_f16
10 | 
11 | --batch=test_brgemm_f8
12 | 
13 | --batch=test_brgemm_int8
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/test_brgemm_f32:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --batch=harness_brgemm_f32
 4 | 
 5 | --batch=harness_brgemm_fpmath
 6 | 
 7 | --reset
 8 | --brgemm-attr=generate_skip_accumulation:1
 9 | --beta=0,1
10 | --bia_dt=f32
11 | --dt=f32
12 | --batch=option_set_f32
13 | 
14 | # ukernel wtag support
15 | --reset
16 | --wtag=ba
17 | --dt=f32
18 | --batch=option_set_f32
19 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/test_brgemm_f8:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --batch=harness_brgemm_f8
 4 | 
 5 | # ukernel wtag support
 6 | --reset
 7 | --wtag=ba
 8 | --dt=f8_e5m2,f8_e4m3
 9 | --batch=option_set_int8
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/test_brgemm_regression:
--------------------------------------------------------------------------------
1 | # Incorrect mask comparison in lazy hw config, need two problems to run back-to-back
2 | --reset --dt=bf16 128x448:448x32 42x608:608x32
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/brgemm/test_brgemm_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*64:64.* # Use shapes from ci with K=64 only
 4 | 
 5 | --bs=16
 6 | --dt=f32
 7 | --bia_dt=undef,f32
 8 | --attr-post-ops=,relu
 9 | --batch=shapes_2d_no_tail_f32
10 | 
11 | --dt=f16,bf16
12 | --batch=shapes_2d_no_tail_bf16
13 | 
14 | --dt=u8:s8:f32,s8:s8:s32
15 | --attr-scales=,src:common:0.5+wei:per_oc
16 | --attr-zero-points=,src:common:128+dst:common:-1
17 | --batch=shapes_2d_no_tail_int8
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/concat/test_concat_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dtag=undef,any,abx,axb
 4 | --sdt=f32,bf16,f16,s32,s8,u8
 5 | --ddt=f32,bf16,f16,s32,s8,u8
 6 | --stag=abx:abx,axb:axb
 7 | --axis=1
 8 | 2x16x3x4:2x16x3x4
 9 | 3x5x5x17:3x4x5x17
10 | --axis=0
11 | 32x4x5x17:16x4x5x17
12 | 1x16x3x4:2x16x3x4
13 | 
14 | --stag=abx:abx:abx,axb:axb:axb
15 | --axis=1
16 | --attr-scales=,msrc0:common:1.5,msrc0:common:1.5+msrc1:common:2.5
17 | 6x48x3x4x5:6x32x3x4x5:6x16x3x4x5
18 | 6x48x3x4x5:6x31x3x4x5:6x16x3x4x5
19 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/concat/test_concat_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dtag=undef
 4 | --sdt=f32,bf16,f16,s8,u8
 5 | --ddt=f32,bf16,f16,s8,u8
 6 | --stag=axb:axb
 7 | --axis=1
 8 | 2x16x3x4:2x16x3x4
 9 | 3x5x5x17:3x4x5x17
10 | 
11 | --stag=axb:axb:axb
12 | --axis=1
13 | --attr-scales=,msrc0:common:1.5+msrc1:common:2.5
14 | 6x48x3x4x5:6x32x3x4x5:6x16x3x4x5
15 | 6x48x3x4x5:6x31x3x4x5:6x16x3x4x5
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/harness_conv_arbitrary_dst:
--------------------------------------------------------------------------------
 1 | --reset
 2 | --mb=2,16
 3 | --dir=FWD_B
 4 | 
 5 | # mixed half-precision types
 6 | --dt=bf16,bf16:bf16:f32,f16:f16:f32
 7 | --batch=set_gpu
 8 | 
 9 | # int8 dst
10 | --dt=f16:f16:s8,f16:f16:u8,bf16:bf16:s8,bf16:bf16:u8,f32:f32:s8,f32:f32:u8
11 | --batch=set_gpu
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/harness_conv_auto:
--------------------------------------------------------------------------------
1 | # auto algo
2 | --reset --dt=f32 --alg=auto
3 | --dir=FWD_B --batch=shapes_auto
4 | --dir=BWD_D --batch=shapes_auto
5 | --dir=BWD_WB --batch=shapes_auto
6 | --dt=u8:s8:s8
7 | --dir=FWD_B --batch=shapes_auto
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/harness_conv_deepbench:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dir=FWD_B,BWD_D,BWD_W
 4 | --batch=shapes_deepbench_inference_device
 5 | --batch=shapes_deepbench_inference_server
 6 | --batch=shapes_deepbench_training
 7 | 
 8 | --dir=FWD_B
 9 | --attr-post-ops=relu
10 | --batch=shapes_deepbench_inference_device
11 | --batch=shapes_deepbench_inference_server
12 | --batch=shapes_deepbench_training
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/harness_conv_dilated_int8:
--------------------------------------------------------------------------------
 1 | # Dilated Int8
 2 | --reset
 3 | --mb=2
 4 | --dir=FWD_D
 5 | --dt=u8:s8:u8,s8:s8:u8
 6 | --batch=shapes_dilated_rfcn
 7 | --match=.*fc6.* --batch=shapes_ssd_300_voc0712
 8 | 
 9 | --reset
10 | --dir=FWD_D --mb=2
11 | --skip-impl=ref,x64:gemm
12 | --dt=u8:s8:s32
13 | --batch=set_dilated-conv_1st
14 | --batch=set_dilated-conv
15 | --batch=set_dilated-conv_3d
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/harness_conv_f32:
--------------------------------------------------------------------------------
1 | # f32
2 | --reset --dt=f32
3 | --mb=2                      # for fwd and bwd_d reduce mb
4 | --dir=FWD_B --batch=set_conv_all
5 | --dir=BWD_D --batch=set_conv_all
6 | --dir=BWD_WB --batch=set_conv_all
7 | --mb=0                      # for bwd_w use the actual mb for 1 topology
8 | --dir=BWD_WB --batch=shapes_resnet_50
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/harness_conv_saturation_int8:
--------------------------------------------------------------------------------
 1 | # Test that saturation is handled properly (lightweight test)
 2 | --reset
 3 | --mb=2
 4 | --dir=FWD_B
 5 | --dt=u8:s8:u8,u8:s8:s8,u8:s8:s32
 6 | --attr-scales=src:common:0.25+wei:common:0.5+dst:common:4294967295
 7 | ic16oc16ih4oh4kh1ph0 # jit 1x1
 8 | ic16oc16ih4oh4kh3ph0 # jit
 9 | ic16oc16ih4oh7kh3ph3 # gemm impl
10 | 
11 | --reset
12 | --dt=u8:s8:u8
13 | --attr-scales=src:common:0.25+wei:common:0.5+dst:common:4294967295
14 | --attr-post-ops=round
15 | ic16oc16_ih5oh5kh3ph1
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/harness_conv_tags:
--------------------------------------------------------------------------------
 1 | # Test different combinations of memory format tags for input tensors
 2 | --reset
 3 | --mb=2
 4 | 
 5 | # Source or destination tensor may be in plain layout
 6 | --stag=any,abx,axb
 7 | --dtag=any,abx,axb
 8 | 
 9 | # Training: bf16 and f32
10 | --dt=bf16,f32
11 | --dir=FWD_B,BWD_D,BWD_WB
12 | --batch=shapes_basic
13 | # Inference: int8
14 | --dt=u8:s8:u8
15 | --dir=FWD_B
16 | --batch=shapes_basic
17 | 
18 | # Winograd
19 | --alg=wino
20 | --match=.*k[dhw]3.*
21 | --dt=f32 # Training: f32 (no bf16 for Winograd)
22 | --dir=FWD_B,BWD_D,BWD_WB
23 | --batch=shapes_basic
24 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/perf_conv_bdw_1sock:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dt=f32
 4 | --dir=FWD_D,BWD_D,BWD_W
 5 | --alg=direct
 6 | 
 7 | --mb=0,20,40,80
 8 | --batch=set_perf_cpu_large_mb
 9 | 
10 | --mb=1,4,8
11 | --batch=set_perf_cpu_all_mb
12 | 
13 | --reset
14 | ## int8
15 | --dt=u8:s8:u8
16 | --dir=FWD_D
17 | --alg=direct
18 | 
19 | --mb=0,20,40,80
20 | --batch=set_perf_cpu_large_mb
21 | 
22 | --mb=1,4,8
23 | --batch=set_perf_cpu_all_mb
24 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/perf_conv_clx_1sock:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Direct
 4 | ## f32
 5 | --dt=f32
 6 | --dir=FWD_D,BWD_D,BWD_W
 7 | --alg=direct
 8 | 
 9 | --mb=0,26,52,104
10 | --batch=set_perf_cpu_large_mb
11 | 
12 | --mb=1,4,8
13 | --batch=set_perf_cpu_all_mb
14 | 
15 | --reset
16 | ## int8
17 | --dt=u8:s8:u8
18 | --dir=FWD_D
19 | --alg=direct
20 | 
21 | --mb=0,26,52,104
22 | --batch=set_perf_cpu_large_mb
23 | 
24 | --mb=1,4,8
25 | --batch=set_perf_cpu_all_mb
26 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/perf_conv_skx_1sock:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Direct
 4 | ## f32
 5 | --dt=f32
 6 | --dir=FWD_D,BWD_D,BWD_W
 7 | --alg=direct
 8 | 
 9 | --mb=0,16,32,64
10 | --batch=set_perf_cpu_large_mb
11 | 
12 | --mb=1,4,8
13 | --batch=set_perf_cpu_all_mb
14 | 
15 | --reset
16 | ## int8
17 | --dt=u8:s8:u8
18 | --dir=FWD_D
19 | --alg=direct
20 | 
21 | --mb=0,16,32,64
22 | --batch=set_perf_cpu_large_mb
23 | 
24 | --mb=1,4,8
25 | --batch=set_perf_cpu_all_mb
26 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/perf_conv_xe:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Forward
 4 | 
 5 | # f32
 6 | --dir=FWD_B
 7 | --dt=f32
 8 | --mb=16,64
 9 | --batch=set_perf_gpu_large_mb
10 | 
11 | --dir=FWD_B
12 | --dt=f32
13 | --mb=1
14 | --batch=set_perf_gpu_all_mb
15 | 
16 | # f16
17 | --dt=f16
18 | --dir=FWD_B
19 | 
20 | --mb=16,64
21 | --batch=set_perf_gpu_large_mb
22 | 
23 | --mb=1
24 | --batch=set_perf_gpu_all_mb
25 | 
26 | # Backward
27 | 
28 | # f32
29 | --dt=f32
30 | --dir=BWD_D,BWD_W
31 | --mb=64
32 | --batch=set_perf_gpu_large_mb
33 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/perf_conv_xe_hp:
--------------------------------------------------------------------------------
 1 | --batch=perf_conv_xe_lp
 2 | 
 3 | --reset
 4 | 
 5 | # Forward, bf16
 6 | 
 7 | --dir=FWD_B
 8 | --dt=bf16
 9 | --mb=16,64
10 | --batch=set_perf_gpu_large_mb
11 | 
12 | --dir=FWD_B
13 | --dt=bf16
14 | --mb=1
15 | --batch=set_perf_gpu_all_mb
16 | 
17 | # Backward, bf16
18 | 
19 | --dt=bf16
20 | --dir=BWD_D,BWD_W
21 | --mb=64
22 | --batch=set_perf_gpu_large_mb
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/perf_conv_xe_lp:
--------------------------------------------------------------------------------
 1 | --batch=perf_conv_xe
 2 | 
 3 | --reset
 4 | 
 5 | # Forward, int8
 6 | --dt=u8:s8:s8
 7 | --dir=FWD_B
 8 | 
 9 | --mb=16,64
10 | --batch=set_perf_gpu_large_mb
11 | 
12 | --mb=1
13 | --batch=set_perf_gpu_all_mb
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_conv_3d:
--------------------------------------------------------------------------------
 1 | # 3D Convolutions
 2 | # ngroups = 1; ic = oc = 16
 3 | # 1x1 Filter
 4 | --batch=shapes_3d_1x1_unit-stride_no-padding
 5 | --batch=shapes_3d_1x1_unit-stride_padding
 6 | --batch=shapes_3d_1x1_strided_no-padding
 7 | --batch=shapes_3d_1x1_strided_padding
 8 | # N-sized Filter
 9 | # ngroups = 1; ic = 1; oc = 16
10 | --batch=shapes_3d_1st_strided_padding
11 | # ngroups = 1; ic = oc = 16
12 | --batch=shapes_3d_unit-stride_no-padding
13 | --batch=shapes_3d_unit-stride_padding
14 | --batch=shapes_3d_strided_no-padding
15 | --batch=shapes_3d_strided_padding
16 | # Depth + Height
17 | --batch=shapes_3d_2d_strided_padding
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_conv_all:
--------------------------------------------------------------------------------
 1 | --batch=shapes_1d
 2 | --batch=shapes_1d_wavenet
 3 | --batch=shapes_3d_unet
 4 | --batch=shapes_alexnet
 5 | --batch=shapes_vgg_19
 6 | --batch=shapes_resnet_50
 7 | --batch=shapes_googlenet_v1
 8 | --batch=shapes_googlenet_v2
 9 | --batch=shapes_googlenet_v3
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_dilated-conv:
--------------------------------------------------------------------------------
 1 | # Dilated Convolution
 2 | # 1D
 3 | --batch=shapes_dilated_1d_strided_no-padding
 4 | --batch=shapes_dilated_1d_unit-stride_no-padding
 5 | --batch=shapes_dilated_1d_unit-stride_padding
 6 | --batch=shapes_dilated_1d_strided_padding
 7 | # 2D
 8 | --batch=shapes_dilated_2d_strided_no-padding
 9 | --batch=shapes_dilated_2d_strided_padding
10 | --batch=shapes_dilated_2d_unit-stride_no-padding
11 | --batch=shapes_dilated_2d_unit-stride_padding
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_dilated-conv_1st:
--------------------------------------------------------------------------------
1 | # Dilated Convolutions
2 | # ic = 1 for 1st convolution code-path
3 | --batch=shapes_dilated_1d_1st_strided_padding
4 | --batch=shapes_dilated_2d_1st_strided_padding
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_dilated-conv_3d:
--------------------------------------------------------------------------------
1 | # 3D Dilated Convolution
2 | # N-sized Filter
3 | --batch=shapes_dilated_3d_strided_no-padding
4 | --batch=shapes_dilated_3d_strided_padding
5 | --batch=shapes_dilated_3d_unit-stride_no-padding
6 | --batch=shapes_dilated_3d_unit-stride_padding
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_fastrcnn:
--------------------------------------------------------------------------------
1 | # FastRCNN
2 | --batch=shapes_fastrcnn_p1
3 | --batch=shapes_fastrcnn_p2
4 | #--batch=shapes_fastrcnn_p3 # no unique conv. shapes
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_gpu:
--------------------------------------------------------------------------------
1 | --batch=shapes_1d_wavenet
2 | --batch=shapes_alexnet
3 | --batch=shapes_resnet_50
4 | --batch=shapes_3d_gpu
5 | --batch=shapes_mobilenet_dw
6 | --batch=shapes_tails_gpu
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_maskrcnn:
--------------------------------------------------------------------------------
1 | # MaskRCNN
2 | --batch=shapes_maskrcnn_p1
3 | --batch=shapes_maskrcnn_p2
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_perf_cpu_all_mb:
--------------------------------------------------------------------------------
1 | --batch=set_perf_cpu_large_mb
2 | --batch=set_perf_cpu_small_mb
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_perf_cpu_inference_only:
--------------------------------------------------------------------------------
1 | --batch=shapes_ssd_resnet34_inference
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_perf_cpu_large_mb:
--------------------------------------------------------------------------------
 1 | --batch=shapes_1d_wavenet
 2 | --batch=shapes_alexnet
 3 | --batch=shapes_densnet
 4 | --batch=shapes_efficientdet
 5 | --batch=shapes_googlenet_v1
 6 | --batch=shapes_googlenet_v2
 7 | --batch=shapes_googlenet_v3
 8 | --batch=shapes_mobilenet
 9 | --batch=shapes_mobilenet_dw
10 | --batch=shapes_resnet_50
11 | --batch=shapes_resnet_50_sparse
12 | --batch=shapes_ssd_300_voc0712
13 | --batch=shapes_ssd_mobilenet
14 | --batch=shapes_ssd_resnet34_training
15 | --batch=shapes_unet
16 | --batch=shapes_vgg_19
17 | --batch=shapes_yolov2
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_perf_cpu_small_mb:
--------------------------------------------------------------------------------
1 | --batch=set_maskrcnn
2 | --batch=set_fastrcnn
3 | --batch=shapes_3d_unet
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_perf_gpu_all_mb:
--------------------------------------------------------------------------------
1 | --batch=set_perf_gpu_large_mb
2 | --batch=set_perf_gpu_small_mb
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_perf_gpu_large_mb:
--------------------------------------------------------------------------------
 1 | --batch=shapes_1d_wavenet
 2 | --batch=shapes_a3c
 3 | --batch=shapes_alexnet
 4 | --batch=shapes_densnet
 5 | --batch=shapes_ffn
 6 | --batch=shapes_googlenet_v3
 7 | --batch=shapes_mobilenet
 8 | --batch=shapes_mobilenet_dw
 9 | --batch=shapes_pointnet
10 | --batch=shapes_resnet_50
11 | --batch=shapes_resnet_50_sparse
12 | --batch=shapes_unet
13 | --batch=shapes_vgg_19
14 | --batch=shapes_yolov2
15 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_perf_gpu_small_mb:
--------------------------------------------------------------------------------
1 | --batch=set_maskrcnn
2 | --batch=set_fastrcnn
3 | --batch=shapes_3d_unet
4 | --batch=shapes_cosmictagger
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/set_topologies_inference_only:
--------------------------------------------------------------------------------
1 | --batch=shapes_ssd_resnet34_inference
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_3d_gpu:
--------------------------------------------------------------------------------
 1 | #test cases for 3D convolution
 2 | 
 3 | ic16oc16_ih13kh3ph1_iw50kw3pw1_id10kd3pd1_n"3d_conv_pad:1"
 4 | ic64oc64_ih10kh3ph1_iw20kw3pw1_id15kd3pd1_n"3d_conv_pad:2"
 5 | ic256oc256_ih7kh3ph1_iw9kw3pw1_id11kd3pd1_n"3d_conv_pad:3"
 6 | ic256oc256_ih7kh1ph1_iw9kw1pw1_id11kd1pd1_n"3d_conv_pad:4"
 7 | ic16oc16_ih13kh3ph0_iw50kw3pw0_id10kd3pd0_n"3d_conv:1"
 8 | ic16oc16_ih13kh1ph0_iw50kw1pw0_id10kd1pd0_n"3d_conv:2"
 9 | ic256oc256_ih7kh3ph0_iw9kw3pw0_id11kd3pd0_n"3d_conv:3"
10 | ic256oc256_ih7kh1ph0_iw9kw1pw0_id11kd1pd0_n"3d_conv:4"
11 | 
12 | #mb1_ic16oc16_id1od2kd1sd1dd1pd0_ih2oh3kh2sh1dh1ph2_iw5ow3kw3sw2dw0pw0
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_a3c:
--------------------------------------------------------------------------------
1 | # A3C
2 | 
3 | mb1_g1ic4oc16_ih84oh20kh8sh4dh0ph0_iw84ow20kw8sw4dw0pw0_n"a3c:conv1"
4 | mb1_g1ic16oc32_ih20oh9kh4sh2dh0ph0_iw20ow9kw4sw2dw0pw0_n"a3c:conv2"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_alexnet:
--------------------------------------------------------------------------------
1 | # alexnet
2 | 
3 | g1mb256ic3ih227iw227oc96oh55ow55kh11kw11sh4sw4ph0pw0n"alexnet:conv1"
4 | g2mb256ic96ih27oc256oh27kh5ph2n"alexnet:conv2"
5 | mb256ic256ih13oc384oh13kh3ph1n"alexnet:conv3"
6 | g2mb256ic384ih13oc384oh13kh3ph1n"alexnet:conv4"
7 | g2mb256ic384ih13oc256oh13kh3ph1n"alexnet:conv5"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_auto:
--------------------------------------------------------------------------------
1 | mb2_ic3oc64_ih300kh3oh300ph1n"ssd_300_voc0712:conv1_1"
2 | mb9_ic3oc64_ih300kh3oh300ph1n"ssd_300_voc0712:conv1_1"
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_dilated_rfcn:
--------------------------------------------------------------------------------
1 | # Dilated Convolution from RFCN
2 | 
3 | mb1ic512ih38iw57oc512oh38ow57kh3kw3ph3pw3dh2dw2n"dilated_rfcn"
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_ffn:
--------------------------------------------------------------------------------
1 | # FFN
2 | 
3 | mb4ic2id33oc32od33kd3pd1n"ffn:0*2"
4 | mb4ic32id33oc1od33kd1pd0n"ffn:1*3"
5 | mb4ic32id33oc32od33kd3pd1n"ffn:2*69"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_fused_large_src:
--------------------------------------------------------------------------------
1 | # large-mb, small-ic shape to exceed combined L2 cache and target jit impls
2 | mb448_ic16oc64_ih56oh56kh1sh1dh0ph0n"fused_large_src"
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_fused_mobilenet_stride_2:
--------------------------------------------------------------------------------
 1 | ic32oc64_ih112oh112kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv1"
 2 | ic128oc128_ih56oh56kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv2"
 3 | ic256oc256_ih28oh28kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv3"
 4 | ic512oc512_ih14oh14kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv4"
 5 | 
 6 | ic16oc96_ih112oh112kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv1"
 7 | ic24oc144_ih56oh56kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv2"
 8 | ic64oc384_ih28oh28kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv3"
 9 | ic96oc576_ih14oh14kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv4"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_large_conv:
--------------------------------------------------------------------------------
 1 | # Large iw
 2 | mb1ic32oc1iw134217732kw3
 3 | mb1ic1oc1iw4294967311kw3
 4 | 
 5 | # Large ic
 6 | mb1ic4294967311oc1iw1kw1
 7 | mb1ic134217732oc1iw27kw3
 8 | 
 9 | # Large oc
10 | mb1ic1oc1342177321iw27kw3
11 | mb1ic1oc4294967311iw1kw1
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_large_padding:
--------------------------------------------------------------------------------
1 | ic16oc16_iw1kw3ow100pw100
2 | ic1oc64_ih55oh18kh30sh3ph29dh2
3 | ic1oc64ih7oh7kh3ph1sh1dh0
4 | ic1oc64iw100ow98kw5pw50sw2dw0
5 | ic2oc64ih30oh37kh14ph10sh1dh2
6 | ic3oc64id28od36kd3pd5sd1dd0
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_pointnet:
--------------------------------------------------------------------------------
 1 | # PointNet
 2 | 
 3 | mb10ic1088ih1iw15000oc512oh1ow15000kh1kw1ph0pw0n"pointnet:0*300"
 4 | mb10ic128ih1iw15000oc1024oh1ow15000kh1kw1ph0pw0n"pointnet:1*900"
 5 | mb10ic128ih1iw15000oc5oh1ow15000kh1kw1ph0pw0n"pointnet:2*300"
 6 | mb10ic256ih1iw15000oc128oh1ow15000kh1kw1ph0pw0n"pointnet:3*300"
 7 | mb10ic4ih1iw15000oc64oh1ow15000kh1kw1ph0pw0n"pointnet:4*500"
 8 | mb10ic512ih1iw15000oc256oh1ow15000kh1kw1ph0pw0n"pointnet:5*300"
 9 | mb10ic64ih1iw15000oc128oh1ow15000kh1kw1ph0pw0n"pointnet:6*900"
10 | mb10ic64ih1iw15000oc64oh1ow15000kh1kw1ph0pw0n"pointnet:7*900"
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_regression_gemm:
--------------------------------------------------------------------------------
1 | # ResNext50
2 | mb2_g32ic128oc128_ih56oh56kh3sh1dh0ph1_iw56ow56kw3sw1dw0pw1
3 | 
4 | # Faster RCNN
5 | mb1_g64ic256oc256_ih240oh240kh3sh1dh0ph1_iw352ow352kw3sw1dw0pw1
6 | 
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_segnet:
--------------------------------------------------------------------------------
1 | # Minibatch 4 is found in the original prototxt
2 | # Not clear if that's what should be used for perf measurements
3 | 
4 | mb4ih240iw320ic64oc64oh240ow320kh7kw7ph3pw3sh1sw1n"conv2"
5 | mb4ih120iw160ic64oc64oh120ow160kh7kw7ph3pw3sh1sw1n"conv3"
6 | mb4ih60iw80ic64oc64oh60ow80kh7kw7ph3pw3sh1sw1n"conv4"
7 | mb4ih480iw640ic64oc64oh480ow640kh7kw7ph3pw3sh1sw1n"conv_decode1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/shapes_vgg_11:
--------------------------------------------------------------------------------
 1 | # vgg_11
 2 | 
 3 | mb64ic3ih100oc32oh100kh3ph1n"vgg_11:conv1_1"
 4 | mb64ic32ih100oc64oh100kh3ph1n"vgg_11:conv1_2"
 5 | mb64ic64ih50oc64oh50kh3ph1n"vgg_11:conv2_1"
 6 | mb64ic64ih50oc128oh50kh3ph1n"vgg_11:conv2_2"
 7 | mb64ic128ih25oc96oh25kh3ph1n"vgg_11:conv3_1"
 8 | mb64ic96ih25oc192oh25kh3ph1n"vgg_11:conv3_2"
 9 | mb64ic192ih13oc128oh13kh3ph1n"vgg_11:conv4_1"
10 | mb64ic128ih13oc256oh13kh3ph1n"vgg_11:conv4_2"
11 | mb64ic256ih7oc160oh7kh3ph1n"vgg_11:conv5_1"
12 | mb64ic160ih7oc320oh7kh3ph1n"vgg_11:conv5_2"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_3d:
--------------------------------------------------------------------------------
 1 | # f32 3-D Convolutions
 2 | --reset --dt=f32
 3 | --mb=2
 4 | --skip-impl=ref,x64:gemm      # ! test jit version only
 5 | --dir=FWD_B,BWD_D,BWD_WB
 6 | --batch=shapes_3d
 7 | --batch=set_conv_3d
 8 | 
 9 | # i8 3-D Convolution
10 | --reset
11 | --skip-impl=ref,x64:gemm # ! test jit version only
12 | --mb=2
13 | --dt=u8:s8:s8,s8:s8:s32,u8:s8:s8
14 | --batch=shapes_3d
15 | --batch=set_conv_3d
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_3d_f32_plain:
--------------------------------------------------------------------------------
 1 | # f32 3-D Convolutions
 2 | --reset --dt=f32
 3 | --stag=axb --dtag=axb
 4 | --mb=2
 5 | --skip-impl=ref,x64:gemm      # ! test jit version only
 6 | --dir=FWD_B,BWD_D,BWD_WB
 7 | --batch=shapes_3d
 8 | --batch=set_conv_3d
 9 | 
10 | --stag=abx --dtag=abx
11 | --batch=shapes_3d
12 | --batch=set_conv_3d
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_all_topologies:
--------------------------------------------------------------------------------
 1 | # Test All Topologies with F32 Configuration
 2 | --reset --dt=f32
 3 | --skip-impl=ref
 4 | --mb=2
 5 | --dir=FWD_B,BWD_D,BWD_WB
 6 | --batch=set_all_topologies
 7 | 
 8 | --dir=FWD_B
 9 | --batch=set_topologies_inference_only
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_all_topologies_f32_plain:
--------------------------------------------------------------------------------
 1 | # Test All Topologies with F32 Configuration
 2 | --reset --dt=f32
 3 | --stag=axb --dtag=axb
 4 | --skip-impl=ref
 5 | --mb=2
 6 | --dir=FWD_B,BWD_D,BWD_WB
 7 | --batch=set_all_topologies
 8 | 
 9 | --dir=FWD_B
10 | --batch=set_topologies_inference_only
11 | 
12 | --stag=abx --dtag=abx
13 | --dir=FWD_B,BWD_D,BWD_WB
14 | --batch=set_all_topologies
15 | 
16 | --dir=FWD_B
17 | --batch=set_topologies_inference_only
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_attrs:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset
 3 | --mb=2
 4 | --dir=FWD_B
 5 | 
 6 | --dt=f32
 7 | --skip-impl=ref,x64:gemm      # ! test jit version only
 8 | --batch=option_set_combined_postops
 9 | --skip-impl=
10 | 
11 | --dt=f32,f16,bf16,u8:s8:u8
12 | --skip-impl=ref
13 | --batch=option_set_all_eltwise_postops
14 | --skip-impl=
15 | 
16 | # f32_wino
17 | --reset
18 | --mb=2
19 | --dir=FWD_B
20 | 
21 | --dt=f32
22 | --alg=wino
23 | --attr-post-ops=sum+relu
24 | --batch=shapes_resnet_50
25 | 
26 | # Depthwise fusion
27 | --batch=harness_conv_fused_depthwise
28 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_attrs_f32_plain:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset --dt=f32
 3 | --mb=2
 4 | --stag=axb --dtag=axb
 5 | --dir=FWD_B
 6 | 
 7 | --skip-impl=ref,x64:gemm      # ! test jit version only
 8 | --batch=option_set_combined_postops
 9 | 
10 | --stag=abx --dtag=abx
11 | --batch=option_set_all_eltwise_postops
12 | 
13 | --stag=axb --dtag=axb
14 | --skip-impl=ref
15 | --batch=option_set_all_eltwise_postops
16 | 
17 | --stag=abx --dtag=abx
18 | --batch=option_set_all_eltwise_postops
19 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_bfloat16_ymm:
--------------------------------------------------------------------------------
 1 | # Currently, only gemm routine implements bf16_ymm hint
 2 | # Therefore test bfloat16 GeMM specific functionality
 3 | 
 4 | # global benchdnn knob, will not be reset again
 5 | --cpu-isa-hints=prefer_ymm
 6 | --reset
 7 | 
 8 | --batch=test_conv_gemm_bfloat16
 9 | --batch=test_conv_gemm_bfloat16_nxc
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_depthwise:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset
 3 | --mb=2
 4 | --dir=FWD_D,BWD_D,BWD_WB,FWD_I --batch=shapes_mobilenet_dw
 5 | --dir=FWD_D,BWD_D,BWD_WB,FWD_I --batch=shapes_regression_dw
 6 | 
 7 | # post-ops
 8 | --dir=FWD_D
 9 | --attr-post-ops=relu,sum,sum+relu+add:f32:per_tensor,add:f32:per_oc
10 | --batch=shapes_mobilenet_dw
11 | 
12 | --reset --dt=f32
13 | --mb=2
14 | --skip-impl=ref,x64:gemm
15 | --dir=FWD_B,BWD_D,BWD_WB,FWD_I
16 | --batch=set_conv_dw
17 | --batch=shapes_dw_minibatch_2d-spatial
18 | --batch=shapes_dw_minibatch_channel_2d-spatial
19 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_dilated:
--------------------------------------------------------------------------------
 1 | # dilated f32
 2 | --reset
 3 | --mb=2
 4 | --dt=f32
 5 | --dir=FWD_B,BWD_D,BWD_WB
 6 | --batch=shapes_dilated --batch=shapes_dilated_rfcn
 7 | --match=.*fc6.* --batch=shapes_ssd_300_voc0712
 8 | 
 9 | --reset --dt=f32
10 | --mb=2
11 | --skip-impl=ref,x64:gemm      # ! test jit version only
12 | --dir=FWD_B,BWD_D,BWD_WB
13 | --batch=set_dilated-conv_1st
14 | --batch=set_dilated-conv
15 | --batch=set_dilated-conv_3d
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_dt:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # f32
 4 | --batch=harness_conv_f32
 5 | 
 6 | # tails
 7 | --reset
 8 | --dir=FWD_B,BWD_D,BWD_WB  --batch=shapes_tails
 9 | 
10 | # bf16
11 | --batch=test_conv_bfloat16
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_dt_plain:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # f32
 4 | --batch=harness_conv_f32_plain
 5 | 
 6 | # tails
 7 | --reset
 8 | --skip-impl=ref
 9 | --stag=axb --dtag=axb
10 | --dir=FWD_B,BWD_D,BWD_WB  --batch=shapes_tails
11 | --stag=abx --dtag=abx
12 | --batch=shapes_tails
13 | 
14 | # bf16
15 | --batch=test_conv_bfloat16_nxc
16 | 
17 | # f16
18 | --batch=test_conv_float16_nxc
19 | 
20 | # fp8
21 | --batch=test_conv_fp8_nxc
22 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_fp4:
--------------------------------------------------------------------------------
1 | --reset --mb=1 --dt=f4_e2m1:f4_e2m1:bf16,f4_e3m0:f4_e3m0:bf16 --dir=fwd_d --batch=shapes_4bit
2 | --reset --mb=1 --dt=f16:f4_e2m1:f4_e2m1,f32:f4_e3m0:f4_e3m0 --dir=bwd_d --batch=shapes_4bit
3 | --reset --mb=1 --dt=f4_e2m1:f16:f4_e2m1,f4_e3m0:f32:f4_e3m0 --dir=bwd_w --batch=shapes_4bit
4 | --reset --mb=1 --dt=f4_e2m1,f4_e3m0 --dir=fwd_d --batch=shapes_4bit
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_function:
--------------------------------------------------------------------------------
 1 | # Test functional convolution features
 2 | 
 3 | --reset
 4 | 
 5 | # auto algo
 6 | --batch=harness_conv_auto
 7 | 
 8 | # memory-tags
 9 | --batch=harness_conv_tags
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_gemm_bfloat16:
--------------------------------------------------------------------------------
 1 | # Test Bfloat16 GeMM specific functionality
 2 | --reset
 3 | --mb=2
 4 | --skip-impl=ref
 5 | --dir=FWD_B
 6 | --dt=bf16:bf16:f32 --batch=shapes_gemm
 7 | 
 8 | --dir=FWD_D
 9 | --dt=bf16 --batch=shapes_gemm
10 | 
11 | --dir=BWD_D
12 | --dt=f32:bf16:bf16 --batch=shapes_gemm
13 | 
14 | --dir=BWD_WB
15 | --dt=bf16:f32:bf16 --batch=shapes_gemm
16 | 
17 | --dir=FWD_B,BWD_D,BWD_W
18 | --dt=bf16
19 | --stag=abx --dtag=abx
20 | --batch=shapes_gemm
21 | --batch=shapes_3d_2d_strided_padding
22 | --batch=shapes_dilated_3d_strided_padding
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_gemm_bfloat16_nxc:
--------------------------------------------------------------------------------
 1 | # bfloat16
 2 | --reset
 3 | --mb=2
 4 | --stag=axb --dtag=axb
 5 | --skip-impl=ref
 6 | 
 7 | --dir=FWD_B
 8 | --dt=bf16:bf16:f32,bf16
 9 | --batch=shapes_gemm
10 | 
11 | --dir=BWD_D
12 | --dt=f32:bf16:bf16,bf16
13 | --batch=shapes_gemm
14 | 
15 | --dir=BWD_WB
16 | --mb=0
17 | --dt=bf16:f32:bf16,bf16
18 | --batch=shapes_gemm
19 | 
20 | # PostOps + Attributes
21 | --dir=FWD_B
22 | --dt=bf16:bf16:f32,bf16
23 | --mb=2
24 | --attr-post-ops=sum,relu,sum+tanh:1:1:2.5 --batch=shapes_gemm
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_gemm_dt:
--------------------------------------------------------------------------------
 1 | # Test GeMM specific functionality
 2 | 
 3 | # f32
 4 | --reset --dt=f32
 5 | --mb=2                      # for fwd and bwd_d reduce mb
 6 | --dir=FWD_B,BWD_D,BWD_WB --batch=shapes_gemm
 7 | 
 8 | --stag=abx --dtag=abx
 9 | --batch=shapes_3d_2d_strided_padding
10 | --batch=shapes_dilated_3d_strided_padding
11 | 
12 | # bf16
13 | --batch=test_conv_gemm_bfloat16
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_gpu_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --stag=any
 4 | --dtag=any
 5 | --batch=option_gpu_ci
 6 | 
 7 | --stag=axb
 8 | --dtag=axb
 9 | --batch=option_gpu_ci
10 | 
11 | --stag=any
12 | --dtag=axb
13 | --dir=FWD_B
14 | --dt=f64,f32,bf16,f16,s8,f8_e4m3,f8_e5m2
15 | --batch=shapes_ci_gpu
16 | 
17 | --dir=BWD_D,BWD_W
18 | --dt=f64,f32,bf16
19 | --batch=shapes_ci_gpu
20 | 
21 | --stag=axb
22 | --dtag=any
23 | --dir=FWD_B
24 | --dt=f64,f32,bf16,f16,s8,f8_e4m3,f8_e5m2
25 | --batch=shapes_ci_gpu
26 | 
27 | --dir=BWD_D,BWD_W
28 | --dt=f64,f32,bf16
29 | --batch=shapes_ci_gpu
30 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_int8:
--------------------------------------------------------------------------------
 1 | # Test Int8 Convolutions
 2 | --reset
 3 | 
 4 | --batch=harness_conv_saturation_int8
 5 | 
 6 | --batch=harness_conv_int8
 7 | 
 8 | --batch=harness_conv_dilated_int8
 9 | 
10 | --batch=harness_conv_attrs_int8
11 | 
12 | --batch=harness_conv_attrs_int8_asymmetric
13 | 
14 | --batch=harness_conv_depthwise_int8
15 | 
16 | --batch=harness_conv_zero_points
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_regression:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset --dt=f32
 3 | --dir=FWD_B,BWD_D,BWD_WB
 4 | --batch=shapes_regression_small_spatial
 5 | --batch=shapes_regression_padding
 6 | --batch=shapes_regression_gemm
 7 | --batch=shapes_regression_1x1
 8 | --stag=axb --dtag=axb --batch=shapes_regression_1x1
 9 | 
10 | --reset --dt=f32
11 | --dir=FWD_B
12 | --attr-post-ops=relu
13 | --batch=shapes_regression_small_spatial
14 | --batch=shapes_regression_padding
15 | 
16 | #
17 | --batch=harness_conv_regression_general
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_wino_f32:
--------------------------------------------------------------------------------
 1 | # f32 wino
 2 | --reset
 3 | --dt=f32
 4 | --alg=wino
 5 | --match=.*kh3[^0-9].*       # only 3x3 convolutions so far
 6 | --mb=2                      # for fwd and bwd_d reduce mb
 7 | --dir=FWD_I,FWD_B,BWD_D,BWD_WB
 8 | --batch=set_conv_all
 9 | --batch=shapes_regression_padding
10 | 
11 | --mb=0
12 | --batch=shapes_tails
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/conv/test_conv_wino_gpu:
--------------------------------------------------------------------------------
1 | # f16, f32 wino
2 | --reset --dt=f16,f32 --stag=any,nhwc --alg=wino
3 | --match=.*[^k][^d][0-9]kh3[^0-9].*       # only 3x3 convolutions so far
4 | --mb=2,32                      # for fwd and bwd_d reduce mb
5 | --batch=set_conv_all
6 | --batch=shapes_regression_padding
7 | --batch=shapes_tails
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/harness_deconv_regression_general_f32:
--------------------------------------------------------------------------------
 1 | # f32 regression : long accumulation chains
 2 | --reset --dt=f32 --dir=bwd_w mb28_ic16oc16_id10od10kd3
 3 | 
 4 | # test brgdeconv strided shape that caused segfault
 5 | --reset
 6 | --skip-impl=ref,x64:gemm
 7 | --dt=f32
 8 | --dir=fwd_b
 9 | mb56_ic2oc3_ih6oh18kh3sh3dh0ph1_iw5ow15kw3sw3dw0pw1_n"2d_strided_with_bias"
10 | 
11 | # test brgdeconv strided shape where the kd/kh block != kd/kh
12 | --reset
13 | --skip-impl=ref,x64:gemm
14 | --dt=f32
15 | --dir=FWD_B
16 | mb1ic16ih16oc128oh32kh64sh2ph1
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/set_all:
--------------------------------------------------------------------------------
1 | --batch=shapes_1d
2 | --batch=shapes_3d
3 | --batch=shapes_2d
4 | --batch=shapes_dilated
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/shapes_1d:
--------------------------------------------------------------------------------
 1 | # 1D
 2 | ic8iw5oc8ow2kw3pw3dw2n"deconv1d:1"
 3 | g1mb256oc3ow227ic96iw55kw11sw4pw0n"alexnet:deconv1"
 4 | g2mb256oc96ow27ic256iw27kw5pw2n"alexnet:deconv2"
 5 | mb256oc256ow13ic384iw13kw3pw1n"alexnet:deconv3"
 6 | g1mb96ic64iw112oc3ow224kw7sw2pw3n"googlenet_v1:conv1/7x7_s2"
 7 | mb1_g1oc3ic64_ow1030iw512kw7sw2dw0pw0_n"masknet_p1:deconv1"
 8 | g1mb50ic256iw28oc512ow56kw1sw2pw0n"resnet_50:res3a_branch1"
 9 | mb9_ic1oc1_ih1oh1kh1sh1dh0ph0_iw55ow55kw3sw1dw0pw1n"pytorch_unittest"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/test_deconv_all:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset
 3 | --skip-impl=ref
 4 | --mb=2
 5 | 
 6 | --dir=FWD_B
 7 | --attr-post-ops=,sum,linear:2:1,sum:1.5+add:f32:per_oc+relu
 8 | --batch=set_all
 9 | 
10 | --dir=BWD_D,BWD_W,BWD_WB
11 | --attr-post-ops=
12 | --batch=set_all
13 | 
14 | # int8
15 | --batch=test_deconv_int8
16 | 
17 | # bf16
18 | --batch=test_deconv_bfloat16
19 | 
20 | # f16
21 | --batch=test_deconv_float16_nxc
22 | 
23 | # Regression
24 | --batch=harness_deconv_regression_general_f32
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/test_deconv_all_f32_nxc:
--------------------------------------------------------------------------------
 1 | # f32 nxc
 2 | --reset
 3 | --skip-impl=ref
 4 | --mb=2
 5 | --stag=axb --dtag=axb
 6 | 
 7 | --dir=FWD_B
 8 | --attr-post-ops=,sum+prelu:per_oc,linear:2:1,sum:1.5+add:f32:per_oc+relu
 9 | --batch=set_all
10 | 
11 | --dir=BWD_D,BWD_W,BWD_WB
12 | --attr-post-ops=
13 | --batch=set_all
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/test_deconv_bfloat16:
--------------------------------------------------------------------------------
 1 | # bf16
 2 | --reset
 3 | --skip-impl=ref
 4 | --mb=2
 5 | 
 6 | --dt=bf16
 7 | --dir=FWD_B
 8 | --attr-post-ops=,sum,linear:2:1,sum:1.5+add:f32:per_oc+relu
 9 | --batch=set_all
10 | 
11 | --dir=BWD_D,BWD_W,BWD_WB
12 | --attr-post-ops=
13 | --batch=set_all
14 | 
15 | --dt=bf16:bf16:f32 --dir=FWD_B --batch=set_all
16 | --dt=f32:bf16:bf16 --dir=BWD_D --batch=set_all
17 | --dt=bf16:f32:bf16 --dir=BWD_WB --batch=set_all
18 | 
19 | # Test Deconv w/bias through GeMM
20 | --reset
21 | --skip-impl=ref
22 | --mb=2
23 | 
24 | --dt=bf16
25 | --dir=FWD_B,BWD_WB g16_ic32ih4iw8_oc64oh3ow8_kh3kw3sh1sw1ph0pw0n"gemm_shape"
26 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/test_deconv_bfloat16_ymm:
--------------------------------------------------------------------------------
1 | # bf16
2 | 
3 | # global benchdnn knob, will not be reset again
4 | --cpu-isa-hints=prefer_ymm
5 | --batch=test_deconv_bfloat16
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/deconv/test_deconv_int8:
--------------------------------------------------------------------------------
 1 | # int8
 2 | --reset
 3 | --skip-impl=ref
 4 | --mb=2
 5 | --dir=FWD_I,FWD_B
 6 | --batch=set_all
 7 | --batch=shapes_1x1
 8 | 
 9 | --batch=harness_deconv_regression_general_int8
10 | --batch=harness_deconv_attrs_int8
11 | --batch=harness_deconv_attrs_int8_asymmetric
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/harness_eltwise_large_buffer:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Use inplace to reduce memory consumption; most eltwise implementations do not
 4 | # use a different implementation for inplace. Use the smallest supported data
 5 | # type to minimize memory usage as well.
 6 | 
 7 | --inplace=true
 8 | 
 9 | --dt=u8
10 | --dir=FWD_D
11 | --attr-post-ops=,add:f32:per_tensor
12 | --batch=shapes_large_buffer
13 | 
14 | --reset
15 | --dt=bf16
16 | --dir=BWD_D
17 | --batch=shapes_large_buffer
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/harness_eltwise_regression:
--------------------------------------------------------------------------------
 1 | # tag `a` regression check
 2 | --reset
 3 | --skip-impl=
 4 | 
 5 | --dir=FWD_D
 6 | --dt=f32
 7 | --tag=a
 8 | --attr-post-ops=mul:f32
 9 | 16
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/harness_eltwise_saturation:
--------------------------------------------------------------------------------
 1 | # int linear saturation check
 2 | --reset
 3 | 
 4 | --dt=u8
 5 | --tag=abx,axb
 6 | --alpha=1,4294967295
 7 | --beta=1,4294967295
 8 | --alg=linear
 9 | --batch=shapes_eltwise
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8:
--------------------------------------------------------------------------------
 1 | # Integer algorithm coverage based on alpha and beta validity
 2 | --alpha=0,0.25,-0.25 --beta=0
 3 | --alg=relu
 4 | --batch=shapes_eltwise
 5 | 
 6 | --alpha=0,0.25,-0.25 --beta=0,0.25,-0.25
 7 | --alg=linear
 8 | --batch=shapes_eltwise
 9 | 
10 | --alpha=0,0.25,-0.25 --beta=0,0.25,-0.25
11 | --alg=clip
12 | --batch=shapes_eltwise
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/option_set_all_algs_int8_ci:
--------------------------------------------------------------------------------
 1 | # Integer algorithm coverage based on alpha and beta validity
 2 | --alpha=-2 --beta=0
 3 | --alg=relu
 4 | --batch=shapes_ci
 5 | 
 6 | --alpha=1 --beta=2
 7 | --alg=linear
 8 | --batch=shapes_ci
 9 | 
10 | --alpha=-2 --beta=3
11 | --alg=clip
12 | --batch=shapes_ci
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/shapes_ci:
--------------------------------------------------------------------------------
1 | 16x16x2x1_n"eltwise_ci_2d:0" # Used in smoke validation, don't change the name
2 | 3x15x2x5x3_n"eltwise_ci_3d:0"
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/shapes_eltwise:
--------------------------------------------------------------------------------
1 | 5x16x3
2 | 16x64x1x1
3 | 3x7x3x2
4 | 2x32x3x2
5 | 32x5x2x3
6 | 2x16x5x2x3
7 | 3x17x2x5x3
8 | 
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/shapes_large_buffer:
--------------------------------------------------------------------------------
1 | # Test a dimension of size 2^32 + 1 to catch integer overflow
2 | 4294967297n"large_dim1"
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/test_eltwise_bfloat16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | --dt=bf16
 5 | --tag=abx,axb,aBx8b,aBx16b,ABx16a16b
 6 | 
 7 | --dir=FWD_D,BWD_D
 8 | --batch=option_set_all_algs
 9 | 
10 | --dir=FWD_D
11 | --attr-post-ops=add:f32+mul:f32:per_oc
12 | --batch=option_set_all_algs
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/test_eltwise_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | 
 5 | --dt=f32,bf16,f16,f8_e5m2,f8_e4m3
 6 | --tag=abx,axb
 7 | --dir=FWD_D
 8 | --attr-post-ops=,mul:s8:per_oc
 9 | --batch=option_set_all_algs_ci
10 | --dir=BWD_D
11 | --attr-post-ops=
12 | --batch=option_set_all_algs_ci
13 | 
14 | --dir=FWD_I
15 | --dt=s32,s8,u8
16 | --attr-post-ops=,mul:f32
17 | --batch=option_set_all_algs_int8_ci
18 | 
19 | 
20 | --dt=f64
21 | --dir=BWD_D,FWD_I
22 | --attr-post-ops=
23 | --batch=option_set_all_algs_ci
24 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/test_eltwise_float16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | --dt=f16
 5 | --tag=abx,axb
 6 | 
 7 | --dir=FWD_D,BWD_D
 8 | --batch=option_set_all_algs
 9 | 
10 | --dir=FWD_D
11 | --attr-post-ops=add:f32+mul:f32:per_oc
12 | --batch=option_set_all_algs
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/test_eltwise_float8:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | --dt=f8_e5m2,f8_e4m3
 5 | --tag=abx,axb
 6 | 
 7 | --dir=FWD_D,BWD_D
 8 | --batch=option_set_all_algs
 9 | 
10 | --dir=FWD_D
11 | --attr-post-ops=add:f32+mul:f32:per_oc
12 | --batch=option_set_all_algs
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/eltwise/test_eltwise_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*eltwise_ci_2d.* # Use 2d problems only from shapes_ci
 4 | --inplace=false
 5 | 
 6 | --dt=f32,bf16,f16
 7 | --tag=axb
 8 | --dir=FWD_D,BWD_D
 9 | --batch=option_set_all_algs_ci
10 | 
11 | --dir=FWD_I
12 | --dt=s8,u8
13 | --batch=option_set_all_algs_int8_ci
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/gnorm/shapes_all:
--------------------------------------------------------------------------------
 1 | # Instance normalization
 2 | # Unet-3D (id is divided by 10 for the first 2 cases)
 3 | g32mb2ic32id16ih224iw224
 4 | g256mb6ic256id2ih28iw28
 5 | g320mb2ic320id10ih14iw14
 6 | g128mb2ic128id6ih7iw7
 7 | 
 8 | # Group normalization
 9 | g1mb2ic2iw40
10 | g2mb2ic8ih30iw40
11 | g5mb2ic10id9ih10iw10
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/gnorm/shapes_ci:
--------------------------------------------------------------------------------
 1 | # Instance normalization
 2 | g2mb2ic2iw4
 3 | g7mb2ic7ih3iw4
 4 | g5mb2ic5id9ih1iw10
 5 | 
 6 | # Group normalization
 7 | g1mb2ic2iw4
 8 | g2mb2ic8ih3iw4
 9 | g5mb2ic10id9ih1iw10
10 | g5mb2ic40id2ih1iw5
11 | g5mb2ic80id2ih1iw5
12 | g1mb1ic128ih2iw2
13 | g2mb2ic30ih1iw1
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/complex_fusion/harness_mlp_all:
--------------------------------------------------------------------------------
1 | --batch=complex_fusion/harness_mlp_ci
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/complex_fusion/harness_mlp_ci:
--------------------------------------------------------------------------------
1 | --reset --dt=bf16,f16 --case=complex_fusion/mlp/gated-mlp-f32.json
2 | 
3 | # WA1: use smaller problem to pass correctness check for f32 on pvc.
4 | # WA2: use subtract binary to avoid precision issue for f32 on xe-lpg.
5 | --reset --in-shapes=0:1x128+1:128x256+4:128x256+13:256x128 --op-kind=12:Subtract --case=complex_fusion/mlp/gated-mlp-f32.json
6 | 
7 | # f16-int4 case
8 | --reset --case=complex_fusion/mlp/gated-mlp-int4.json
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/pattern/harness_bf16_ci:
--------------------------------------------------------------------------------
1 | --reset --dt=bf16 --case=pattern/f32/conv_post_ops_fusion.json
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/pattern/harness_f16_ci:
--------------------------------------------------------------------------------
1 | --reset --dt=f16 --case=pattern/f32/conv_post_ops_fusion.json
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/pattern/harness_f32_ci:
--------------------------------------------------------------------------------
1 | --reset --in-shapes=0:2x64x112x112+1:32x64x2x2 --case=pattern/f32/conv_post_ops_fusion.json
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/pattern/harness_f8_all:
--------------------------------------------------------------------------------
 1 | --reset --case=pattern/f8/f8_conv_add_add_fusion.json
 2 | --reset --case=pattern/f8/f8_conv_fwd.json
 3 | --reset --case=pattern/f8/f8_conv_post_ops_fusion.json
 4 | --reset --case=pattern/f8/f8_conv_post_ops_int8_add_fusion.json
 5 | --reset --case=pattern/f8/f8_conv_bias_relu_fusion.json
 6 | --reset --case=pattern/f8/f8_matmul.json
 7 | --reset --case=pattern/f8/f8_bf16_matmul_add_fusion.json
 8 | --reset --case=pattern/f8/f8_bf16_matmul_sum_add_mul_relu.json
 9 | --reset --case=pattern/f8/f8_matmul_sum_add_mul_relu.json
10 | --reset --case=pattern/f8/f8_f32_matmul_mul_add_fusion.json
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/pattern/harness_f8_ci:
--------------------------------------------------------------------------------
1 | --reset --case=pattern/f8/f8_conv_fwd.json
2 | --reset --case=pattern/f8/f8_matmul.json
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/pattern/harness_int8_ci:
--------------------------------------------------------------------------------
1 | --reset --mb=2 --case=pattern/int8/int8_conv_post_ops_fusion.json
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_all:
--------------------------------------------------------------------------------
1 | --batch=test_graph_f32
2 | --batch=test_graph_bf16
3 | --batch=test_graph_f16
4 | --batch=test_graph_int8
5 | --batch=test_graph_f8
6 | --batch=test_graph_fusions
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_bf16:
--------------------------------------------------------------------------------
1 | --batch=op/harness_bf16_all
2 | --batch=pattern/harness_bf16_all
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_ci:
--------------------------------------------------------------------------------
 1 | --batch=op/harness_f32_ci
 2 | --batch=op/harness_f16_ci
 3 | --batch=op/harness_bf16_ci
 4 | --batch=pattern/harness_f32_ci
 5 | --batch=pattern/harness_f16_ci
 6 | --batch=pattern/harness_bf16_ci
 7 | --batch=pattern/harness_int8_ci
 8 | --batch=pattern/harness_f8_ci
 9 | --batch=complex_fusion/harness_mha_ci
10 | --batch=complex_fusion/harness_mlp_ci
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_f16:
--------------------------------------------------------------------------------
1 | --batch=op/harness_f16_all
2 | --batch=pattern/harness_f16_all
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_f32:
--------------------------------------------------------------------------------
1 | --batch=op/harness_f32_all
2 | --batch=pattern/harness_f32_all
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_f8:
--------------------------------------------------------------------------------
1 | --batch=pattern/harness_f8_all


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_fusions:
--------------------------------------------------------------------------------
1 | --batch=complex_fusion/harness_mha_all
2 | --batch=complex_fusion/harness_mlp_all
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_fusions_gpu:
--------------------------------------------------------------------------------
1 | --batch=complex_fusion/harness_mha_all
2 | --batch=complex_fusion/harness_mha_ci
3 | --batch=complex_fusion/harness_mlp_all
4 | --batch=complex_fusion/harness_mlp_ci
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_int8:
--------------------------------------------------------------------------------
1 | --batch=pattern/harness_int8_all
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_op_gpu:
--------------------------------------------------------------------------------
1 | --batch=op/harness_f32_ci
2 | --batch=op/harness_f16_ci
3 | --batch=op/harness_bf16_ci
4 | 
5 | --batch=op/harness_f32_all
6 | --batch=op/harness_f16_all
7 | --batch=op/harness_bf16_all
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/graph/test_graph_pattern_gpu:
--------------------------------------------------------------------------------
 1 | --batch=pattern/harness_f32_ci
 2 | --batch=pattern/harness_f16_ci
 3 | --batch=pattern/harness_bf16_ci
 4 | --batch=pattern/harness_int8_ci
 5 | --batch=pattern/harness_f8_ci
 6 | 
 7 | --batch=pattern/harness_f32_all
 8 | --batch=pattern/harness_f16_all
 9 | --batch=pattern/harness_bf16_all
10 | --batch=pattern/harness_int8_all
11 | --batch=pattern/harness_f8_all
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_lb_bfloat16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 2016
 3 | # Output token size = 32
 4 | # Batch Size = 12
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 48
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=bf16
11 | 
12 | mb96768ic4096oc4096_n"gptj:gemm0*112"
13 | mb96768ic4096oc16384_n"gptj:gemm3*28"
14 | mb96768ic16384oc4096_n"gptj:gemm4*28"
15 | mb96768ic4096oc50400_n"gptj:gemm5*1"
16 | mb48ic4096oc4096_n"gptj:gemm6*3472"
17 | mb48ic4096oc16384_n"gptj:gemm9*868"
18 | mb48ic16384oc4096_n"gptj:gemm10*868"
19 | mb48ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_lb_f32:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 2016
 3 | # Output token size = 32
 4 | # Batch Size = 6
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 24
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f32
11 | 
12 | mb48384ic4096oc4096_n"gptj:gemm0*112"
13 | mb48384ic4096oc16384_n"gptj:gemm3*28"
14 | mb48384ic16384oc4096_n"gptj:gemm4*28"
15 | mb48384ic4096oc50400_n"gptj:gemm5*1"
16 | mb24ic4096oc4096_n"gptj:gemm6*3472"
17 | mb24ic4096oc16384_n"gptj:gemm9*868"
18 | mb24ic16384oc4096_n"gptj:gemm10*868"
19 | mb24ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_lb_float16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 2016
 3 | # Output token size = 32
 4 | # Batch Size = 12
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 48
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f16
11 | 
12 | mb96768ic4096oc4096_n"gptj:gemm0*112"
13 | mb96768ic4096oc16384_n"gptj:gemm3*28"
14 | mb96768ic16384oc4096_n"gptj:gemm4*28"
15 | mb96768ic4096oc50400_n"gptj:gemm5*1"
16 | mb48ic4096oc4096_n"gptj:gemm6*3472"
17 | mb48ic4096oc16384_n"gptj:gemm9*868"
18 | mb48ic16384oc4096_n"gptj:gemm10*868"
19 | mb48ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_sb_bfloat16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 2016
 3 | # Output token size = 32
 4 | # Batch Size = 1
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 4
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=bf16
11 | 
12 | mb8064ic4096oc4096_n"gptj:gemm0*112"
13 | mb8064ic4096oc16384_n"gptj:gemm3*28"
14 | mb8064ic16384oc4096_n"gptj:gemm4*28"
15 | mb8064ic4096oc50400_n"gptj:gemm5*1"
16 | mb4ic4096oc4096_n"gptj:gemm6*3472"
17 | mb4ic4096oc16384_n"gptj:gemm9*868"
18 | mb4ic16384oc4096_n"gptj:gemm10*868"
19 | mb4ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_sb_f32:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 2016
 3 | # Output token size = 32
 4 | # Batch Size = 1
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 4
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f32
11 | 
12 | mb8064ic4096oc4096_n"gptj:gemm0*112"
13 | mb8064ic4096oc16384_n"gptj:gemm3*28"
14 | mb8064ic16384oc4096_n"gptj:gemm4*28"
15 | mb8064ic4096oc50400_n"gptj:gemm5*1"
16 | mb4ic4096oc4096_n"gptj:gemm6*3472"
17 | mb4ic4096oc16384_n"gptj:gemm9*868"
18 | mb4ic16384oc4096_n"gptj:gemm10*868"
19 | mb4ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_sb_float16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 2016
 3 | # Output token size = 32
 4 | # Batch Size = 1
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 4
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f16
11 | 
12 | mb8064ic4096oc4096_n"gptj:gemm0*112"
13 | mb8064ic4096oc16384_n"gptj:gemm3*28"
14 | mb8064ic16384oc4096_n"gptj:gemm4*28"
15 | mb8064ic4096oc50400_n"gptj:gemm5*1"
16 | mb4ic4096oc4096_n"gptj:gemm6*3472"
17 | mb4ic4096oc16384_n"gptj:gemm9*868"
18 | mb4ic16384oc4096_n"gptj:gemm10*868"
19 | mb4ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_lb_bfloat16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 32
 3 | # Output token size = 32
 4 | # Batch Size = 428
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 1712
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=bf16
11 | 
12 | mb54784ic4096oc4096_n"gptj:gemm0*112"
13 | mb54784ic4096oc16384_n"gptj:gemm3*28"
14 | mb54784ic16384oc4096_n"gptj:gemm4*28"
15 | mb54784ic4096oc50400_n"gptj:gemm5*1"
16 | mb1712ic4096oc4096_n"gptj:gemm6*3472"
17 | mb1712ic4096oc16384_n"gptj:gemm9*868"
18 | mb1712ic16384oc4096_n"gptj:gemm10*868"
19 | mb1712ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_lb_f32:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 32
 3 | # Output token size = 32
 4 | # Batch Size = 214
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 856
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f32
11 | 
12 | mb27392ic4096oc4096_n"gptj:gemm0*112"
13 | mb27392ic4096oc16384_n"gptj:gemm3*28"
14 | mb27392ic16384oc4096_n"gptj:gemm4*28"
15 | mb27392ic4096oc50400_n"gptj:gemm5*1"
16 | mb856ic4096oc4096_n"gptj:gemm6*3472"
17 | mb856ic4096oc16384_n"gptj:gemm9*868"
18 | mb856ic16384oc4096_n"gptj:gemm10*868"
19 | mb856ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_lb_float16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 32
 3 | # Output token size = 32
 4 | # Batch Size = 428
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 1712
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f16
11 | 
12 | mb54784ic4096oc4096_n"gptj:gemm0*112"
13 | mb54784ic4096oc16384_n"gptj:gemm3*28"
14 | mb54784ic16384oc4096_n"gptj:gemm4*28"
15 | mb54784ic4096oc50400_n"gptj:gemm5*1"
16 | mb1712ic4096oc4096_n"gptj:gemm6*3472"
17 | mb1712ic4096oc16384_n"gptj:gemm9*868"
18 | mb1712ic16384oc4096_n"gptj:gemm10*868"
19 | mb1712ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_sb_bfloat16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 32
 3 | # Output token size = 32
 4 | # Batch Size = 1
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 4
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=bf16
11 | 
12 | mb128ic4096oc4096_n"gptj:gemm0*112"
13 | mb128ic4096oc16384_n"gptj:gemm3*28"
14 | mb128ic16384oc4096_n"gptj:gemm4*28"
15 | mb128ic4096oc50400_n"gptj:gemm5*1"
16 | mb4ic4096oc4096_n"gptj:gemm6*3472"
17 | mb4ic4096oc16384_n"gptj:gemm9*868"
18 | mb4ic16384oc4096_n"gptj:gemm10*868"
19 | mb4ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_sb_f32:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 32
 3 | # Output token size = 32
 4 | # Batch Size = 1
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 4
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f32
11 | 
12 | mb128ic4096oc4096_n"gptj:gemm0*112"
13 | mb128ic4096oc16384_n"gptj:gemm3*28"
14 | mb128ic16384oc4096_n"gptj:gemm4*28"
15 | mb128ic4096oc50400_n"gptj:gemm5*1"
16 | mb4ic4096oc4096_n"gptj:gemm6*3472"
17 | mb4ic4096oc16384_n"gptj:gemm9*868"
18 | mb4ic16384oc4096_n"gptj:gemm10*868"
19 | mb4ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_sb_float16:
--------------------------------------------------------------------------------
 1 | # GPT-J
 2 | # Input token size = 32
 3 | # Output token size = 32
 4 | # Batch Size = 1
 5 | # Num Beams = 4
 6 | # M = num_beams * batch_size = 4
 7 | 
 8 | --reset
 9 | --dir=FWD_I
10 | --dt=f16
11 | 
12 | mb128ic4096oc4096_n"gptj:gemm0*112"
13 | mb128ic4096oc16384_n"gptj:gemm3*28"
14 | mb128ic16384oc4096_n"gptj:gemm4*28"
15 | mb128ic4096oc50400_n"gptj:gemm5*1"
16 | mb4ic4096oc4096_n"gptj:gemm6*3472"
17 | mb4ic4096oc16384_n"gptj:gemm9*868"
18 | mb4ic16384oc4096_n"gptj:gemm10*868"
19 | mb4ic4096oc50400_n"gptj:gemm11*31"
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_regression:
--------------------------------------------------------------------------------
 1 | # repeated sum with varying scale
 2 | --reset --attr-post-ops=sum+relu+sum:2 ic64oc64_n"multisum"
 3 | 
 4 | # large oc to trigger use_buffer_b with tail
 5 | --reset --dir=BWD_W mb32ic16oc2049_n"large_oc_for_use_buffer_b_with_tail"
 6 | 
 7 | # f16->s8 is not supported by reference; gemm-based ip handles this case,
 8 | # but it requires src and weights to have compatible tags.
 9 | # This test case makes sure ip chooses the weights tag correctly for a plain
10 | # src tag when shapes contain ones.
11 | --reset --dt=f16:f16:s8 --stag=abcd mb128ic768ih1oc768_n"f16_s8_plain_src_tag"
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_sanitizers:
--------------------------------------------------------------------------------
 1 | # shapes to help sanitizers catch bugs
 2 | 
 3 | --reset
 4 | --dt=f32,bf16,s8:s8:s8
 5 | --dir=FWD_B
 6 | 
 7 | # mb * ic overflows a 32-bit int for the shape below
 8 | mb2613184_ic1536oc16_n"huge_mb_ic"
 9 | 
10 | --dt=f32,bf16
11 | --dir=BWD_D,BWD_WB
12 | mb2613184_ic1536oc16_n"huge_mb_ic"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_saturation:
--------------------------------------------------------------------------------
1 | --reset
2 | --dir=FWD_D
3 | --dt=u8:s8:s32,u8:s8:s8,u8:s8:u8
4 | --attr-scales=wei:common:4294967295
5 | ic16oc16ih1
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/harness_ip_tag_gpu:
--------------------------------------------------------------------------------
 1 | --reset
 2 | --attr-scratchpad=user
 3 | --dt=f32
 4 | --dir=FWD_B,FWD_I
 5 | --dtag=abx
 6 | --mb=16,17
 7 | 
 8 | --wtag=ABx16a16b
 9 | ic2048ih2id2iw1oc1000
10 | 
11 | --stag=aBx16b
12 | --wtag=any
13 | ic2048id2ih2iw2oc1000
14 | 
15 | --stag=ABx16a16b,ABcd8a8b,aBx16b,aBx8b
16 | --attr-post-ops=sum:0.5+relu:0.5
17 | ic2048ih2iw2oc1000
18 | 
19 | --stag=aBx8b,aBx4b
20 | ic2048iw2oc1000
21 | 
22 | --stag=aBx16b
23 | --attr-post-ops=linear:0.5:1.5:2
24 | ic2048id2ih2iw2oc1000
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/option_set_fwks_llm_gpu:
--------------------------------------------------------------------------------
1 | --reset --allow-enum-tags-only=0 --dir=FWD_I --dt=f16:f16:f32 --bia-dt=undef --stag=abcd --wtag=any --dtag=ab mb1024ic896ih1iw1oc151936_n"a12f80214bc2625a81ecaa173e76b977*2"
2 | --reset --allow-enum-tags-only=0 --dir=FWD_I --dt=f16:f16:f32 --bia-dt=undef --stag=abcd --wtag=any --dtag=ab mb32ic896ih1iw1oc151936_n"a72ef24774e776b6f286de263d897013*2"
3 | --reset --allow-enum-tags-only=0 --dir=FWD_I --dt=f16:f16:f32 --bia-dt=undef --stag=abcd --wtag=any --dtag=ab mb8ic896ih1iw1oc151936_n"fed31de8b7be65676854064135d375c7*508"
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/perf_ip_cpu:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Forward
 4 | --dir=FWD_B
 5 | --dt=f32,u8:s8:s32,s8:s8:s8
 6 | --mb=1,64,256,1024,2048
 7 | --batch=set_topologies
 8 | 
 9 | # Backward
10 | --dt=f32
11 | --dir=BWD_D,BWD_WB
12 | --mb=64,256,1024,2048
13 | --batch=set_topologies
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/perf_ip_inference_sb:
--------------------------------------------------------------------------------
 1 | # inference_sb -- inference with small batch size
 2 | # f32
 3 | --reset
 4 | --dir=FWD_B
 5 | --mb=1
 6 | --batch=shapes_gnmt
 7 | --batch=shapes_wd
 8 | --batch=shapes_resnet_50
 9 | --batch=shapes_resnet_50_sparse
10 | --batch=shapes_googlenet_v1
11 | --batch=shapes_googlenet_v3
12 | --batch=shapes_vgg16
13 | --batch=shapes_ncf
14 | --batch=shapes_alexnet
15 | --batch=shapes_bert
16 | --batch=shapes_bert_large
17 | --mb=0 --batch=shapes_rnn_t
18 | --mb=128 --batch=shapes_dlrm
19 | --mb=0 --batch=shapes_transformer_lt
20 | --mb=0 --batch=shapes_dien_sb
21 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/perf_ip_knx:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Forward
 4 | --dir=FWD_B
 5 | --dt=f32
 6 | --mb=1,64,256,1024,2048
 7 | --batch=set_topologies
 8 | 
 9 | # Backward
10 | --dt=f32
11 | --dir=BWD_D,BWD_WB
12 | --mb=64,256,1024,2048
13 | --batch=set_topologies
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/perf_ip_xe:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Forward
 4 | 
 5 | # f32
 6 | --dir=FWD_B
 7 | --dt=f32
 8 | 
 9 | --mb=1,32,64,128
10 | --batch=set_gpu
11 | 
12 | --mb=0
13 | --batch=shapes_1d
14 | 
15 | # f16
16 | --dir=FWD_B
17 | --dt=f16
18 | 
19 | --mb=1,32,64,128
20 | --batch=set_gpu
21 | 
22 | --mb=0
23 | --batch=shapes_1d
24 | 
25 | # Backward
26 | 
27 | # f32
28 | --dir=BWD_D,BWD_WB
29 | --dt=f32
30 | 
31 | --mb=32,64,128
32 | --batch=set_gpu
33 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/perf_ip_xe_hp:
--------------------------------------------------------------------------------
 1 | --batch=perf_ip_xe_lp
 2 | 
 3 | --reset
 4 | 
 5 | # Forward, bf16
 6 | 
 7 | --dir=FWD_B
 8 | --dt=bf16
 9 | 
10 | --mb=1,32,64,128
11 | --batch=set_gpu
12 | 
13 | --mb=0
14 | --batch=shapes_1d
15 | 
16 | # Backward, bf16
17 | 
18 | --dir=BWD_D,BWD_WB
19 | --dt=bf16
20 | 
21 | --mb=32,64,128
22 | --batch=set_gpu
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/perf_ip_xe_lp:
--------------------------------------------------------------------------------
 1 | --batch=perf_ip_xe
 2 | 
 3 | --reset
 4 | 
 5 | # Forward, int8
 6 | --dir=FWD_B
 7 | --dt=u8:s8:s32
 8 | 
 9 | --mb=1,32,64,128
10 | --batch=set_gpu
11 | 
12 | --mb=0
13 | --batch=shapes_1d
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/set_all:
--------------------------------------------------------------------------------
 1 | --batch=shapes_resnet_50
 2 | --batch=shapes_googlenet_v1
 3 | --batch=shapes_googlenet_v3
 4 | --batch=shapes_resnet_50_sparse
 5 | --batch=shapes_vgg16
 6 | --batch=shapes_3d
 7 | --batch=shapes_wd
 8 | --batch=shapes_maskrcnn
 9 | --batch=shapes_rnn_t
10 | --batch=shapes_alexnet
11 | --batch=shapes_bert
12 | --batch=shapes_bert_large
13 | --batch=shapes_dlrm
14 | --batch=shapes_gnmt
15 | --batch=shapes_ncf
16 | --batch=shapes_transformer_lt
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/set_gpu:
--------------------------------------------------------------------------------
1 | --batch=shapes_resnet_50
2 | --batch=shapes_googlenet_v1
3 | --batch=shapes_googlenet_v3
4 | --batch=shapes_resnet_50_sparse
5 | --batch=shapes_vgg16
6 | --batch=shapes_3d
7 | --batch=shapes_wd
8 | --batch=shapes_maskrcnn
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/set_topologies:
--------------------------------------------------------------------------------
 1 | --batch=shapes_alexnet
 2 | --batch=shapes_bert
 3 | --batch=shapes_bert_large
 4 | --batch=shapes_dlrm
 5 | --batch=shapes_gnmt
 6 | --batch=shapes_googlenet_v1
 7 | --batch=shapes_googlenet_v3
 8 | --batch=shapes_maskrcnn
 9 | --batch=shapes_ncf
10 | --batch=shapes_resnet_50
11 | --batch=shapes_resnet_50_sparse
12 | --batch=shapes_transformer_lt
13 | --batch=shapes_vgg16
14 | --batch=shapes_wd
15 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_0d_gpu:
--------------------------------------------------------------------------------
1 | mb64ic1oc2
2 | mb18ic7oc1
3 | mb16ic1oc5
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_1d:
--------------------------------------------------------------------------------
1 | # 1D
2 | mb128ic128iw4oc1024n"googlenet_v1_1d:ip1"
3 | mb64ic512iw7oc4096n"VGG16_1d:ip1"
4 | mb32ic64iw3oc1000n"1d:ip1"
5 | mb32ic512iw5oc1000n"1d:ip2"
6 | mb256ic128iw5oc128n"1d:ip3"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_3d:
--------------------------------------------------------------------------------
1 | ic64id2ih3iw3oc1000n"3d:ip1"
2 | ic512id5ih5iw5oc1000n"3d:ip2"
3 | ic128id5ih5iw5oc128n"3d:ip3"
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_alexnet:
--------------------------------------------------------------------------------
1 | ic9216oc4096n"Alexnet:ip1"
2 | ic4096oc4096n"Alexnet:ip2"
3 | ic4096oc1000n"Alexnet:ip3"
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_bert:
--------------------------------------------------------------------------------
1 | ic768oc2304n"BERT:1"
2 | ic768oc768n"BERT:2"
3 | ic768oc3072n"BERT:3"
4 | ic3072oc768n"BERT:4"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_ci:
--------------------------------------------------------------------------------
1 | --batch=shapes_dlrm
2 | --batch=shapes_ncf
3 | --batch=shapes_resnet_50
4 | --batch=shapes_rnn_t
5 | --batch=shapes_bert
6 | --batch=shapes_bert_large
7 | --batch=shapes_gnmt
8 | --batch=shapes_wd
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_dlrm:
--------------------------------------------------------------------------------
1 | ic13oc512n"DLRM:0"
2 | ic512oc256n"DLRM:1*2"
3 | ic256oc128n"DLRM:2"
4 | ic479oc1024n"DLRM:3"
5 | ic1024oc1024n"DLRM:4"
6 | ic1024oc512n"DLRM:5"
7 | # ic512oc256n"DLRM:6"
8 | ic256oc1n"DLRM:7"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_gnmt:
--------------------------------------------------------------------------------
1 | ic512oc512n"GNMT:0"
2 | ic1024oc1024n"GNMT:1"
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_googlenet_v1:
--------------------------------------------------------------------------------
1 | ic128ih4iw4oc1024n"googlenet_v1:ip1"
2 | ic1024ih1iw1oc1000n"googlenet_v1:ip2"
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_googlenet_v3:
--------------------------------------------------------------------------------
1 | ic2048ih1oc1000n"inceptionv3:ip1"
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_maskrcnn:
--------------------------------------------------------------------------------
1 | mb1000ic256ih7iw7oc1024n"masknet:ip1"
2 | mb1000ic1024oc1024n"masknet:ip2"
3 | mb1000ic1024oc324n"masknet:ip3"
4 | mb1000ic1024oc81n"masknet:ip4"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_ncf:
--------------------------------------------------------------------------------
1 | ic256oc256n"NCF:0"
2 | ic256oc128n"NCF:1"
3 | ic128oc64n"NCF:2"
4 | ic128oc1n"NCF:3"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_regression:
--------------------------------------------------------------------------------
1 | mb1_ic16oc26_n"small_oc_block"
2 | mb128ic2200oc2200
3 | mb128ic1500oc1500
4 | mb1120ic1024oc2046
5 | mb1000ic1000oc1111
6 | mb1ic16001oc101
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_resnet_50:
--------------------------------------------------------------------------------
1 | ic2048oc1000n"resnet:ip1"
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_resnet_50_sparse:
--------------------------------------------------------------------------------
1 | ic2048oc1000n"resnet_sparse:ip1"
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_rnn_t:
--------------------------------------------------------------------------------
1 | # mb = 16 * num_cores for throughput inference / training
2 | # and mb = 16 for real time inference
3 | mb16ic240oc4096n"RNN-T:Encoder_cell1_Input*2"
4 | mb16ic1024oc4096n"RNN-T:Encoder_cell1_Hidden*11"
5 | mb16ic2048oc4096n"RNN-T:Encoder_cell3_Input*1"
6 | mb16ic320oc1280n"RNN-T:Prediction_Input*12"
7 | mb16ic1344oc512n"RNN-T:JointNet_Linear1*3"
8 | mb16ic512oc29n"RNN-T:JointNet_Linear2*3"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_vgg16:
--------------------------------------------------------------------------------
1 | ic512ih7iw7oc4096n"VGG16:ip1"
2 | ic4096oc4096n"VGG16:ip2"
3 | ic4096oc81n"VGG16:ip3"
4 | ic4096oc324n"VGG16:ip4"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/shapes_wd:
--------------------------------------------------------------------------------
1 | # Used in smoke validation, don't change the name
2 | ic845oc1024n"WnD:0"
3 | ic1024oc512n"WnD:1"
4 | ic512oc256n"WnD:2"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/test_ip_all:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset
 3 | --dir=FWD_B,BWD_D,BWD_WB
 4 | --batch=set_all --batch=shapes_0d
 5 | 
 6 | --dir=FWD_B
 7 | --attr-post-ops=sum:0.5+relu:0.5+add:f32:per_oc,prelu:per_oc,sum:2:0:s32
 8 | --mb=2 --batch=set_all
 9 | --mb=0 --batch=shapes_0d
10 | 
11 | --batch=harness_ip_regression
12 | --batch=harness_ip_tag
13 | 
14 | # int8
15 | --batch=test_ip_int8
16 | 
17 | # fp8
18 | --batch=test_ip_fp8
19 | 
20 | # bf16
21 | --batch=test_ip_bfloat16
22 | 
23 | # bf32
24 | --batch=test_ip_bf32_bfloat16
25 | 
26 | # f16
27 | --batch=test_ip_float16
28 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/ip/test_ip_bfloat16_ymm:
--------------------------------------------------------------------------------
1 | # global benchdnn knob, will not be reset again
2 | --cpu-isa-hints=prefer_ymm
3 | --batch=test_ip_bfloat16
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lnorm/shapes_ci:
--------------------------------------------------------------------------------
1 | 15x3_n"lnorm_ci_0d:0" # Used in smoke validation, don't change the name
2 | 30x300_n"lnorm_ci_0d:1" # Used in smoke validation, don't change the name
3 | 256x768_n"lnorm_ci_0d:2" # Used in smoke validation, don't change the name
4 | 257x768_n"lnorm_ci_0d:3" # Used in smoke validation, don't change the name
5 | 128x1x1024_n"lnorm_ci_1d:0"
6 | 6x2x128x1024_n"lnorm_ci_2d:0"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lnorm/test_lnorm_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # f32
 4 | --inplace=true,false
 5 | --dt=f32
 6 | 
 7 | --dir=FWD_D,FWD_I
 8 | --flags=,CH,G,GCH,M,GCHM
 9 | --batch=option_set_all
10 | 
11 | --dir=BWD_D
12 | --flags=,G,GM
13 | --batch=option_set_all
14 | 
15 | --dir=BWD_DW
16 | --flags=CH,GCH,M,GCHM
17 | --batch=option_set_all
18 | 
19 | # bf16
20 | --batch=test_lnorm_bfloat16
21 | 
22 | # int8
23 | --batch=test_lnorm_int8
24 | 
25 | # f16
26 | --batch=test_lnorm_float16
27 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lnorm/test_lnorm_bfloat16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | --dt=bf16,bf16:f32,f32:bf16
 5 | 
 6 | --dir=FWD_D,FWD_I
 7 | --flags=,CH,G,GCH,M,GCHM
 8 | --batch=option_set_all
 9 | 
10 | --dir=BWD_D
11 | --flags=,G,GM
12 | --batch=option_set_all
13 | 
14 | --dir=BWD_DW
15 | --flags=CH,GCH,M,GCHM
16 | --batch=option_set_all
17 | 
18 | # Different data type combinations
19 | 
20 | --dt=f32:bf16,bf16
21 | --dir=FWD_D,BWD_DW
22 | --ss_dt=bf16
23 | --flags=C,H,CH
24 | --batch=option_set_all
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lnorm/test_lnorm_float16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | --dt=f16,f16:f32,f32:f16
 5 | 
 6 | --dir=FWD_D,FWD_I
 7 | --flags=,CH,G,GCH,M,GCHM
 8 | --batch=option_set_all
 9 | 
10 | --dir=BWD_D
11 | --flags=,G,GM
12 | --batch=option_set_all
13 | 
14 | --dir=BWD_DW
15 | --flags=CH,GCH,M,GCHM
16 | --batch=option_set_all
17 | 
18 | # Different data type combinations
19 | 
20 | --dt=f32:f16,f16
21 | --dir=FWD_D,BWD_DW
22 | --ss_dt=f16
23 | --flags=C,H,CH
24 | --batch=option_set_all
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lnorm/test_lnorm_int8:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=false
 4 | --dt=f32:s8,f32:u8,bf16:s8,bf16:u8, \
 5 |      s8:f32,u8:f32,s8:bf16,u8:bf16, \
 6 |      s8:s8,u8:s8,s8:u8,u8:u8
 7 | 
 8 | --dir=FWD_I
 9 | --attr-scales=,src:common:128,dst:common:0.125,src:common:64+dst:common:0.5
10 | --flags=,CH,G,GCH,M,GCHM
11 | --batch=option_set_all
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lnorm/test_lnorm_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*lnorm_ci_0d.* # Use 0d problems only from shapes_ci
 4 | --inplace=false
 5 | --tag=axb
 6 | --stat_tag=any
 7 | 
 8 | --dir=FWD_D,BWD_DW
 9 | --dt=f32,bf16,f16
10 | --flags=,G,CH,M,GCHM
11 | --batch=shapes_ci
12 | 
13 | --dir=BWD_D
14 | --flags=,G,GM
15 | --batch=shapes_ci
16 | 
17 | # Different data type combinations
18 | --dt=f32:s8,u8:f32
19 | --dir=FWD_I
20 | --attr-scales=,src:common:64+dst:common:0.5
21 | --flags=,CH
22 | --batch=shapes_ci
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/set_all:
--------------------------------------------------------------------------------
1 | --batch=shapes_2d
2 | --batch=shapes_3d
3 | --batch=shapes_topologies
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/shapes_0d:
--------------------------------------------------------------------------------
1 | # random problems
2 | mb3ic128_n"lrn_ci_0d:0"
3 | mb10ic63_n"lrn_ci_0d:1"
4 | mb17ic15_n"lrn_ci_0d:2"
5 | ic1_n"lrn_ci_0d:3"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/shapes_2d:
--------------------------------------------------------------------------------
 1 | # random problems
 2 | ic15_ih16_n"lrn_ci_2d:channel_tail"
 3 | ic16_ih5_ls7_n"lrn_ci_2d:non_default_local_size"
 4 | ic16_ih5_beta1.0_n"lrn_ci_2d:non_default_beta"
 5 | ic16_ih5_k14.2_n"lrn_ci_2d:non_default_k"
 6 | ic16_ih10iw7_n"lrn_ci_2d:non_square_shape"
 7 | ic12_ih15iw1_n"lrn_ci_2d:unit_width"
 8 | ic31_ih1iw16_n"lrn_ci_2d:unit_heigth"
 9 | ic64_ih1iw1_n"lrn_ci_2d:unit_heigth_and_width"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/shapes_3d:
--------------------------------------------------------------------------------
 1 | ic15_id2n"channels_only"
 2 | ic15_id2ih16iw16n"channel_tail"
 3 | ic16_id3ih5iw5_ls7n"non_default_local_size"
 4 | ic16_id2ih5iw5_beta1.0n"non_default_beta"
 5 | ic16_id4ih5iw5_k14.2n"non_default_k"
 6 | ic16_id2ih10iw7n"non_square_shape"
 7 | ic16_id3ih10iw7n"non_square_shape"
 8 | ic12_id2ih15iw1n"unit_width"
 9 | ic31_id1ih1iw16_ls4n"unit_heigth"
10 | ic31_id2ih1iw16_ls4n"unit_heigth"
11 | ic31_id3ih1iw16n"unit_heigth"
12 | ic64_id1ih1iw1n"unit_heigth_and_width"
13 | ic64_id2ih1iw1n"unit_heigth_and_width"
14 | ic64_id3ih1iw1n"unit_heigth_and_width"
15 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/shapes_ci:
--------------------------------------------------------------------------------
1 | --batch=shapes_0d
2 | --batch=shapes_2d
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/shapes_topologies:
--------------------------------------------------------------------------------
1 | # model problems
2 | mb256ic96_ih55n"alexnet:norm1"
3 | mb256ic256_ih27n"alexnet:norm2"
4 | mb96ic64_ih57n"googlenet:pool1/norm1"
5 | mb96ic192_ih57n"googlenet:conv2/norm2"
6 | mb50ic96ih112_alpha0.0005_k2n"fastrcnn:norm1"
7 | mb50ic96ih112_ls3_alpha0.00005n"fastrcnn_zf:norm1"
8 | 
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/test_lrn_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --mb=2
 4 | --dt=f32
 5 | --alg=ACROSS,WITHIN
 6 | --dir=FWD_D,FWD_I,BWD_D
 7 | 
 8 | --tag=abx
 9 | --batch=shapes_0d
10 | 
11 | --tag=abx,axb,aBx8b,aBx16b
12 | --batch=set_all
13 | 
14 | # bf16
15 | --batch=test_lrn_bfloat16
16 | 
17 | # f16
18 | --batch=test_lrn_float16


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/test_lrn_bfloat16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --mb=2
 4 | --dt=bf16
 5 | --alg=ACROSS,WITHIN
 6 | --dir=FWD_D,FWD_I,BWD_D
 7 | 
 8 | --tag=abx
 9 | --batch=shapes_0d
10 | 
11 | --tag=abx,axb,aBx8b,aBx16b
12 | --batch=set_all
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/test_lrn_ci:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --mb=2
4 | --dt=f32,bf16,f16
5 | --alg=ACROSS,WITHIN
6 | --dir=FWD_D,FWD_I,BWD_D
7 | --tag=abx,axb
8 | --batch=shapes_ci
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/test_lrn_float16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --mb=2
 4 | --dt=f16
 5 | --alg=ACROSS,WITHIN
 6 | --dir=FWD_D,FWD_I,BWD_D
 7 | 
 8 | --tag=abx
 9 | --batch=shapes_0d
10 | 
11 | --tag=abx,axb
12 | --batch=set_all
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/test_lrn_gpu:
--------------------------------------------------------------------------------
 1 | # f32, bf16
 2 | --reset
 3 | 
 4 | --dt=f32,bf16
 5 | --alg=ACROSS,WITHIN
 6 | --dir=FWD_D,FWD_I,BWD_D
 7 | --tag=abx
 8 | --batch=set_all
 9 | 
10 | # f16
11 | --reset
12 | 
13 | --dt=f16
14 | --alg=ACROSS,WITHIN
15 | --dir=FWD_I
16 | --tag=abx
17 | --batch=set_all
18 | 
19 | # Test CI in Nightly
20 | --reset
21 | --batch=test_lrn_ci
22 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/lrn/test_lrn_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*lrn_ci_2d.* # Use 2d problems only from shapes_ci
 4 | --mb=2
 5 | --dt=f32,bf16,f16
 6 | --alg=ACROSS,WITHIN
 7 | --dir=FWD_D,FWD_I,BWD_D
 8 | --tag=axb
 9 | --batch=shapes_ci
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/harness_matmul_dropout:
--------------------------------------------------------------------------------
 1 | --reset
 2 | --dt=f32,bf16
 3 | --attr-fpmath=,bf16
 4 | --check-ref-impl=false
 5 | --attr-dropout=0.5:12345678
 6 | 
 7 | --stag=ab --dtag=ab
 8 | --batch=shapes_2d
 9 | 
10 | --stag=abc --dtag=abc
11 | --batch=shapes_3d
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/harness_matmul_regression_float16:
--------------------------------------------------------------------------------
1 | 
2 | # test shapes with b_buffer
3 | --reset
4 | --dt=f16:f16:f16 --stag=ab --wtag=ba --dtag=ab --bia-dt=f16 327x256:256x256
5 | 
6 | # Test that cases when M == 1 are handled correctly.
7 | --reset
8 | --stag=ba,ab --wtag=ab --dtag=ab --dt=f16 1x2:2x256
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/harness_matmul_runtime_f32:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset
 3 | --skip-impl=ref
 4 | 
 5 | --dt=f32
 6 | --stag=ab,ba --wtag=ab,ba --dtag=ab
 7 | --bia-dt=undef,f32 --bia_mask=2
 8 | 
 9 | --runtime_dims_masks=0
10 | --attr-scales=src:common:0.25+wei:common:0.5+dst:common:4
11 | --attr-post-ops=,sum,relu
12 | --batch=shapes_2d
13 | 
14 | --runtime_dims_masks=3:3
15 | --attr-scales=src:common:0.25+wei:common:0.5+dst:common:4
16 | --attr-post-ops=,sum+add:s8,mul:f32:per_oc,mul:f32:per_tensor
17 | --batch=shapes_2d
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/perf_matmul_inference_batched:
--------------------------------------------------------------------------------
1 | --batch=shapes_bert
2 | --batch=shapes_bert_large
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_2d:
--------------------------------------------------------------------------------
 1 | # random shapes for correctness testing
 2 | 1x1:1x1
 3 | 10x30:30x1
 4 | 1x30:30x20
 5 | 10x1:1x20
 6 | 1x300:300x1
 7 | 1x1:1x200
 8 | 100x1:1x1
 9 | 10x30:30x20
10 | 2x30:30x47
11 | 10x30:30x16
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_2d_ci:
--------------------------------------------------------------------------------
1 | # 2d shapes for correctness testing in CI
2 | --batch=shapes_converted_ip_inf_lb_dlrm
3 | --batch=shapes_converted_ip_inf_lb_ncf
4 | --batch=shapes_converted_ip_inf_lb_rnn_t
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_3d:
--------------------------------------------------------------------------------
 1 | # random batched shapes for correctness testing
 2 | 2x10x30:2x30x1
 3 | 3x30x1:3x1x20
 4 | 1x30x30:1x30x20
 5 | 3x10x30:3x30x16
 6 | 
 7 | # batch broadcast shapes
 8 | 7x32x16:1x16x8
 9 | 1x128x8:2x8x16
10 | 2x16x73:1x73x8
11 | 1x26x17:5x17x65
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_4bit:
--------------------------------------------------------------------------------
 1 | 24x32:32x64
 2 | 25x32:32x16
 3 | 96x96:96x64
 4 | 14x96:96x32
 5 | 1x30:30x20
 6 | 10x30:30x20
 7 | 2048x1024:1024x512_n"DLRM:5*1"
 8 | 2048x256:256x128_n"NCF:1*1"
 9 | 2048x128:128x64_n"NCF:2*1"
10 | 896x240:240x4096_n"RNN-T:Encoder_cell1_Input*2"
11 | 896x1024:1024x4096_n"RNN-T:Encoder_cell1_Hidden*11"
12 | 896x320:320x1280_n"RNN-T:Prediction_Input*12"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_4d:
--------------------------------------------------------------------------------
 1 | 18x16x54x64:18x16x64x54
 2 | 11x16x45x45:11x16x45x64
 3 | 21x16x41x41:21x16x41x64
 4 | 16x16x49x49:16x16x49x64
 5 | 14x16x54x54:14x16x54x64
 6 | 5x16x38x38:5x16x38x64
 7 | 24x16x32x32:24x16x32x64
 8 | 13x16x45x64:13x16x64x45
 9 | 17x16x41x64:17x16x64x41
10 | 21x16x49x64:21x16x64x49
11 | 
12 | # Broadcast shapes
13 | 2x16x384x384:2x1x384x64
14 | 1x1x35x64:13x16x64x35
15 | 1x16x38x64:5x1x64x38
16 | 14x16x54x64:1x1x64x54n"B_full_bcast"
17 | 14x6x1x253:1x1x253x1n"dot_prod_w_B_full_bcast"
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_bert:
--------------------------------------------------------------------------------
 1 | # multihead self-attention layer
 2 | # mb = 1, num_heads = 12, hidden_size = 768, t_x = t_y = 128
 3 | 12x128x64:12x64x128_n"encoder:QK_matmul:12"
 4 | 12x128x128:12x128x64_n"encoder:WV_matmul:12"
 5 | 
 6 | # mb = 128, num_heads = 12, hidden_size = 768, t_x = t_y = 128
 7 | 1536x128x64:1536x64x128_n"encoder:QK_matmul:12"
 8 | 1536x128x128:1536x128x64_n"encoder:WV_matmul:12"
 9 | 
10 | # mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 128
11 | #2048x128x64:2048x64x128_n"encoder:QK_matmul:24"
12 | #2048x128x128:2048x128x64_n"encoder:WV_matmul:24"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_bert_large:
--------------------------------------------------------------------------------
 1 | # multihead self-attention layer
 2 | # mb = 1, num_heads = 12, hidden_size = 768, t_x = t_y = 384
 3 | 12x384x64:12x64x384_n"encoder:QK_matmul:12"
 4 | 12x384x384:12x384x64_n"encoder:WV_matmul:12"
 5 | 
 6 | # mb = 128, num_heads = 12, hidden_size = 768, t_x = t_y = 384
 7 | 1536x384x64:1536x64x384_n"encoder:QK_matmul:12"
 8 | 1536x384x384:1536x384x64_n"encoder:WV_matmul:12"
 9 | 
10 | # mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 384
11 | #2048x384x64:2048x64x384_n"encoder:QK_matmul:24"
12 | #2048x384x384:2048x384x64_n"encoder:WV_matmul:24"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_alexnet:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product forward
2 | # ip mb=1024
3 | 1024x9216:9216x4096n"Alexnet:ip1*1"
4 | 1024x4096:4096x4096n"Alexnet:ip2*1"
5 | 1024x4096:4096x1000n"Alexnet:ip3*1"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_dlrm:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product forward
 2 | # ip mb=2048
 3 | 2048x13:13x512n"DLRM:0*1"
 4 | 2048x512:512x256n"DLRM:1*2"
 5 | 2048x256:256x128n"DLRM:2*1"
 6 | 2048x479:479x1024n"DLRM:3*1"
 7 | 2048x1024:1024x1024n"DLRM:4*1"
 8 | 2048x1024:1024x512n"DLRM:5*1"
 9 | 2048x256:256x1n"DLRM:7*1"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_gmnt:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product forward
2 | # ip mb=64
3 | 64x512:512x512n"GNMT:0*1"
4 | 64x1024:1024x1024n"GNMT:1*1"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_googlenet:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product forward
2 | # ip mb=128
3 | 128x2048:2048x1024n"googlenet_v1:ip1*1"
4 | 128x1024:1024x1000n"googlenet_v1:ip2*1"
5 | 
6 | # ip mb=224
7 | 224x2048:2048x1000n"inceptionv3:ip1*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_maskrcnn:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product forward
2 | # ip mb=1000
3 | 1000x12544:12544x1024n"masknet:ip1*1"
4 | 1000x1024:1024x1024n"masknet:ip2*1"
5 | 1000x1024:1024x324n"masknet:ip3*1"
6 | 1000x1024:1024x81n"masknet:ip4*1"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_ncf:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product forward
2 | # ip mb=2048
3 | # Used in smoke validation, don't change the name
4 | 2048x256:256x256n"NCF:0*1"
5 | 2048x256:256x128n"NCF:1*1"
6 | 2048x128:128x64n"NCF:2*1"
7 | 2048x128:128x1n"NCF:3*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_resnet:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product forward
2 | # ip mb=112
3 | 112x2048:2048x1000n"resnet:ip1*1"
4 | 
5 | # ip mb=64
6 | 64x2048:2048x1000n"resnet_sparse:ip1*1"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_rnn_t:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product forward
 2 | # ip mb=896 (16 * num_cores)
 3 | 
 4 | 896x240:240x4096n"RNN-T:Encoder_cell1_Input*2"
 5 | 896x1024:1024x4096n"RNN-T:Encoder_cell1_Hidden*11"
 6 | 896x2048:2048x4096n"RNN-T:Encoder_cell3_Input*1"
 7 | 896x320:320x1280n"RNN-T:Prediction_Input*12"
 8 | 896x1344:1344x512n"RNN-T:JointNet_Linear1*3"
 9 | 896x512:512x29n"RNN-T:JointNet_Linear2*3"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_vgg16:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product forward
2 | # ip mb=64
3 | 64x25088:25088x4096n"VGG16:ip1*1"
4 | 64x4096:4096x4096n"VGG16:ip2*1"
5 | 64x4096:4096x81n"VGG16:ip3*1"
6 | 64x4096:4096x324n"VGG16:ip4*1"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_wd:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product forward
 2 | # ip mb=512
 3 | 512x845:845x1024n"WnD-512:0*1"
 4 | 512x1024:1024x512n"WnD-512:1*1"
 5 | 512x512:512x256n"WnD-512:2*1"
 6 | 
 7 | # ip mb=1024
 8 | #1024x845:845x1024n"WnD-1024:0*1"
 9 | #1024x1024:1024x512n"WnD-1024:1*1"
10 | #1024x512:512x256n"WnD-1024:2*1"
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_alexnet_bwd_d:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt data mb=1024
3 | 
4 | 1024x4096:4096x9216n"Alexnet_train:BWD_D,ip1*1"
5 | 1024x4096:4096x4096n"Alexnet_train:BWD_D,ip2*1"
6 | 1024x1000:1000x4096n"Alexnet_train:BWD_D,ip3*1"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_alexnet_bwd_w:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt weights mb=1024
3 | 
4 | 9216x1024:1024x4096n"Alexnet_train:BWD_W,ip1*1"
5 | 4096x1024:1024x4096n"Alexnet_train:BWD_W,ip2*1"
6 | 4096x1024:1024x1000n"Alexnet_train:BWD_W,ip3*1"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_alexnet_fwd:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training forward mb=1024
3 | 
4 | 1024x9216:9216x4096n"Alexnet_train:FWD,ip1*1"
5 | 1024x4096:4096x4096n"Alexnet_train:FWD,ip2*1"
6 | 1024x4096:4096x1000n"Alexnet_train:FWD,ip3*1"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_dlrm_bwd_d:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt data mb=2048
 3 | 
 4 | 2048x512:512x13n"DLRM_train:BWD_D,0*1"
 5 | 2048x256:256x512n"DLRM_train:BWD_D,1*2"
 6 | 2048x128:128x256n"DLRM_train:BWD_D,2*1"
 7 | 2048x1024:1024x479n"DLRM_train:BWD_D,3*1"
 8 | 2048x1024:1024x1024n"DLRM_train:BWD_D,4*1"
 9 | 2048x512:512x1024n"DLRM_train:BWD_D,5*1"
10 | 2048x1:1x256n"DLRM_train:BWD_D,7*1"
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_dlrm_bwd_w:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt weights mb=2048
 3 | 
 4 | 13x2048:2048x512n"DLRM_train:BWD_W,0*1"
 5 | 512x2048:2048x256n"DLRM_train:BWD_W,1*2"
 6 | 256x2048:2048x128n"DLRM_train:BWD_W,2*1"
 7 | 479x2048:2048x1024n"DLRM_train:BWD_W,3*1"
 8 | 1024x2048:2048x1024n"DLRM_train:BWD_W,4*1"
 9 | 1024x2048:2048x512n"DLRM_train:BWD_W,5*1"
10 | 256x2048:2048x1n"DLRM_train:BWD_W,7*1"
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_dlrm_fwd:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training forward mb=2048
 3 | 
 4 | 2048x13:13x512n"DLRM_train:FWD,0*1"
 5 | 2048x512:512x256n"DLRM_train:FWD,1*2"
 6 | 2048x256:256x128n"DLRM_train:FWD,2*1"
 7 | 2048x479:479x1024n"DLRM_train:FWD,3*1"
 8 | 2048x1024:1024x1024n"DLRM_train:FWD,4*1"
 9 | 2048x1024:1024x512n"DLRM_train:FWD,5*1"
10 | 2048x256:256x1n"DLRM_train:FWD,7*1"
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_gmnt_bwd_d:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt data mb=128
3 | 
4 | 128x512:512x512n"GNMT_train:BWD_D,0*1"
5 | 128x1024:1024x1024n"GNMT_train:BWD_D,1*1"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_gmnt_bwd_w:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt weights mb=128
3 | 
4 | 512x128:128x512n"GNMT_train:BWD_W,0*1"
5 | 1024x128:128x1024n"GNMT_train:BWD_W,1*1"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_gmnt_fwd:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training forward mb=128
3 | 
4 | 128x512:512x512n"GNMT_train:FWD,0*1"
5 | 128x1024:1024x1024n"GNMT_train:FWD,1*1"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_googlenet_bwd_d:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt data
 3 | 
 4 | # googlenet_v1 mb=128
 5 | 128x1024:1024x2048n"googlenet_v1_train:BWD_D,ip1*1"
 6 | 128x1000:1000x1024n"googlenet_v1_train:BWD_D,ip2*1"
 7 | 
 8 | # inceptionv3 mb=224
 9 | 224x1000:1000x2048n"inceptionv3_train:BWD_D,ip1*1"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_googlenet_bwd_w:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt weights
 3 | 
 4 | # googlenet_v1 mb=128
 5 | 2048x128:128x1024n"googlenet_v1_train:BWD_W,ip1*1"
 6 | 1024x128:128x1000n"googlenet_v1_train:BWD_W,ip2*1"
 7 | 
 8 | # inceptionv3 mb=224
 9 | 2048x224:224x1000n"inceptionv3_train:BWD_W,ip1*1"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_googlenet_fwd:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training forward
 3 | 
 4 | # googlenet_v1 mb=128
 5 | 128x2048:2048x1024n"googlenet_v1_train:FWD,ip1*1"
 6 | 128x1024:1024x1000n"googlenet_v1_train:FWD,ip2*1"
 7 | 
 8 | # inceptionv3 mb=224
 9 | 224x2048:2048x1000n"inceptionv3_train:FWD,ip1*1"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_maskrcnn_bwd_d:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt data mb=512
3 | 
4 | 512x1024:1024x12544n"masknet_train:BWD_D,ip1*1"
5 | 512x1024:1024x1024n"masknet_train:BWD_D,ip2*1"
6 | 512x324:324x1024n"masknet_train:BWD_D,ip3*1"
7 | 512x81:81x1024n"masknet_train:BWD_D,ip4*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_maskrcnn_bwd_w:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt weights mb=512
3 | 
4 | 12544x512:512x1024n"masknet_train:BWD_W,ip1*1"
5 | 1024x512:512x1024n"masknet_train:BWD_W,ip2*1"
6 | 1024x512:512x324n"masknet_train:BWD_W,ip3*1"
7 | 1024x512:512x81n"masknet_train:BWD_W,ip4*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_maskrcnn_fwd:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training forward mb=512
3 | 
4 | 512x12544:12544x1024n"masknet_train:FWD,ip1*1"
5 | 512x1024:1024x1024n"masknet_train:FWD,ip2*1"
6 | 512x1024:1024x324n"masknet_train:FWD,ip3*1"
7 | 512x1024:1024x81n"masknet_train:FWD,ip4*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_ncf_bwd_d:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt data mb=2048
3 | 
4 | 2048x256:256x256n"NCF_train:BWD_D,0*1"
5 | 2048x128:128x256n"NCF_train:BWD_D,1*1"
6 | 2048x64:64x128n"NCF_train:BWD_D,2*1"
7 | 2048x1:1x128n"NCF_train:BWD_D,3*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_ncf_bwd_w:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt weights mb=2048
3 | 
4 | 256x2048:2048x256n"NCF_train:BWD_W,0*1"
5 | 256x2048:2048x128n"NCF_train:BWD_W,1*1"
6 | 128x2048:2048x64n"NCF_train:BWD_W,2*1"
7 | 128x2048:2048x1n"NCF_train:BWD_W,3*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_ncf_fwd:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training forward mb=2048
3 | 
4 | 2048x256:256x256n"NCF_train:FWD,0*1"
5 | 2048x256:256x128n"NCF_train:FWD,1*1"
6 | 2048x128:128x64n"NCF_train:FWD,2*1"
7 | 2048x128:128x1n"NCF_train:FWD,3*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_resnet_bwd_d:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt data
3 | 
4 | # resnet mb=112
5 | 112x1000:1000x2048n"resnet_train:BWD_D,ip1*1"
6 | 
7 | # resnet_sparse mb=64
8 | 64x1000:1000x2048n"resnet_sparse_train:BWD_D,ip1*1"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_resnet_bwd_w:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt weights
3 | 
4 | # resnet mb=112
5 | 2048x112:112x1000n"resnet_train:BWD_W,ip1*1"
6 | 
7 | # resnet_sparse mb=64
8 | 2048x64:64x1000n"resnet_sparse_train:BWD_W,ip1*1"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_resnet_fwd:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training forward
3 | 
4 | # resnet mb=112
5 | 112x2048:2048x1000n"resnet_train:FWD,ip1*1"
6 | 
7 | # resnet_sparse mb=64
8 | 64x2048:2048x1000n"resnet_sparse_train:FWD,ip1*1"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_rnn_t_bwd_d:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt data mb=896 (16 * num_cores)
 3 | 
 4 | 896x4096:4096x240n"RNN-T:BWD_D,Encoder_cell1_Input*2"
 5 | 896x4096:4096x1024n"RNN-T:BWD_D,Encoder_cell1_Hidden*11"
 6 | 896x4096:4096x2048n"RNN-T:BWD_D,Encoder_cell3_Input*1"
 7 | 896x1280:1280x320n"RNN-T:BWD_D,Prediction_Input*12"
 8 | 896x512:512x1344n"RNN-T:BWD_D,JointNet_Linear1*3"
 9 | 896x29:29x512n"RNN-T:BWD_D,JointNet_Linear2*3"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_rnn_t_bwd_w:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt weights mb=896 (16 * num_cores)
 3 | 
 4 | 240x896:896x4096n"RNN-T:BWD_W,Encoder_cell1_Input*2"
 5 | 1024x896:896x4096n"RNN-T:BWD_W,Encoder_cell1_Hidden*11"
 6 | 2048x896:896x4096n"RNN-T:BWD_W,Encoder_cell3_Input*1"
 7 | 320x896:896x1280n"RNN-T:BWD_W,Prediction_Input*12"
 8 | 1344x896:896x512n"RNN-T:BWD_W,JointNet_Linear1*3"
 9 | 512x896:896x29n"RNN-T:BWD_W,JointNet_Linear2*3"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_rnn_t_fwd:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training forward mb=896 (16 * num_cores)
 3 | 
 4 | 896x240:240x4096n"RNN-T:FWD,Encoder_cell1_Input*2"
 5 | 896x1024:1024x4096n"RNN-T:FWD,Encoder_cell1_Hidden*11"
 6 | 896x2048:2048x4096n"RNN-T:FWD,Encoder_cell3_Input*1"
 7 | 896x320:320x1280n"RNN-T:FWD,Prediction_Input*12"
 8 | 896x1344:1344x512n"RNN-T:FWD,JointNet_Linear1*3"
 9 | 896x512:512x29n"RNN-T:FWD,JointNet_Linear2*3"
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_vgg16_bwd_d:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt data mb=64
3 | 
4 | 64x4096:4096x25088n"VGG16_train:BWD_D,ip1*1"
5 | 64x4096:4096x4096n"VGG16_train:BWD_D,ip2*1"
6 | 64x81:81x4096n"VGG16_train:BWD_D,ip3*1"
7 | 64x324:324x4096n"VGG16_train:BWD_D,ip4*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_vgg16_bwd_w:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training backward wrt weights mb=64
3 | 
4 | 25088x64:64x4096n"VGG16_train:BWD_W,ip1*1"
5 | 4096x64:64x4096n"VGG16_train:BWD_W,ip2*1"
6 | 4096x64:64x81n"VGG16_train:BWD_W,ip3*1"
7 | 4096x64:64x324n"VGG16_train:BWD_W,ip4*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_vgg16_fwd:
--------------------------------------------------------------------------------
1 | # These problems are ported from corresponding inner product shapes for
2 | # training forward mb=64
3 | 
4 | 64x25088:25088x4096n"VGG16_train:FWD,ip1*1"
5 | 64x4096:4096x4096n"VGG16_train:FWD,ip2*1"
6 | 64x4096:4096x81n"VGG16_train:FWD,ip3*1"
7 | 64x4096:4096x324n"VGG16_train:FWD,ip4*1"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_wd_bwd_d:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt data
 3 | 
 4 | # mb=40
 5 | 40x1024:1024x845n"WnD-40_train:BWD_D,0*1"
 6 | 40x512:512x1024n"WnD-40_train:BWD_D,1*1"
 7 | 40x256:256x512n"WnD-40_train:BWD_D,2*1"
 8 | 
 9 | # mb=256
10 | #256x1024:1024x845n"WnD-256_train:BWD_D,0*1"
11 | #256x512:512x1024n"WnD-256_train:BWD_D,1*1"
12 | #256x256:256x512n"WnD-256_train:BWD_D,2*1"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_wd_bwd_w:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training backward wrt weights
 3 | 
 4 | # mb=40
 5 | 845x40:40x1024n"WnD-40_train:BWD_W,0*1"
 6 | 1024x40:40x512n"WnD-40_train:BWD_W,1*1"
 7 | 512x40:40x256n"WnD-40_train:BWD_W,2*1"
 8 | 
 9 | # mb=256
10 | #845x256:256x1024n"WnD-256_train:BWD_W,0*1"
11 | #1024x256:256x512n"WnD-256_train:BWD_W,1*1"
12 | #512x256:256x256n"WnD-256_train:BWD_W,2*1"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_wd_fwd:
--------------------------------------------------------------------------------
 1 | # These problems are ported from corresponding inner product shapes for
 2 | # training forward
 3 | 
 4 | # mb=40
 5 | 40x845:845x1024n"WnD-40_train:FWD,0*1"
 6 | 40x1024:1024x512n"WnD-40_train:FWD,1*1"
 7 | 40x512:512x256n"WnD-40_train:FWD,2*1"
 8 | 
 9 | # mb=256
10 | #256x845:845x1024n"WnD-256_train:FWD,0*1"
11 | #256x1024:1024x512n"WnD-256_train:FWD,1*1"
12 | #256x512:512x256n"WnD-256_train:FWD,2*1"
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_multidim:
--------------------------------------------------------------------------------
1 | # Broadcast shapes > 4D
2 | 22x3x1x8x14:22x1x4x14x8
3 | 13x1x1x8x16:1x13x8x16x8
4 | 2x3x1x1x6x2x3:1x1x4x5x6x3x4
5 | 2x1x7x3x1x6x9:2x6x7x3x4x9x6
6 | 3x5x2x9x4x7x3:1x5x2x1x4x3x9
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_sparse:
--------------------------------------------------------------------------------
 1 | 1x1000005:1000005x6
 2 | 1x1000010:1000010x16
 3 | 1x1000025:1000025x17
 4 | 1x1000045:1000045x32
 5 | 1x1000075:1000075x33
 6 | 1x1000100:1000100x64
 7 | 1x1000120:1000120x65
 8 | 1x1000300:1000300x72
 9 | 1x1000500:1000500x96
10 | 1x1000600:1000600x100
11 | 1x1000700:1000700x128
12 | 1x1000800:1000800x131
13 | 
14 | 4x1000005:1000005x6
15 | 4x1000010:1000010x16
16 | 4x1000025:1000025x17
17 | 4x1000045:1000045x32
18 | 4x1000075:1000075x33
19 | 4x1000100:1000100x64
20 | 4x1000120:1000120x65
21 | 4x1000300:1000300x72
22 | 4x1000500:1000500x96
23 | 4x1000600:1000600x100
24 | 4x1000700:1000700x128
25 | 4x1000800:1000800x131
26 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_sparse_packed:
--------------------------------------------------------------------------------
 1 | 1x64:64x128
 2 | 2x64:64x128
 3 | 2x128:128x128
 4 | 2x300:300x128
 5 | 2x300:300x129
 6 | 256x256:256x256
 7 | 2x1024:1024x128
 8 | 2x1030:1030x128
 9 | 2x1030:1030x200
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/shapes_transformer:
--------------------------------------------------------------------------------
1 | # multihead self-attention layer
2 | # mb = 1, num_heads = 16, hidden_size = 1024, t_x = t_y = 40
3 | 16x40x64:16x64x40
4 | 16x40x40:16x40x64
5 | # mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 40
6 | 2048x40x64:2048x64x40
7 | 2048x40x40:2048x40x64
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/test_matmul_bf32_bf16:
--------------------------------------------------------------------------------
 1 | # bf32
 2 | --reset
 3 | --skip-impl=ref,x64:gemm
 4 | --dt=f32 --attr-fpmath=bf16
 5 | --stag=ab,ba --wtag=ab,ba --dtag=ab
 6 | 
 7 | # test any + blocked
 8 | --stag=any --wtag=any,BA16a64b,BA16a48b,BA16a32b,BA16a16b --dtag=any
 9 | --batch=shapes_2d
10 | 
11 | # 3d
12 | --reset
13 | --skip-impl=ref,x64:gemm
14 | --dt=f32 --attr-fpmath=bf16
15 | --stag=abc,acb --wtag=abc,acb --dtag=abc
16 | --batch=shapes_3d
17 | 2x20x30:2x30x4
18 | 2x20x30:1x30x4
19 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/test_matmul_bfloat16_ymm:
--------------------------------------------------------------------------------
1 | # bf16
2 | 
3 | # global benchdnn knob, will not be reset again
4 | --cpu-isa-hints=prefer_ymm
5 | --batch=test_matmul_bfloat16
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/test_matmul_llm_gpu:
--------------------------------------------------------------------------------
1 | --reset
2 | --batch=option_set_fwks_llm_gpu
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/test_matmul_sparse:
--------------------------------------------------------------------------------
 1 | --reset
 2 | --dt=f16:f16:f16,f32:f32:f32
 3 | --dtag=ab
 4 | --encoding=csr+0.9::,:csr+0.9:
 5 | --batch=shapes_sparse
 6 | 
 7 | --reset
 8 | --dt=f16:f16:f16,f32:f32:f32
 9 | --wtag=ab,ba
10 | --dtag=ab
11 | --encoding=coo+0.9::,:coo+0.9:
12 | --batch=shapes_sparse
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/test_matmul_sparse_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | --dt=f16:f16:f16,f32:f32:f32
 3 | --dtag=ab
 4 | --encoding=csr+0.99::,:csr+0.99:
 5 | --batch=shapes_sparse
 6 | 
 7 | --reset
 8 | --dt=f16:f16:f16,f32:f32:f32
 9 | --dtag=ab
10 | --encoding=coo+0.99::,:coo+0.99:
11 | --batch=shapes_sparse
12 | 
13 | --dt=u8:s8:s32,s8:s8:s32,u8:s8:f32,s8:s8:f32
14 | --encoding=:packed+0.99:,:packed+0.5:,:packed+0.0:,:packed+1.0:
15 | --batch=shapes_sparse_packed
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/matmul/test_matmul_sparse_gpu:
--------------------------------------------------------------------------------
1 | --reset
2 | --batch=test_matmul_sparse
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/harness_pool_regression:
--------------------------------------------------------------------------------
1 | # This shape dispatches to jit:ir and has
2 | #  (1) no extra zero-padding, so no zero-out statement, and
3 | #  (2) a number of channels that requires a 3-register (mod 4) accumulation
4 | #      buffer (ic=208 has the same issue)
5 | --reset --dir=FWD_I --alg=pooling_max --dt=f16:f16 --tag=aBcd16b mb1ic80_ih160oh160kh3sh1dh0ph1_iw160ow160kw3sw1dw0pw1
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/harness_pool_smoke_ref:
--------------------------------------------------------------------------------
 1 | --reset
 2 | --skip-impl=ir # Intentionally test ocl impl coverage
 3 | --check-ref-impl=
 4 | --match=.*pool_ci_2d.* # Use 2d problems only from shapes_basic
 5 | --mb=2
 6 | --tag=axb
 7 | --alg=max,avg_np,avg_p
 8 | 
 9 | # Training
10 | --dt=f32,bf16,f16
11 | --dir=FWD_D,BWD_D
12 | --batch=shapes_basic
13 | 
14 | # Inference
15 | --dir=FWD_I
16 | --tag=axb
17 | --dt=f16,s8,u8
18 | --attr-post-ops=,add:f32:per_oc
19 | --batch=shapes_basic
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/harness_pooling_different_dt:
--------------------------------------------------------------------------------
1 | --reset
2 | --mb=2
3 | --dt=s8:u8,u8:s8,u8:f32,f32:u8,s8:f32,f32:s8,u8:f16,f16:u8,s8:f16,f16:s8
4 | --dir=FWD_I
5 | --tag=axb
6 | --alg=max,avg_np,avg_p
7 | --attr-post-ops=,add:f32:per_oc
8 | --batch=shapes_1d
9 | --batch=shapes_2d_small


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/perf_pool_gpu:
--------------------------------------------------------------------------------
 1 | # mb=1, inference only
 2 | --reset
 3 | --mb=1
 4 | 
 5 | --dt=f32,s8
 6 | --dir=FWD_I
 7 | --alg=max,avg_np,avg_p
 8 | --tag=axb,aBx16b
 9 | 
10 | --batch=set_topologies_gpu
11 | 
12 | # mb>1, inference only
13 | --reset
14 | --mb=16,32,64
15 | 
16 | --dt=f32,s8
17 | --dir=FWD_I
18 | --alg=max,avg_np,avg_p
19 | --tag=axb,ABx16a16b
20 | 
21 | --batch=set_topologies_gpu
22 | 
23 | # mb>1, training only
24 | --reset 
25 | --mb=16,32,64
26 | 
27 | --dt=f32
28 | --dir=FWD_D,BWD_D
29 | --alg=max,avg_np,avg_p
30 | --tag=axb,ABx16a16b
31 | 
32 | --batch=set_topologies_gpu
33 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/set_all:
--------------------------------------------------------------------------------
1 | --batch=shapes_1d
2 | --batch=shapes_2d
3 | --batch=shapes_3d
4 | 
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/set_all_small:
--------------------------------------------------------------------------------
1 | --batch=shapes_1d
2 | --batch=shapes_2d_small
3 | --batch=shapes_3d_small
4 | 
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/set_topologies:
--------------------------------------------------------------------------------
1 | --batch=shapes_3d_unet
2 | --batch=shapes_alexnet
3 | --batch=shapes_googlenet_v1
4 | --batch=shapes_googlenet_v3
5 | --batch=shapes_i3d_resnet50_v1
6 | --batch=shapes_resnet_50
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/set_topologies_gpu:
--------------------------------------------------------------------------------
1 | --batch=shapes_alexnet
2 | --batch=shapes_googlenet_v1
3 | --batch=shapes_googlenet_v3
4 | --batch=shapes_resnet_50
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/shapes_3d_unet:
--------------------------------------------------------------------------------
1 | # 3d unet
2 | 
3 | mb1ic64_id64od32_kd2sd2n"3d_unet:max_pool1"
4 | mb1ic128_id28od14_kd2sd2n"3d_unet:max_pool2"
5 | mb1ic256_id12od6_kd2sd2n"3d_unet:max_pool3"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/shapes_alexnet:
--------------------------------------------------------------------------------
1 | # alexnet
2 | 
3 | mb256ic96_ih55oh27_kh3sh2n"alexnet:max_pool1"
4 | mb256ic256_ih27oh13_kh3sh2n"alexnet:max_pool2"
5 | mb256ic256_ih13oh6_kh3sh2n"alexnet:max_pool5"
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/shapes_global_pooling:
--------------------------------------------------------------------------------
1 | mb2ic5_iw11ow1_kw11sw11pw0_ih7oh1_kh7sh7ph0
2 | mb2ic5_iw11ow1_kw11sw11pw0_ih1oh1_kh1sh1ph0
3 | mb2ic5_iw11ow1_kw11sw11pw0_ih1oh1_kh1sh1ph0_iw15ow1_kw15sw15pw0
4 | mb4ic32_iw32ow1_kw32sw32pw0_ih32oh1_kh32sh32ph0
5 | mb4ic16_iw8ow1_kw8sw8pw0_ih16oh1_kh16sh16ph0
6 | mb16ic16_iw8ow1_kw8sw8pw0_ih4oh1_kh4sh4ph0
7 | mb32ic16_iw8ow1_kw8sw8pw0_ih4oh1_kh4sh4ph0
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/shapes_i3d_resnet50_v1:
--------------------------------------------------------------------------------
1 | # i3d resnet50 v1
2 | 
3 | mb1ic64_id16od8kd1sd2pd0_ih112oh56kh3sh2ph1_iw112ow56kw3sw2pw1_n"i3d_resnet50_v1:max_pool1"
4 | mb1ic256_id8od4kd2sd2pd0_ih56oh56kh1sh1ph0_iw56ow56kw1sw1pw0_n"i3d_resnet50_v1:max_pool2"
5 | mb1ic2048_id4od1kd4sd1pd0_ih7oh1kh7sh1ph0_iw7ow1kw7sw1pw0_n"i3d_resnet50_v1:max_pool3"
6 | 
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/shapes_large_pool:
--------------------------------------------------------------------------------
 1 | # Large iw
 2 | mb1ic32iw134217732kw7sw5
 3 | mb1ic1iw4294967311kw7sw5
 4 | 
 5 | # Large mb
 6 | mb4294967311ic1iw1pw1kw3
 7 | 
 8 | # Large ic
 9 | mb1ic4294967311iw1pw1kw3
10 | mb1ic4294967311iw1kw1
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/shapes_resnet_50:
--------------------------------------------------------------------------------
1 | # resnet_50
2 | 
3 | mb50ic64_ih112oh56_kh3sh2n"resnet_50:max_pool1"
4 | # mb50ic256_ih56oh28_khXshXn"resnet_50:res2c_max_pool"
5 | # mb50ic512_ih28oh14_khXshXn"resnet_50:res3d_max_pool"
6 | # mb50ic1024_ih14oh7_khXshXn"resnet_50:res4f_max_pool"
7 | mb50ic2048_ih7oh1_kh7sh1n"resnet_50:ave_pool5"
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/test_pool_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | --mb=2
 3 | 
 4 | --alg=max,avg_np,avg_p
 5 | # Training
 6 | --dt=f32,bf16,f16
 7 | --dir=FWD_D,BWD_D
 8 | --tag=abx,axb
 9 | --batch=shapes_basic
10 | 
11 | # Inference
12 | --dir=FWD_I
13 | --tag=axb
14 | 
15 | ## All inference configs
16 | --dt=f32,bf16,f16,s32,s8,u8, \
17 |       s8:u8,u8:s8,s8:f32,f32:s8,u8:f32,f32:u8,s8:f16,f16:s8,u8:f16,f16:u8
18 | --batch=shapes_basic
19 | 
20 | ## Attributes
21 | --dt=f32,bf16,f16,s32,s8,u8
22 | --attr-post-ops=add:f32:per_oc,linear:0.5:-1
23 | --batch=shapes_basic
24 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/test_pool_float16:
--------------------------------------------------------------------------------
 1 | # f16
 2 | --reset
 3 | --mb=2
 4 | 
 5 | --dt=f16
 6 | --alg=max,avg_np,avg_p
 7 | # Training
 8 | --tag=abx,axb
 9 | 
10 | --dir=FWD_D,BWD_D
11 | --batch=set_all
12 | --batch=set_topologies
13 | 
14 | --dir=FWD_D
15 | --attr-post-ops=add:f32:per_oc,linear:0.5:-1
16 | --batch=set_all_small
17 | 
18 | # Inference
19 | --dir=FWD_I
20 | --tag=axb
21 | --batch=set_all
22 | 
23 | --attr-post-ops=add:f16,linear:0.5:-1
24 | --batch=set_all_small
25 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/test_pool_fp8:
--------------------------------------------------------------------------------
 1 | # fp8
 2 | --reset
 3 | --mb=2
 4 | 
 5 | --dt=f8_e5m2,f8_e4m3
 6 | --alg=max,avg_np,avg_p
 7 | # Training
 8 | --tag=abx,axb,aBx16b
 9 | 
10 | --dir=FWD_D
11 | --batch=set_all
12 | --batch=set_topologies
13 | 
14 | --dir=FWD_D
15 | --attr-post-ops=add:f32:per_oc,linear:0.5:-1
16 | --batch=set_all_small
17 | 
18 | # Inference
19 | --dir=FWD_I
20 | --tag=axb
21 | --batch=set_all
22 | 
23 | --attr-post-ops=add:f16,linear:0.5:-1
24 | --batch=set_all_small
25 | 
26 | --attr-post-ops=add:f8_e5m2,linear:0.5:-1
27 | --batch=set_all_small
28 | 
29 | --attr-post-ops=add:f8_e4m3,linear:0.5:-1
30 | --batch=set_all_small
31 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/test_pool_large_gpu:
--------------------------------------------------------------------------------
 1 | # Implicitly test FWD_D via BWD_D
 2 | --dir=FWD_I,BWD_D
 3 | --dt=bf16:bf16
 4 | --alg=max,avg_p
 5 | --tag=axb
 6 | 
 7 | --impl=jit
 8 | --batch=shapes_large_pool
 9 | 
10 | # Test both gen_pooling and xe_global pooling in the same pass
11 | --impl=xe
12 | --batch=shapes_large_pool
13 | 
14 | --impl=ref
15 | --batch=shapes_large_pool
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/pool/test_pool_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*pool_ci_2d.* # Use 2d problems only from shapes_basic
 4 | --mb=2
 5 | --tag=axb
 6 | --alg=max,avg_np,avg_p
 7 | 
 8 | # Training
 9 | --dt=f32,bf16,f16
10 | --dir=FWD_D,BWD_D
11 | --batch=shapes_basic
12 | 
13 | # Inference
14 | --dir=FWD_I
15 | --tag=axb
16 | --dt=f16,s8,u8
17 | --attr-post-ops=,add:f32:per_oc
18 | --batch=shapes_basic
19 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/prelu/option_set_all:
--------------------------------------------------------------------------------
 1 | # tag:any
 2 | --stag=abx:any, \
 3 |        axb:any, \
 4 |        aBx8b:any, \
 5 |        aBx16b:any
 6 | --batch=shapes_all
 7 | 
 8 | # tag:tag
 9 | --stag=abx:abx, \
10 |        axb:axb, \
11 |        aBx8b:aBx8b, \
12 |        aBx16b:aBx16b
13 | --batch=shapes_all
14 | 
15 | # tag1:tag2
16 | --stag=abx:axb, \
17 |        axb:aBx8b, \
18 |        aBx16b:abx
19 | --batch=shapes_all
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/prelu/test_prelu_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dir=FWD_D,BWD_DW
 4 | --sdt=f32,s32,s8,u8
 5 | --batch=option_set_all
 6 | 
 7 | --batch=test_prelu_bfloat16
 8 | 
 9 | --batch=test_prelu_float16
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/prelu/test_prelu_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --dir=FWD_D,BWD_DW
4 | --sdt=bf16:f32,bf16
5 | --batch=option_set_all
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/prelu/test_prelu_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --stag=abx:any,axb:any,abx:abx,axb:axb
 4 | 
 5 | --dir=FWD_D,BWD_DW
 6 | --sdt=f32,bf16:f32,bf16,f16,f16:f32
 7 | --batch=shapes_ci
 8 | 
 9 | --dir=FWD_I
10 | --sdt=s8:s8,u8:u8,s8:bf16,u8:bf16,s8:f32,u8:f32
11 | --batch=shapes_ci
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/prelu/test_prelu_float16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --dir=FWD_D,BWD_DW
4 | --sdt=f16:f32,f16
5 | --batch=option_set_all
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/prelu/test_prelu_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*prelu_ci_2d.* # Use 2d problems only from shapes_ci
 4 | --stag=axb:any
 5 | 
 6 | --dir=FWD_D,BWD_DW
 7 | --sdt=f32,bf16,f16
 8 | --batch=shapes_ci
 9 | 
10 | --dir=FWD_I
11 | --sdt=s8,u8
12 | --batch=shapes_ci
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/harness_reduction_bf16:
--------------------------------------------------------------------------------
1 | # bf16
2 | --reset
3 | 
4 | --sdt=bf16 --ddt=bf16,f32
5 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc
6 | --batch=option_set_all_algs
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/harness_reduction_f16:
--------------------------------------------------------------------------------
1 | # f16
2 | --reset
3 | 
4 | --sdt=f16 --ddt=f16,f32
5 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc
6 | --batch=option_set_all_algs
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/harness_reduction_f32:
--------------------------------------------------------------------------------
1 | # f32
2 | --reset
3 | 
4 | --sdt=f32 --ddt=f32
5 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc
6 | --batch=option_set_all_algs
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/harness_reduction_i8:
--------------------------------------------------------------------------------
 1 | # i8
 2 | --reset
 3 | 
 4 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc
 5 | --sdt=u8 --ddt=u8,s32,f32
 6 | --batch=option_set_all_algs_int8
 7 | 
 8 | --sdt=s8 --ddt=s8,s32,f32
 9 | --batch=option_set_all_algs_int8
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/option_set_all_algs:
--------------------------------------------------------------------------------
1 | # Algorithm coverage based on p and eps validity
2 | --p=1,2 --eps=0.5
3 | --alg=norm_lp_max,norm_lp_sum,norm_lp_power_p_max,norm_lp_power_p_sum
4 | --batch=option_set_all
5 | 
6 | --p= --eps=
7 | --alg=sum,mul,max,min,mean
8 | --batch=option_set_all
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/option_set_all_algs_ci:
--------------------------------------------------------------------------------
1 | # Algorithm coverage based on p and eps validity
2 | --p=1,2 --eps=0.5
3 | --alg=norm_lp_max,norm_lp_sum,norm_lp_power_p_max,norm_lp_power_p_sum
4 | --batch=shapes_ci
5 | 
6 | --p= --eps=
7 | --alg=sum,mul,max,min,mean
8 | --batch=shapes_ci
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/option_set_all_algs_int8:
--------------------------------------------------------------------------------
1 | # i8
2 | --alg=sum,mul,max,min,mean
3 | --batch=option_set_all
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/option_set_all_algs_int8_ci:
--------------------------------------------------------------------------------
1 | # i8
2 | --alg=sum,mul,max,min,mean
3 | --batch=shapes_ci
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/perf_reduction_gpu:
--------------------------------------------------------------------------------
 1 | # Test layers of some key & extended GPU DL Frameworks
 2 | --reset
 3 | --batch=option_set_fwks_key_gpu
 4 | 
 5 | --reset
 6 | --batch=option_set_fwks_ext_gpu
 7 | 
 8 | # Test nested cases
 9 | --reset
10 | --batch=shapes_nested_gpu
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/shapes_ci:
--------------------------------------------------------------------------------
1 | 5x3x2x9:1x3x2x9
2 | 32x17x2x3:32x17x1x1
3 | 32x17x2x3:1x17x1x1
4 | 15x12x3x5:15x1x1x1
5 | 15x12x3x5:1x1x1x1
6 | 12x12:1x12
7 | 10x16x32:10x1x32
8 | 1x17x64:1x1x64
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/shapes_gpu_all:
--------------------------------------------------------------------------------
 1 | # ND single-dim reduction
 2 | 10000:1
 3 | 8000x32:1x32
 4 | 32x4096x16:32x1x16
 5 | 2x52x128x36:2x1x128x36
 6 | 17x17x15x15x35:1x17x15x15x35
 7 | 11x23x16x16x7x2:11x1x16x16x7x2
 8 | 
 9 | # 2-dim reduction
10 | 16x3x3x30:16x1x1x30
11 | 16x128x4x64:16x128x1x1
12 | 128x53x17:128x1x1
13 | 
14 | # 3-dim reduction
15 | 4x16x3x4:1x1x1x4
16 | 4x16x3x4:1x1x3x1
17 | 4x16x3x4:1x16x1x1
18 | 4x16x3x4:4x1x1x1
19 | 
20 | # Full reduction
21 | 4192x17:1x1
22 | 36x640x4:1x1x1
23 | 16x16x32x16:1x1x1x1
24 | 
25 | # Split reduction
26 | 16x16x32x32:1x16x1x32
27 | 16x16x32x32:16x1x32x1
28 | 5x15x7x17x9:1x15x1x17x1
29 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/test_reduction_all:
--------------------------------------------------------------------------------
1 | # all
2 | --reset
3 | 
4 | --batch=harness_reduction_f32
5 | --batch=harness_reduction_i8
6 | --batch=test_reduction_bfloat16
7 | --batch=test_reduction_float16
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/test_reduction_bfloat16:
--------------------------------------------------------------------------------
1 | # bf16
2 | --reset
3 | 
4 | --batch=harness_reduction_bf16
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/test_reduction_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --stag=abx,axb --dtag=abx,axb,any
 4 | --attr-post-ops=,sum+linear:2:1+add:f32
 5 | 
 6 | --sdt=f32 --ddt=f32
 7 | --batch=option_set_all_algs_ci
 8 | 
 9 | --sdt=bf16 --ddt=bf16,f32
10 | --batch=option_set_all_algs_ci
11 | 
12 | --sdt=f16 --ddt=f16,f32
13 | --batch=option_set_all_algs_ci
14 | 
15 | --sdt=s8 --ddt=s8,s32,f32
16 | --batch=option_set_all_algs_int8_ci
17 | 
18 | --sdt=u8 --ddt=u8,s32,f32
19 | --batch=option_set_all_algs_int8_ci
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/test_reduction_float16:
--------------------------------------------------------------------------------
1 | # f16
2 | --reset
3 | 
4 | --batch=harness_reduction_f16
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reduction/test_reduction_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --stag=axb --dtag=any
 4 | --attr-post-ops=,sum+linear:2:1+add:f32
 5 | 
 6 | --sdt=f32 --ddt=f32
 7 | --batch=option_set_all_algs_ci
 8 | 
 9 | --sdt=bf16 --ddt=bf16
10 | --batch=option_set_all_algs_ci
11 | 
12 | --sdt=f16 --ddt=f16
13 | --batch=option_set_all_algs_ci
14 | 
15 | --sdt=s8 --ddt=s8
16 | --batch=option_set_all_algs_int8_ci
17 | 
18 | --sdt=u8 --ddt=u8
19 | --batch=option_set_all_algs_int8_ci
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reorder/harness_reorder_large:
--------------------------------------------------------------------------------
 1 | # test if jit kernels properly handle corner cases:
 2 | # * large stride problems
 3 | # * huge dimensions (UINT_MAX + 1)
 4 | --reset
 5 | --skip-impl=ref,simple # run only jit impl, won't iterate
 6 | --sdt=f32
 7 | --ddt=f32
 8 | --stag=abx
 9 | --dtag=aBx8b
10 | 2x16x19200x19200
11 | 1x4294967296x1
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reorder/harness_reorder_regression:
--------------------------------------------------------------------------------
 1 | # Blocked format with tail processing in compensation
 2 | --reset
 3 | --sdt=s8 --ddt=s8 --oflag=zp_comp:3
 4 | --stag=aBxC16b4c --dtag=xcab
 5 | 2x2x32x1x3 1x15x32x1 2x17x32x5
 6 | 
 7 | # test if jit kernel applies zero-point
 8 | --reset
 9 | --skip-impl=ref,simple # ! test jit version only
10 | --sdt=u8 --ddt=f32
11 | --stag=abdc --dtag=abcd
12 | --attr-zero-points=src0:common:1
13 | 1x32x128x33
14 | 
15 | # Test bf16 with aBcde4b format
16 | --reset
17 | --skip-impl=simple #skip non-jit version
18 | --sdt=bf16 --ddt=bf16
19 | --stag=aBcde4b --dtag=aBcde4b
20 | 2x24x19x19x19
21 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reorder/harness_reorder_saturation:
--------------------------------------------------------------------------------
 1 | # basic checks for saturation
 2 | --reset
 3 | 
 4 | --sdt=f32,s32,s8,u8
 5 | --ddt=f32,s32,s8,u8
 6 | 
 7 | --attr-scales=src:common:4294967295
 8 | --stag=abx,axb,aBx4b,aBx8b,aBx16b
 9 | --dtag=abx,axb,aBx4b,aBx8b,aBx16b
10 | 1x17x9x5 2x64x3x3
11 | 
12 | # checks for int overflow
13 | --reset
14 | 
15 | --sdt=s32
16 | --ddt=f32,s8
17 | 
18 | --attr-scales=src:common:4294967295
19 | --attr-zero-points=src:common:1
20 | --stag=abx,axb,aBx4b,aBx8b,aBx16b
21 | --dtag=abx,axb,aBx4b,aBx8b,aBx16b
22 | 1x17x9x5
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/reorder/test_reorder_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --sdt=f32,bf16,f16,s32,s8,u8
 4 | --ddt=f32,bf16,f16,s32,s8,u8
 5 | --attr-scales=,src:per_dim_1+dst:per_dim_1
 6 | --attr-zero-points=,src:common:-1+dst:common:2
 7 | --attr-post-ops=,sum:0.5
 8 | --runtime-dim-mask=0,63
 9 | --stag=abx
10 | --dtag=axb
11 | --oflag=
12 | 2x16x3x4 1x17x5x3
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/set_all:
--------------------------------------------------------------------------------
1 | --batch=shapes_1d
2 | --batch=shapes_2d
3 | --batch=shapes_3d
4 | --batch=shapes_maskrcnn
5 | 
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/shapes_1d:
--------------------------------------------------------------------------------
 1 | # random problems
 2 | 
 3 | # upsampling
 4 | mb16ic2_iw2_ow6
 5 | mb16ic64_iw32_ow64
 6 | mb1ic32_iw151_ow300
 7 | mb4ic17_iw17_ow20
 8 | 
 9 | # downsampling
10 | mb1ic8_iw14_ow7
11 | mb1ic3_iw20_ow17
12 | mb1ic1_iw21_ow13
13 | mb1ic32_iw32_ow6
14 | mb2ic5_iw42_ow14
15 | mb1ic23_iw525_ow5
16 | 
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/shapes_2d:
--------------------------------------------------------------------------------
 1 | # random problems
 2 | 
 3 | # upsampling
 4 | mb1ic8_ih6oh6_iw24ow24
 5 | mb1ic8_ih3oh6_iw7ow21
 6 | mb2ic16_ih5oh3_iw10ow11
 7 | mb12ic8_ih5oh3_iw7ow11
 8 | 
 9 | # downsampling
10 | mb1ic8_ih14oh14_iw7ow7
11 | mb1ic8_ih14oh6_iw7ow3
12 | mb2ic16_ih15oh73_iw10ow11
13 | mb12ic8_ih15oh3_iw5ow11
14 | 
15 | # mixed
16 | mb16ic18_ih14oh6_iw7ow12
17 | mb16ic8_ih4oh63_iw9ow13
18 | 
19 | # tails for blocked format
20 | mb4ic17_ih6oh7_iw12ow14
21 | mb4ic23_ih60oh60_iw30ow75
22 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/shapes_3d:
--------------------------------------------------------------------------------
 1 | # random problems
 2 | 
 3 | # upsampling
 4 | ic8_id2ih4iw6_od4oh8ow12
 5 | ic9_id3ih6iw4_od4oh8ow9
 6 | 
 7 | # downsampling
 8 | ic16_id6ih6iw6_od3oh3ow3
 9 | ic19_id6ih6iw6_od4oh4ow4
10 | 
11 | # mixed
12 | mb4ic16_id6ih6iw6_od12oh6ow3
13 | mb32ic32_id31ih50iw31_od16oh77ow16
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/shapes_maskrcnn:
--------------------------------------------------------------------------------
1 | mb1ic256ih34iw25oh68ow50n"maskrcnn1"
2 | mb1ic256ih68iw50oh136ow100n"maskrcnn2"
3 | mb1ic256ih136iw100oh272ow200n"maskrcnn3"


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/test_resampling_bfloat16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # bf16
 4 | --mb=2
 5 | --sdt=bf16
 6 | --ddt=bf16
 7 | --dir=FWD_D,BWD_D
 8 | --alg=nearest,linear
 9 | --tag=abx,axb,aBx8b,aBx16b
10 | --batch=set_all
11 | 
12 | # post-ops
13 | --dir=FWD_D
14 | --sdt=bf16
15 | --ddt=bf16
16 | --attr-post-ops=add:bf16
17 | --batch=shapes_ci
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/test_resampling_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --mb=2
 4 | --tag=abx,axb
 5 | --alg=nearest,linear
 6 | 
 7 | --dir=FWD_D
 8 | --attr-post-ops=,sum+add:f32
 9 | 
10 | --sdt=f32 --ddt=f32,s8
11 | --batch=shapes_ci
12 | 
13 | --sdt=bf16 --ddt=bf16
14 | --batch=shapes_ci
15 | 
16 | --sdt=f16 --ddt=f16
17 | --batch=shapes_ci
18 | 
19 | # backward
20 | --dir=BWD_D
21 | --attr-post-ops=
22 | 
23 | --sdt=f32 --ddt=f32
24 | --batch=shapes_ci
25 | 
26 | --sdt=bf16 --ddt=bf16
27 | --batch=shapes_ci
28 | 
29 | --sdt=f16 --ddt=f16
30 | --batch=shapes_ci
31 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/test_resampling_float16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # f16
 4 | --mb=2
 5 | --sdt=f16
 6 | --ddt=f16
 7 | --dir=FWD_D,BWD_D
 8 | --alg=nearest,linear
 9 | --tag=abx,axb
10 | --batch=set_all
11 | 
12 | # post ops
13 | --dir=FWD_D
14 | --sdt=f16
15 | --ddt=f16
16 | --attr-post-ops=add:f16
17 | --batch=shapes_ci
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/resampling/test_resampling_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*resampling_ci_2d.* # Use 2d problems only from shapes_ci
 4 | --mb=2
 5 | --tag=axb
 6 | --alg=nearest,linear
 7 | 
 8 | --dir=FWD_D
 9 | --attr-post-ops=,sum+add:f32
10 | 
11 | --sdt=f32 --ddt=f32
12 | --batch=shapes_ci
13 | 
14 | --sdt=bf16 --ddt=bf16
15 | --batch=shapes_ci
16 | 
17 | --sdt=f16 --ddt=f16
18 | --batch=shapes_ci
19 | 
20 | # backward
21 | --dir=BWD_D
22 | --attr-post-ops=
23 | 
24 | --sdt=f32 --ddt=f32
25 | --batch=shapes_ci
26 | 
27 | --sdt=bf16 --ddt=bf16
28 | --batch=shapes_ci
29 | 
30 | --sdt=f16 --ddt=f16
31 | --batch=shapes_ci
32 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_augru_bf32:
--------------------------------------------------------------------------------
 1 | # bf32
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_AUGRU,LBR_AUGRU
 6 | --activation=UNDEF
 7 | --attr-fpmath=bf16
 8 | --prop=FWD_I
 9 | 
10 | # small problems
11 | --direction=left2right
12 | --batch=option_set_small
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_augru_bfloat16:
--------------------------------------------------------------------------------
 1 | # bf16
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_AUGRU,LBR_AUGRU
 6 | --activation=UNDEF
 7 | --cfg=bf16f32,bf16
 8 | --prop=FWD_I,BWD_DW
 9 | 
10 | # small problems
11 | --direction=left2right
12 | --batch=option_set_small
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_augru_float16:
--------------------------------------------------------------------------------
 1 | # f16
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_AUGRU,LBR_AUGRU
 6 | --activation=UNDEF
 7 | --cfg=f16f32,f16
 8 | --prop=FWD_I,BWD_DW
 9 | 
10 | # small problems
11 | --direction=left2right
12 | --batch=option_set_small
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_gru_bf32:
--------------------------------------------------------------------------------
 1 | # bf32
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_GRU,LBR_GRU
 6 | --activation=UNDEF
 7 | --attr-fpmath=bf16
 8 | --prop=FWD_I
 9 | 
10 | # small problems
11 | --direction=left2right,right2left,concat,sum
12 | --batch=option_set_small
13 | 
14 | # large problems
15 | --direction=left2right
16 | --batch=option_set_large
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_gru_bfloat16:
--------------------------------------------------------------------------------
 1 | # bf16
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_GRU,LBR_GRU
 6 | --activation=UNDEF
 7 | --cfg=bf16f32,bf16
 8 | --prop=FWD_I,BWD_DW
 9 | 
10 | # small problems
11 | --direction=left2right,right2left,concat,sum
12 | --batch=option_set_small
13 | 
14 | # large problems
15 | --direction=left2right
16 | --batch=option_set_large
17 | 
18 | --prop=BWD_DW
19 | --flags=O
20 | 
21 | # small problems
22 | --direction=left2right,right2left,concat,sum
23 | --batch=option_set_small
24 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_gru_f32:
--------------------------------------------------------------------------------
 1 | # f32
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_GRU,LBR_GRU
 6 | --activation=UNDEF
 7 | --cfg=f32
 8 | --prop=FWD_I,BWD_DW
 9 | 
10 | # small problems
11 | --direction=left2right,right2left,concat,sum
12 | --batch=option_set_small
13 | 
14 | # large problems
15 | --direction=left2right
16 | --batch=option_set_large
17 | 
18 | --prop=BWD_DW
19 | --flags=O
20 | 
21 | # small problems
22 | --direction=left2right,right2left,concat,sum
23 | --batch=option_set_small
24 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_gru_float16:
--------------------------------------------------------------------------------
 1 | # f16
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_GRU,LBR_GRU
 6 | --activation=UNDEF
 7 | --cfg=f16f32,f16
 8 | --prop=FWD_I,BWD_DW
 9 | 
10 | # small problems
11 | --direction=left2right,right2left,concat,sum
12 | --batch=option_set_small
13 | 
14 | # large problems
15 | --direction=left2right
16 | --batch=option_set_large
17 | 
18 | --prop=BWD_DW
19 | --flags=O
20 | 
21 | # small problems
22 | --direction=left2right,right2left,concat,sum
23 | --batch=option_set_small
24 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_gru_int8:
--------------------------------------------------------------------------------
 1 | # int8
 2 | --reset
 3 | 
 4 | --trivial-strides=true
 5 | --prop=FWD_I
 6 | --alg=VANILLA_GRU
 7 | --activation=UNDEF
 8 | 
 9 | # small problems
10 | --cfg=u8u8u8u8,u8u8u8f32,f32u8f32u8,f32u8f32f32
11 | --direction=left2right,right2left,concat,sum
12 | --scaling=common,per_oc
13 | --batch=option_set_small
14 | 
15 | # large problems
16 | --cfg=u8u8u8u8
17 | --direction=left2right
18 | --scaling=per_oc
19 | --batch=option_set_large
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_gru_regression:
--------------------------------------------------------------------------------
1 | # int8 SIC != SLC
2 | --reset
3 | --trivial-strides=true --prop=FWD_I --alg=VANILLA_GRU --activation=UNDEF
4 | --direction=left2right --cfg=u8u8u8f32 l1t47mb100sic128slc256dhc128dic128
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_lstm_bf32:
--------------------------------------------------------------------------------
 1 | # bf32
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --prop=FWD_I
 6 | --alg=VANILLA_LSTM
 7 | --activation=UNDEF
 8 | --attr-fpmath=bf16
 9 | --with-peephole=false,true
10 | --with-projection=false,true
11 | 
12 | # small problems
13 | --direction=left2right,right2left,concat,sum
14 | --batch=option_set_small
15 | --batch=option_set_lstmp_small
16 | 
17 | # large problems
18 | --direction=left2right
19 | --batch=option_set_large
20 | --batch=option_set_lstmp_large
21 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/harness_rnn_bf32:
--------------------------------------------------------------------------------
 1 | # bf32
 2 | --reset
 3 | 
 4 | --trivial-strides=true,false
 5 | --alg=VANILLA_RNN
 6 | --attr-fpmath=bf16
 7 | --prop=FWD_I
 8 | 
 9 | # small test case to check GEMM and non-GEMM ops accuracy
10 | --direction=left2right,right2left,concat,sum
11 | --activation=RELU,TANH,LOGISTIC
12 | --batch=option_set_small
13 | 
14 | # large cases - test linear activations
15 | --direction=left2right
16 | --activation=RELU
17 | --batch=option_set_large
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_gnmt_decoder:
--------------------------------------------------------------------------------
1 | --direction=left2right
2 | l1t1sic512slc768dhc512n"GNMT:decoder_0"
3 | l1t1sic512slc1024dhc512n"GNMT:decoder_1"


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_large:
--------------------------------------------------------------------------------
1 | # option set to run all reasonable large shapes
2 | 
3 | --skip-nonlinear=true
4 | --tag=tnc:any:tnc,ntc:any:ntc
5 | --batch=shapes_large
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_lstmp_large:
--------------------------------------------------------------------------------
1 | # option set to run all reasonable large lstmp shapes
2 | 
3 | --skip-nonlinear=true
4 | --batch=shapes_lstmp_large
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_lstmp_small:
--------------------------------------------------------------------------------
1 | # option set to run all reasonable small lstmp shapes
2 | 
3 | --skip-nonlinear=false
4 | --batch=shapes_lstmp_small
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_perf_inference_lb:
--------------------------------------------------------------------------------
 1 | # inference_lb -- inference with large batch size
 2 | 
 3 | --alg=VANILLA_LSTM
 4 | --activation=UNDEF
 5 | 
 6 | --mb=640
 7 | --batch=option_set_gnmt_decoder
 8 | 
 9 | --mb=64
10 | --batch=option_set_gnmt_encoder
11 | 
12 | --mb=64
13 | --alg=VANILLA_RNN
14 | --direction=left2right
15 | --activation=TANH
16 | --batch=shapes_deepspeech_2
17 | 
18 | --mb=64
19 | --alg=LBR_GRU
20 | --direction=left2right
21 | --activation=UNDEF
22 | --batch=shapes_deepspeech_2
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_perf_inference_sb:
--------------------------------------------------------------------------------
 1 | # inference_sb -- inference with small batch size
 2 | 
 3 | --alg=VANILLA_LSTM
 4 | --activation=UNDEF
 5 | 
 6 | --mb=10
 7 | --batch=option_set_gnmt_decoder
 8 | 
 9 | --mb=1
10 | --batch=option_set_gnmt_encoder
11 | 
12 | --mb=1
13 | --alg=VANILLA_RNN
14 | --direction=left2right
15 | --activation=TANH
16 | --batch=shapes_deepspeech_2
17 | 
18 | --mb=1
19 | --alg=LBR_GRU
20 | --direction=left2right
21 | --activation=UNDEF
22 | --batch=shapes_deepspeech_2
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_perf_training:
--------------------------------------------------------------------------------
 1 | --mb=128
 2 | 
 3 | --alg=VANILLA_LSTM
 4 | --activation=UNDEF
 5 | --batch=option_set_gnmt_decoder
 6 | --batch=option_set_gnmt_encoder
 7 | 
 8 | --alg=VANILLA_RNN
 9 | --direction=left2right
10 | --activation=TANH
11 | --batch=shapes_deepspeech_2
12 | 
13 | --alg=LBR_GRU
14 | --direction=left2right
15 | --activation=UNDEF
16 | --batch=shapes_deepspeech_2
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_rnnt:
--------------------------------------------------------------------------------
1 | # RNN-T LSTM shapes
2 | 
3 | --alg=VANILLA_LSTM
4 | --activation=UNDEF
5 | --direction=left2right
6 | 
7 | --batch=shapes_rnn_t
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/option_set_small:
--------------------------------------------------------------------------------
1 | # option set to run all reasonable small shapes
2 | 
3 | --skip-nonlinear=false
4 | --tag=tnc:any:tnc,ntc:any:ntc
5 | --batch=shapes_small
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_cpu:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Forward
 4 | 
 5 | --prop=FWD_I
 6 | 
 7 | --cfg=f32
 8 | --batch=option_set_perf_inference_lb
 9 | --batch=option_set_perf_inference_sb
10 | --batch=option_set_perf_training
11 | 
12 | --cfg=u8u8u8u8
13 | --trivial-strides=true
14 | --scaling=per_oc
15 | --batch=option_set_perf_inference_lb
16 | --batch=option_set_perf_inference_sb
17 | --batch=option_set_perf_training
18 | 
19 | # Backward
20 | 
21 | --prop=BWD_DW
22 | --cfg=f32
23 | --trivial-strides=
24 | --scaling=
25 | 
26 | --batch=option_set_perf_inference_lb
27 | --batch=option_set_perf_inference_sb
28 | --batch=option_set_perf_training
29 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_inference_lb:
--------------------------------------------------------------------------------
 1 | # inference_lb -- inference with large batch size
 2 | --reset
 3 | 
 4 | --mb=640
 5 | --alg=VANILLA_LSTM
 6 | --activation=TANH
 7 | --batch=option_set_gnmt_decoder
 8 | 
 9 | --mb=64
10 | --alg=VANILLA_LSTM
11 | --activation=TANH
12 | --batch=option_set_gnmt_encoder
13 | 
14 | --mb=64
15 | --alg=VANILLA_RNN,LBR_GRU
16 | --direction=left2right
17 | --batch=shapes_deepspeech_2


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_inference_sb:
--------------------------------------------------------------------------------
 1 | # inference_sb -- inference with small batch size
 2 | --reset
 3 | 
 4 | --mb=10
 5 | --alg=VANILLA_LSTM
 6 | --activation=TANH
 7 | --batch=option_set_gnmt_decoder
 8 | 
 9 | --mb=1
10 | --alg=VANILLA_LSTM
11 | --activation=TANH
12 | --batch=option_set_gnmt_encoder
13 | 
14 | --mb=1
15 | --alg=VANILLA_RNN,LBR_GRU
16 | --direction=left2right
17 | --batch=shapes_deepspeech_2
18 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_knx:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --prop=FWD_I,BWD_DW
4 | 
5 | --cfg=f32
6 | --batch=option_set_perf_inference_lb
7 | --batch=option_set_perf_inference_sb
8 | --batch=option_set_perf_training
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_training:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --mb=128
 4 | --alg=VANILLA_LSTM
 5 | --activation=TANH
 6 | --batch=option_set_gnmt_decoder
 7 | --batch=option_set_gnmt_encoder
 8 | 
 9 | --alg=VANILLA_RNN,LBR_GRU
10 | --direction=left2right
11 | --batch=shapes_deepspeech_2
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_xe:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # Forward
 4 | 
 5 | --prop=FWD_I
 6 | --cfg=f16,f32
 7 | 
 8 | --batch=option_set_perf_inference_lb
 9 | --batch=option_set_perf_inference_sb
10 | --batch=option_set_perf_training
11 | 
12 | # Backward
13 | 
14 | --prop=BWD_DW
15 | --cfg=f32
16 | 
17 | --batch=option_set_perf_inference_lb
18 | --batch=option_set_perf_inference_sb
19 | --batch=option_set_perf_training
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_xe_hp:
--------------------------------------------------------------------------------
 1 | --batch=perf_rnn_xe_lp
 2 | 
 3 | --reset
 4 | 
 5 | # Forward, bf16
 6 | 
 7 | --prop=FWD_I
 8 | --cfg=bf16f32
 9 | 
10 | --batch=option_set_perf_inference_lb
11 | --batch=option_set_perf_inference_sb
12 | --batch=option_set_perf_training
13 | 
14 | # Backward
15 | 
16 | --prop=BWD_DW
17 | --cfg=bf16f32
18 | 
19 | --batch=option_set_perf_inference_lb
20 | --batch=option_set_perf_inference_sb
21 | --batch=option_set_perf_training
22 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/perf_rnn_xe_lp:
--------------------------------------------------------------------------------
 1 | --batch=perf_rnn_xe
 2 | 
 3 | --reset
 4 | 
 5 | # Forward, int8
 6 | --prop=FWD_I
 7 | --cfg=u8u8u8u8
 8 | --scaling=per_oc
 9 | --trivial-strides=true
10 | 
11 | --batch=option_set_perf_inference_lb
12 | --batch=option_set_perf_inference_sb
13 | --batch=option_set_perf_training
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_deepspeech_2:
--------------------------------------------------------------------------------
1 | l1t50sic1760n"deepspeech2:0"
2 | l1t100sic1760n"deepspeech2:1"
3 | l1t200sic1760n"deepspeech2:2"


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_inference:
--------------------------------------------------------------------------------
 1 | l1t30mb1sic512n"GNMT_enc-inference"
 2 | l7t30mb1sic1024n"GNMT_enc-inference"
 3 | l8t1mb1sic2048slc1024dhc1024n"GNMT_dec-inference"
 4 | l1t1mb1sic2048slc1024dhc1024n"GNMT_dec-inference"
 5 | l1t1mb640sic2048slc1024dhc1024n"GNMT_dec-inference"
 6 | l1t50mb1sic1760n"deepspeech2-inference"
 7 | l1t100mb1sic1760n"deepspeech2-inference"
 8 | l1t200mb1sic1760n"deepspeech2-inference"
 9 | l1t50mb1sic500n"pytorch_testcase-inference"
10 | l1t629mb1sic128n"paddlepaddle_testcase-inference"
11 | l1t10mb1sic128slc512dhc128n"exp-0"
12 | l10t1mb1sic512slc128dhc128n"exp-1"


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_large:
--------------------------------------------------------------------------------
1 | # large shapes
2 | 
3 | l1t1mb63_sic64_n"uniform"
4 | l1t1mb34_sic65_n"uniform:tail"
5 | l1t1mb19_sic64_slc128_n"non-uniform:slc_neq_sic"
6 | l1t1mb12_sic65_dhc130_n"non-uniform:slc_neq_dhc_tail"
7 | l1t1mb6_sic64_slc128_dhc256_n"non-uniform:slc_neq_sic_neq_dhc"
8 | l1t1mb4_sic65_slc130_dhc260_n"non-uniform:slc_neq_sic_neq_dhc_tail"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_large_gru:
--------------------------------------------------------------------------------
1 | l1t1mb65_sic64_n"uniform"
2 | l1t1mb17_sic128_n"uniform"
3 | l1t1mb100_sic65_n"uniform:tail"
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_lstmp_large:
--------------------------------------------------------------------------------
1 | # large shapes for lstm w/ projection when dhc != dic
2 | 
3 | l1t1mb31_sic64_dic128_n"non-uniform:dhc_neq_dic"
4 | l1t1mb32_sic65_dic130_n"non-uniform:dhc_neq_dic_tail"
5 | l1t1mb12_sic64_slc128_dic128_n"non-uniform:slc_neq_sic_and_dhc_neq_dic"
6 | l1t1mb10_sic65_dhc130_dic260_n"non-uniform:slc_neq_dhc_neq_dic_tail"
7 | l1t1mb3_sic64_slc128_dhc256_dic320_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic"
8 | l1t1mb4_sic65_slc130_dhc260_dic325_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic_tail"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_lstmp_small:
--------------------------------------------------------------------------------
1 | # small shapes for lstm w/ projection when dhc != dic
2 | 
3 | l1t1mb9_sic16_dic32_n"non-uniform:dhc_neq_dic"
4 | l1t1mb7_sic17_dic34_n"non-uniform:dhc_neq_dic_tail"
5 | l1t1mb3_sic16_slc32_dic32_n"non-uniform:slc_neq_sic_and_dhc_neq_dic"
6 | l1t1mb4_sic17_dhc34_dic68_n"non-uniform:slc_neq_dhc_neq_dic_tail"
7 | l1t1mb2_sic16_slc32_dhc64_dic80_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic"
8 | l1t1mb3_sic17_slc34_dhc68_dic85_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic_tail"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_rnn_t:
--------------------------------------------------------------------------------
1 | # RNN-T shapes with a base number of time steps (t) = 256; some entries below use t = 128 or t = 1
2 | l1t256mb896sic1024slc240dhc1024dic1024n"RNN-T:Encoder_1"
3 | l1t256mb896sic1024slc1024dhc1024dic1024n"RNN-T:Encoder_2"
4 | l1t128mb896sic1024slc2048dhc1024dic1024n"RNN-T:Encoder_3"
5 | l1t128mb896sic1024slc1024dhc1024dic1024n"RNN-T:Encoder_4*2"
6 | l1t1mb896sic320slc320dhc320dic320n"RNN-T:Prediction*768"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_small:
--------------------------------------------------------------------------------
1 | # small shapes
2 | 
3 | l8t3mb12_sic16_n"uniform"
4 | l4t3mb20_sic36_n"uniform:unroll_tail"
5 | l1t2mb6_sic16_slc32_n"non-uniform:slc_neq_sic"
6 | l1t1mb7_sic17_dhc34_n"non-uniform:slc_neq_dhc_tail"
7 | l1t1mb3_sic16_slc32_dhc64_n"non-uniform:slc_neq_sic_neq_dhc"
8 | l1t1mb4_sic17_slc34_dhc68_n"non-uniform:slc_neq_sic_neq_dhc_tail"
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_small_gru:
--------------------------------------------------------------------------------
1 | l14t10mb12_sic16_n"uniform"
2 | l10t14mb10_sic17_n"uniform:tail"
3 | l5t7mb7_sic32_n"uniform:unroll"
4 | l1t6mb8_sic36_n"uniform:unroll_tail"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/shapes_training:
--------------------------------------------------------------------------------
 1 | l1t1mb128sic512n"GNMT_enc-training"
 2 | l2t2mb128sic1024n"GNMT_enc-training"
 3 | l8t1mb128sic2048slc1024dhc1024n"GNMT_dec-training"
 4 | l1t1mb128sic2048slc1024dhc1024n"GNMT_dec-training"
 5 | l1t50mb32sic1760n"deepspeech2-training"
 6 | l1t100mb32sic1760n"deepspeech2-training"
 7 | l1t200mb32sic1760n"deepspeech2-training"
 8 | l1t50mb64sic500n"pytorch_testcase-training"
 9 | l1t629mb128sic128n"paddlepaddle_testcase-training"
10 | l1t952mb128sic128n"paddlepaddle_testcase-training"
11 | l1t10mb32sic128slc512dhc128n"exp-0"
12 | l10t1mb32sic512slc128dhc128n"exp-1"


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_augru_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --batch=test_augru_ci
 4 | 
 5 | --batch=test_augru_bfloat16
 6 | 
 7 | --batch=test_augru_bf32_bfloat16
 8 | 
 9 | --batch=test_augru_float16
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_augru_bf32_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_augru_bf32
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_augru_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_augru_bfloat16
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_augru_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --alg=LBR_AUGRU,VANILLA_AUGRU
 4 | --activation=UNDEF
 5 | --direction=left2right
 6 | --skip-nonlinear=false
 7 | 
 8 | --trivial-strides=true,false
 9 | --prop=FWD_I,BWD_DW
10 | --cfg=f32,bf16f32,bf16,f16
11 | --batch=shapes_small
12 | 
13 | --trivial-strides=true,false
14 | --prop=FWD_I
15 | --cfg=f32
16 | --attr-fpmath=bf16
17 | --batch=shapes_small
18 | 
19 | 
20 | # flags
21 | --trivial-strides=true,false
22 | --prop=BWD_DW
23 | --cfg=f32
24 | --flags=O
25 | --batch=shapes_small
26 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_augru_float16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_augru_float16
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_gru_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --batch=harness_gru_f32
 4 | 
 5 | --batch=test_gru_int8
 6 | 
 7 | --batch=test_gru_bfloat16
 8 | 
 9 | --batch=test_gru_bf32_bfloat16
10 | 
11 | --batch=test_gru_float16
12 | 
13 | --batch=harness_gru_regression
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_gru_bf32_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_gru_bf32
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_gru_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_gru_bfloat16


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_gru_float16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_gru_float16
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_gru_int8:
--------------------------------------------------------------------------------
1 | # int8
2 | --reset
3 | 
4 | --batch=harness_gru_int8
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_lstm_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --batch=test_lstm_f32
 4 | 
 5 | --batch=test_lstm_int8
 6 | 
 7 | --batch=test_lstm_bfloat16
 8 | 
 9 | --batch=test_lstm_bf32_bfloat16
10 | 
11 | --batch=test_lstm_float16
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_lstm_bf32_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_lstm_bf32
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_lstm_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_lstm_bfloat16
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_lstm_bfloat16_ymm:
--------------------------------------------------------------------------------
1 | # global benchdnn knob, will not be reset again
2 | --cpu-isa-hints=prefer_ymm
3 | 
4 | --reset
5 | --batch=test_lstm_bfloat16
6 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_lstm_f32:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_lstm_f32
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_lstm_float16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_lstm_float16
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_lstm_int8:
--------------------------------------------------------------------------------
1 | # int8
2 | --reset
3 | 
4 | --batch=harness_lstm_int8
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_rnn_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --batch=harness_rnn_f32
 4 | 
 5 | --batch=test_rnn_bfloat16
 6 | 
 7 | --batch=test_rnn_bf32_bfloat16
 8 | 
 9 | --batch=test_rnn_float16
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_rnn_bf32_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_rnn_bf32
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_rnn_bfloat16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_rnn_bfloat16


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/rnn/test_rnn_float16:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --batch=harness_rnn_float16
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/self/test_self_ci:
--------------------------------------------------------------------------------
1 | # Use empty input file to align with other drivers
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/self/test_self_f32:
--------------------------------------------------------------------------------
1 | # Use empty input file to align with other drivers
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/self/test_self_smoke:
--------------------------------------------------------------------------------
1 | # Use empty input file to align with other drivers
2 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/option_set_all:
--------------------------------------------------------------------------------
 1 | --group=2,4
 2 | 
 3 | --axis=1     1x12x56x56 1x24x56x56 1x36x56x56 1x68x56x56
 4 | --axis=1,2   1x68x56x56 1x136x56x56 1x272x56x56
 5 | --axis=1,3,4 1x272x2x56x56
 6 | 
 7 | --group=3
 8 | 
 9 | --axis=1     1x3x224x224 1x24x56x56 1x48x28x28 1x96x14x14 1x192x7x7
10 | --axis=1,2   1x36x225x225 1x72x57x57 1x144x27x27
11 | --axis=1,3,4 1x282x2x57x57
12 | 
13 | --group=8
14 | 
15 | --axis=1,2   1x24x56x56 1x32x56x56 1x72x56x56
16 | --axis=1,3,4 1x272x2x56x56
17 | 
18 | --group=16
19 | 
20 | --axis=1,2   1x16x64x64 1x48x64x64 1x128x64x64
21 | --axis=1,3,4 1x272x2x64x64


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/option_set_min:
--------------------------------------------------------------------------------
1 | --axis=1,2
2 | 
3 | --group=2,4 1x12x56x56 1x24x56x56 1x272x56x56
4 | 
5 | --group=3   1x36x225x225 1x72x57x57 1x144x27x27
6 | 
7 | --group=8   1x24x56x56 1x32x56x56 1x72x56x56
8 | 
9 | --group=16  1x16x64x64 1x48x64x64 1x128x64x64


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/perf_shuffle_cpu:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --dir=FWD_D
4 | --dt=f32,bf16
5 | --tag=abx,axb,aBx4b,aBx8b,aBx16b
6 | --batch=option_set_perf


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/test_shuffle_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dir=FWD_D
 4 | --dt=f32,s32,s8,u8
 5 | --tag=abx,axb,aBx4b,aBx8b,aBx16b
 6 | --batch=option_set_all
 7 | 
 8 | --dir=BWD_D
 9 | --dt=f32
10 | --batch=option_set_min
11 | 
12 | # bf16
13 | --batch=test_shuffle_bfloat16
14 | 
15 | # f16
16 | --batch=test_shuffle_float16
17 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/test_shuffle_bfloat16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dt=bf16
 4 | 
 5 | --dir=FWD_D
 6 | --tag=abx,axb,aBx16b
 7 | --batch=option_set_all
 8 | 
 9 | --dir=BWD_D
10 | --batch=option_set_min
11 | 
12 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/test_shuffle_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --group=4
 4 | 
 5 | --dir=FWD_D,BWD_D
 6 | --dt=f32,bf16,f16,s32,s8,u8
 7 | --tag=abx,axb
 8 | --axis=1,2
 9 | 2x12x32x17 3x16x36x9
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/test_shuffle_float16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --dt=f16
 4 | 
 5 | --dir=FWD_D
 6 | --tag=abx,axb
 7 | --batch=option_set_all
 8 | 
 9 | --dir=BWD_D
10 | --batch=option_set_min
11 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/test_shuffle_gpu:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --group=4
 4 | --dir=FWD_D,BWD_D
 5 | 
 6 | --dt=f32,f16,bf16,s8,u8
 7 | 
 8 | --tag=abx,aBx16b
 9 | --axis=1,2 1x68x56x56 1x272x56x56
10 | --axis=1,3,4 1x272x2x56x56
11 | 
12 | --tag=ABx16a16b
13 | --axis=1,2 32x64x56x56
14 | --axis=1,3,4 32x64x2x56x56
15 | 
16 | # blocked with tail
17 | --dt=f32
18 | --tag=aBx16b --axis=1 1x12x56x56 1x36x56x56
19 | 
20 | # double block
21 | --allow-enum-tags-only=0
22 | --tag=BA8b4a2b 
23 | --group=6 
24 | --axis=0,1
25 | 48x48
26 | 
27 | # Test CI in Nightly
28 | --reset
29 | --batch=test_shuffle_ci
30 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/shuffle/test_shuffle_smoke:
--------------------------------------------------------------------------------
1 | --reset
2 | 
3 | --dir=FWD_D,BWD_D
4 | --dt=f32,bf16,f16,s8,u8
5 | --tag=axb
6 | --group=4
7 | --axis=1
8 | 2x12x32x17 3x16x36x9
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/harness_softmax_regression:
--------------------------------------------------------------------------------
1 | # "vectorized" and "small" reusable kernels (stride 1, aligned) with scale
2 | --reset
3 | --dir=FWD_I
4 | --sdt=u8
5 | --ddt=s8
6 | --attr-scales=dst:common:0.125,src:common:2
7 | --axis=3 1x4x192x256 1x4x16x32
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/set_0d:
--------------------------------------------------------------------------------
1 | # 2d dataset
2 | 
3 | --batch=shapes_0d
4 | --batch=shapes_nlp
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/shapes_0d:
--------------------------------------------------------------------------------
1 | # 2d dataset
2 | 
3 | 96x1000
4 | 256x10
5 | 32x100
6 | 2x113
7 | 128x365
8 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/shapes_2d:
--------------------------------------------------------------------------------
 1 | # 4d dataset
 2 | 
 3 | # spatial is 1
 4 | 96x1000x1x1
 5 | 256x10x1x1
 6 | 
 7 | # regular 4d
 8 | 2x19x128x256
 9 | 2x16x128x128
10 | 1x8x1024x16
11 | 1x2x64x64
12 | 448x16x28x28
13 | 64x1011x1x1
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/shapes_3d:
--------------------------------------------------------------------------------
 1 | # 5d dataset
 2 | 
 3 | # spatial is 1
 4 | 96x1024x1x1x1
 5 | 256x10x1x1x1
 6 | 
 7 | # regular 5d
 8 | 3x17x9x37x19
 9 | 2x16x128x2x4
10 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/shapes_ci:
--------------------------------------------------------------------------------
1 | # basic shapes
2 | 
3 | 16x16_n"softmax_ci_0d:0"
4 | 255x10_n"softmax_ci_0d:1"
5 | 2x19x17x13_n"softmax_ci_2d:0" # Used in smoke validation, don't change the name
6 | 1x16x2x12_n"softmax_ci_2d:1" # Used in smoke validation, don't change the name
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/shapes_large:
--------------------------------------------------------------------------------
1 | 8192x64
2 | 16384x64
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/shapes_large_axis:
--------------------------------------------------------------------------------
1 | 8x3x8320
2 | 4x3x5600
3 | 1x1x4097
4 | 2x3x9999
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/shapes_nlp:
--------------------------------------------------------------------------------
 1 | # nlp
 2 | 
 3 | 240x20
 4 | 2560x20
 5 | 2688x21
 6 | 3712x29
 7 | 4480x35
 8 | 4736x37
 9 | 4864x38
10 | 4992x39
11 | 5120x40
12 | 5248x41
13 | 5376x42
14 | 5504x43
15 | 5632x44
16 | 6144x48
17 | 6272x49
18 | 6400x50
19 | 6784x53
20 | 7296x57
21 | 7424x58
22 | 7808x61
23 | 8192x64
24 | 8448x66
25 | 8576x67
26 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/test_softmax_acl:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | # only FWD_I is supported with ACL
 4 | --dir=FWD_I
 5 | 
 6 | # do not run ref
 7 | --skip-impl=ref
 8 | 
 9 | --alg=SOFTMAX,LOGSOFTMAX
10 | 
11 | --sdt=f32
12 | --ddt=f32
13 | --dtag=any
14 | 
15 | --axis=0,1
16 | --batch=shapes_ci
17 | --batch=shapes_nlp
18 | 
19 | --stag=abx
20 | --axis=0,1
21 | --batch=set_0d
22 | --axis=1,3
23 | --batch=shapes_2d
24 | --axis=3,4
25 | --batch=shapes_3d
26 | 
27 | --stag=axb
28 | --axis=0,1
29 | --batch=shapes_2d
30 | --batch=shapes_3d
31 | 
32 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/softmax/test_softmax_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --match=.*softmax_ci_2d.* # Use 2d problems only from shapes_ci
 4 | --inplace=false
 5 | --stag=axb
 6 | --dtag=any
 7 | --alg=SOFTMAX,LOGSOFTMAX
 8 | --axis=1
 9 | 
10 | --dir=FWD_D,BWD_D
11 | --sdt=f32,bf16,f16
12 | --ddt=f32,bf16,f16
13 | --batch=shapes_ci
14 | 
15 | --dir=FWD_I
16 | --sdt=s8,u8
17 | --ddt=s8,u8
18 | --attr-scales=,src:common:64+dst:common:0.5
19 | --batch=shapes_ci
20 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/sum/option_set_fwks_ext_gpu:
--------------------------------------------------------------------------------
1 | --reset --allow-enum-tags-only=0 --sdt=f32:f32 --ddt=f32 --stag=abcd:abcd --dtag=abcd 1113x8x30x64_n"1bca9bd0e78574fe0ab41bbc68b795f9"
2 | --reset --allow-enum-tags-only=0 --sdt=f32:f32 --ddt=f32 --stag=abcd:abcd --dtag=abcd 64x8x50x64_n"8def238b6b453fb393856464ba8a0d4d"
3 | --reset --allow-enum-tags-only=0 --sdt=f32:f32:f32 --ddt=f32 --stag=abc:abc:abc --dtag=abc 64x50x512_n"7d5660f01c9be41fba9d6135abf33d2d"
4 | --reset --allow-enum-tags-only=0 --sdt=f32:f32:f32:f32 --ddt=f32 --stag=ab:ab:ab:ab --dtag=ab 64x512_n"0bab0aacc5f8028faa61019716a8ef50"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/sum/test_sum_all:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | --scales=0.25,1,4
 5 | --ddt=f32,s32,s8,u8
 6 | --sdt=f32:f32,f32:s32,f32:s8,f32:u8,s32:s8,s32:u8,s8:u8
 7 | --dtag=undef,abx,axb,aBx8b,aBx16b
 8 | --stag=abx:abx       3x3x16x4
 9 | --stag=axb:axb       4x4x2x16 5x5x1x15
10 | --stag=aBx8b:aBx8b   2x8x3x10 1x9x4x7
11 | --stag=aBx16b:aBx16b 1x16x5x11 2x15x6x3
12 | 
13 | --ddt=f32,s32
14 | --sdt=f32:s32:s8
15 | --stag=aBx8b:abx:axb,axb:axb:axb
16 | --scales=1.25:3:0.5    16x2x6x4x3
17 | 
18 | # bf16
19 | --batch=test_sum_bfloat16
20 | 
21 | # f16
22 | --batch=test_sum_float16
23 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/sum/test_sum_bfloat16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --scales=0.25,1,4
 4 | --ddt=f32,bf16
 5 | --sdt=bf16:bf16
 6 | --dtag=undef,abx,axb,aBx16b
 7 | --stag=abx:abx       3x3x16x4
 8 | --stag=axb:axb       4x4x2x16 5x5x1x15
 9 | --stag=aBx16b:aBx16b 1x16x5x11 2x15x6x3
10 | 
11 | --sdt=bf16:bf16:bf16
12 | --stag=aBx16b:abx:axb,axb:axb:axb
13 | --scales=2:0.25:0.5   16x2x6x4x3
14 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/sum/test_sum_ci:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=true,false
 4 | --ddt=f32,bf16,f16,s32,s8,u8
 5 | --sdt=f32:f32,f32:bf16,f32:s8,bf16:bf16,f16:f16,s32:s8,u8:u8
 6 | --dtag=undef,any,abx,axb
 7 | --stag=abx:abx,axb:axb
 8 | --scales=0.25:2
 9 | 3x17x5x7 4x16x8x10
10 | 
11 | --ddt=f32,s8
12 | --sdt=f32:u8:s8
13 | --stag=abx:abx:abx,axb:axb:axb
14 | --scales=0.25:2:0.5
15 | 2x17x5x7x3 4x16x8x10x2
16 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/sum/test_sum_float16:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --scales=0.25,1,4
 4 | --ddt=f32,f16
 5 | --sdt=f16:f16
 6 | --dtag=undef,abx,axb
 7 | --stag=abx:abx       3x3x16x4
 8 | --stag=axb:axb       4x4x2x16 5x5x1x15
 9 | 
10 | --sdt=f16:f16:f16
11 | --stag=axb:axb:axb
12 | --scales=2:0.25:0.5   16x2x6x4x3
13 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/sum/test_sum_smoke:
--------------------------------------------------------------------------------
 1 | --reset
 2 | 
 3 | --inplace=false
 4 | --ddt=f32,bf16,f16,s8,u8
 5 | --sdt=f32:f32,bf16:bf16,f16:f16,s8:s8,u8:u8
 6 | --dtag=undef
 7 | --stag=axb
 8 | --scales=0.25
 9 | 3x17x5x7 4x16x8x10
10 | 
11 | --ddt=f32
12 | --sdt=f32:f32:f32
13 | --stag=axb:axb:axb
14 | 2x17x5x7x3 4x16x8x10x2
15 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/option_set_fwks_ext_gpu:
--------------------------------------------------------------------------------
1 | #
2 | --reset --allow-enum-tags-only=0 --dt=f16 --tag=aBcd16b 1x4x1080x1920n"text-image-super-resolution-0001_onnx.inf.fp16.ov.b1*1"
3 | #
4 | --reset --allow-enum-tags-only=0 --dt=f16 --tag=ABcd32a16b 32x4x1080x1920n"text-image-super-resolution-0001_onnx.inf.fp16.ov.b32*1"
5 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/option_set_fwks_key_gpu:
--------------------------------------------------------------------------------
1 | #
2 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde32a16b 32x6x13x14x9n"3dgan.tr.fp32.tf.mb256*8"
3 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde32a16b 256x8x25x25x25n"3dgan.tr.fp32.tf.mb256*8"
4 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde16a16b 256x8x25x25x25n"3dgan.tr.fp32.tf.mb256*8"
5 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde16a16b 256x8x23x23x23n"3dgan.tr.fp32.tf.mb256*8"
6 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde32a16b 256x6x13x14x9n"3dgan.tr.fp32.tf.mb256*4"
7 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/set_dim1_block_3d:
--------------------------------------------------------------------------------
1 | --tag=Abc16a,Abc4a
2 | --batch=shapes_dim1_block_3d


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/set_dim1dim2_block_2d:
--------------------------------------------------------------------------------
1 | --tag=AB48a16b,AB48a32b,BA4b8a8b4a
2 | 
3 | --batch=shapes_dim1dim2_block_2d
4 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/set_dim1dim2_block_3d:
--------------------------------------------------------------------------------
1 | # The following tags are omitted as they should be tested in the 4d dim2 dim3
2 | # blocked cases
3 | # --tag=ABc16a16b,ABc4a4b,ABc16b16a,ABc4b16a4b,ABc2b8a4b,
4 | #       ABc16b16a4b,ABc16b16a2b,ABc4b4a,ABc8a16b2a,ABc8a8b,
5 | #       ABc8a4b,ABc8b16a2b,ABc8b8a
6 | 
7 | --tag=ABc32a32b,ABc16b32a,ABc16b64a,ABc4b32a4b,ABc4b64a4b,ABc8b32a2b,ABc8b64a2b,ABc4a8b8a4b
8 | --batch=shapes_dim1dim2_block_3d
9 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/set_dim2_block_3d:
--------------------------------------------------------------------------------
1 | --tag=aBc16b,aBc32b,aBc4b,aBc8b
2 | --batch=shapes_dim2_block_3d


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/set_dim2dim3_block_4d:
--------------------------------------------------------------------------------
1 | --tag=aBCd16b16c,aBCd16c16b,aBCd2c4b2c,aBCd4b8c2b,aBCd4c16b4c,aBCd2c8b4c,aBCd16c16b4c,aBCd16c16b2c,aBCd4c4b,aBCd4b4c,aBCd4c8b2c,aBCd8b16c2b,aBCd8b8c,aBCd8b4c,aBCd8c16b2c,aBCd8c8b,aBCd2b4c2b
2 | --batch=shapes_dim2dim3_block_4d
3 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/shapes_dim1_block_3d:
--------------------------------------------------------------------------------
 1 | 1x1x1
 2 | 1x1x33
 3 | 1x33x1
 4 | 1x33x33
 5 | 3x1x1
 6 | 3x1x33
 7 | 3x33x1
 8 | 3x33x33
 9 | 7x1x1
10 | 7x1x33
11 | 7x33x1
12 | 7x33x33
13 | 9x1x1
14 | 9x1x33
15 | 9x33x1
16 | 9x33x33
17 | 19x1x1
18 | 19x1x33
19 | 19x33x1
20 | 19x33x33
21 | 43x1x1
22 | 43x1x33
23 | 43x33x1
24 | 43x33x33
25 | 71x1x1
26 | 71x1x33
27 | 71x33x1
28 | 71x33x33
29 | 128x1x1
30 | 128x1x33
31 | 128x33x1
32 | 128x33x33
33 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/shapes_dim2_block_3d:
--------------------------------------------------------------------------------
 1 | 1x1x1
 2 | 1x1x33
 3 | 1x3x1
 4 | 1x3x33
 5 | 1x7x1
 6 | 1x7x33
 7 | 1x9x1
 8 | 1x9x33
 9 | 1x19x1
10 | 1x19x33
11 | 1x43x1
12 | 1x43x33
13 | 1x71x1
14 | 1x71x33
15 | 1x128x1
16 | 1x128x33
17 | 33x1x1
18 | 33x1x33
19 | 33x3x1
20 | 33x3x33
21 | 33x7x1
22 | 33x7x33
23 | 33x9x1
24 | 33x9x33
25 | 33x19x1
26 | 33x19x33
27 | 33x43x1
28 | 33x43x33
29 | 33x71x1
30 | 33x71x33
31 | 33x128x1
32 | 33x128x33
33 | 


--------------------------------------------------------------------------------
/tests/benchdnn/inputs/zeropad/test_zeropad_ci:
--------------------------------------------------------------------------------
1 | --dt=s8,f16,f32,f64
2 | --batch=set_dim1_block_3d
3 | --batch=set_dim2_block_3d
4 | --batch=set_dim1dim2_block_3d
5 | --batch=set_dim2dim3_block_4d
6 | 


--------------------------------------------------------------------------------
/third_party/.clang-tidy:
--------------------------------------------------------------------------------
1 | Checks: '-*,misc-definitions-in-headers'
2 | CheckOptions:
3 |   - { key: HeaderFileExtensions,          value: "x" }
4 | 


--------------------------------------------------------------------------------
/third_party/ittnotify/README.md:
--------------------------------------------------------------------------------
1 | This code is from [Intel(R) Instrumentation and Tracing Technology (ITT) and
2 | Just-In-Time (JIT) API](https://github.com/intel/ittapi)
3 | 
4 | tag: 3.22.5
5 | 


--------------------------------------------------------------------------------
/third_party/spdlog/README.md:
--------------------------------------------------------------------------------
1 | This code is from [spdlog](https://github.com/gabime/spdlog).
2 | 
3 | tag: 1.15.1
4 | 


--------------------------------------------------------------------------------
/third_party/spdlog/details/windows_include.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #ifndef NOMINMAX
 4 |     #define NOMINMAX  // prevent windows redefining min/max
 5 | #endif
 6 | 
 7 | #ifndef WIN32_LEAN_AND_MEAN
 8 |     #define WIN32_LEAN_AND_MEAN
 9 | #endif
10 | 
11 | #include <windows.h>
12 | 


--------------------------------------------------------------------------------
/third_party/spdlog/fmt/bundled/core.h:
--------------------------------------------------------------------------------
1 | // This file is only provided for compatibility and may be removed in future
2 | // versions. Use fmt/base.h if you don't need fmt::format and fmt/format.h
3 | // otherwise.
4 | 
5 | #include "format.h"
6 | 


--------------------------------------------------------------------------------
/third_party/spdlog/formatter.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #include <spdlog/details/log_msg.h>
 7 | #include <spdlog/fmt/fmt.h>
 8 | 
 9 | namespace spdlog {
10 | 
11 | class formatter {  // interface type: format() and clone() below are pure virtual
12 | public:
13 |     virtual ~formatter() = default;
14 |     virtual void format(const details::log_msg &msg, memory_buf_t &dest) = 0;  // msg is a const reference; dest is a mutable reference (presumably the output buffer)
15 |     virtual std::unique_ptr<formatter> clone() const = 0;  // const member returning a std::unique_ptr<formatter>
16 | };
17 | }  // namespace spdlog
18 | 


--------------------------------------------------------------------------------
/third_party/spdlog/version.h:
--------------------------------------------------------------------------------
 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors.
 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT)
 3 | 
 4 | #pragma once
 5 | 
 6 | #define SPDLOG_VER_MAJOR 1
 7 | #define SPDLOG_VER_MINOR 15
 8 | #define SPDLOG_VER_PATCH 1
 9 | 
10 | #define SPDLOG_TO_VERSION(major, minor, patch) (major * 10000 + minor * 100 + patch)  // decimal packing; arguments are not parenthesized, so pass plain values
11 | #define SPDLOG_VERSION SPDLOG_TO_VERSION(SPDLOG_VER_MAJOR, SPDLOG_VER_MINOR, SPDLOG_VER_PATCH)  // 1 * 10000 + 15 * 100 + 1 = 11501 for the values defined above
12 | 


--------------------------------------------------------------------------------
/third_party/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_version.h:
--------------------------------------------------------------------------------
1 | static const int majorVersion = 1;
2 | static const int minorVersion = 1;
3 | static const int patchVersion = 2;
4 | static int getVersion() { return (majorVersion << 16) + (minorVersion << 8) + patchVersion; }  // packs the three parts into one int; 0x010102 for the values above
5 | static const char *getVersionString() { return "1.1.2"; }  // hard-coded literal, not derived from the three constants above; keep them in sync
6 | 


--------------------------------------------------------------------------------