├── .clang-format ├── .clang-ignorelist ├── .clang-tidy ├── .gitattributes ├── .github ├── CODEOWNERS ├── ISSUE_TEMPLATE │ ├── bug_report.md │ ├── documentation.md │ ├── feature_request.md │ └── question.md ├── automation │ ├── aarch64 │ │ ├── build.sh │ │ ├── build_acl.sh │ │ ├── ci.json │ │ ├── common.sh │ │ ├── skipped-tests.sh │ │ └── test.sh │ ├── clang-format.sh │ ├── commit-msg-check.py │ ├── performance │ │ ├── bench_nightly_performance.sh │ │ ├── bench_pr_performance.sh │ │ ├── benchdnn_comparison.py │ │ └── inputs │ │ │ ├── conv │ │ │ ├── conv_nightly │ │ │ ├── eltwise │ │ │ ├── eltwise_nightly │ │ │ ├── matmul │ │ │ ├── matmul_nightly │ │ │ ├── reorder │ │ │ └── reorder_nightly │ └── x64 │ │ └── build_linters.sh ├── azure │ ├── build.bat │ ├── build.sh │ ├── ci-x64.yml │ ├── env │ │ └── clang.sh │ ├── test.bat │ └── test.sh ├── codeql-config.yml ├── dependabot.yml ├── labels.yml ├── pull_request_template.md └── workflows │ ├── aarch64-acl.yml │ ├── ci-aarch64.yml │ ├── clang-tidy.yml │ ├── codeql.yml │ ├── labeler.yml │ ├── nightly-aarch64.yml │ ├── openssf-scorecard.yml │ ├── performance-aarch64.yml │ └── pr-linter.yml ├── .gitignore ├── CITATION.cff ├── CMakeLists.txt ├── CODE_OF_CONDUCT.md ├── CODING_STANDARDS.md ├── CONTRIBUTING.md ├── LICENSE ├── MAINTAINERS.md ├── README.binary.in ├── README.md ├── SECURITY.md ├── THIRD-PARTY-PROGRAMS ├── cmake ├── ACL.cmake ├── Doxygen.cmake ├── Doxyrest.cmake ├── FindACL.cmake ├── FindBLAS.cmake ├── FindDoxyrest.cmake ├── FindHIP.cmake ├── FindMIOpen.cmake ├── FindOpenCL.cmake ├── FindPI_CUDA.cmake ├── FindSphinx.cmake ├── FindcuBLAS.cmake ├── FindcuDNN.cmake ├── FindcublasLt.cmake ├── FindrocBLAS.cmake ├── OpenCL.cmake ├── OpenMP.cmake ├── SDL.cmake ├── SYCL.cmake ├── Sphinx.cmake ├── TBB.cmake ├── Threading.cmake ├── Threadpool.cmake ├── blas.cmake ├── build_types.cmake ├── config.cmake.in ├── configuring_primitive_list.cmake ├── coverage.cmake ├── dnnl_compat.cmake ├── doc.cmake ├── dpcpp_driver_check.cmake 
├── gen_gpu_kernel.cmake ├── gen_gpu_kernel_list.cmake ├── host_compiler.cmake ├── host_compiler_id.cmake ├── host_compiler_id.cpp ├── lnx │ └── TBBConfig.cmake ├── mac │ └── TBBConfig.cmake ├── options.cmake ├── platform.cmake ├── run_with_env.bat.in ├── template.vcxproj.user ├── testing.cmake ├── utils.cmake ├── version.cmake ├── version.rc.in └── win │ └── TBBConfig.cmake ├── doc ├── Doxyfile.in ├── advanced │ ├── design │ │ ├── mem_fmt_blk.png │ │ ├── mem_fmt_img1.png │ │ ├── mem_fmt_img2.png │ │ ├── mem_fmt_padded_blk.png │ │ └── strides.png │ ├── dpcpp_interoperability.md │ ├── experimental.md │ ├── int8_computations.md │ ├── opencl_interoperability.md │ ├── persistent_cache.md │ ├── primitive_cache.md │ ├── sparsity.md │ ├── threadpool.md │ ├── transition-to-dnnl.md │ └── understanding_memory_formats.md ├── build │ ├── build.md │ ├── build_options.md │ ├── link.md │ └── system_requirements.md ├── doxyrest │ ├── doxyrest-config.lua │ └── frame │ │ ├── cfamily │ │ ├── class.rst.in │ │ ├── compound.rst.in │ │ ├── crefdb.py.in │ │ ├── crefdb_enums.py.in │ │ ├── crefdb_items.py.in │ │ ├── crefdb_members.py.in │ │ ├── details_aliases.rst.in │ │ ├── details_construction.rst.in │ │ ├── details_defines.rst.in │ │ ├── details_events.rst.in │ │ ├── details_functions.rst.in │ │ ├── details_properties.rst.in │ │ ├── details_typedefs.rst.in │ │ ├── details_unnamed_enum_values.rst.in │ │ ├── details_variables.rst.in │ │ ├── enum.rst.in │ │ ├── example.rst.in │ │ ├── global.rst.in │ │ ├── index.rst.in │ │ ├── namespace.rst.in │ │ ├── overview_aliases.rst.in │ │ ├── overview_classes.rst.in │ │ ├── overview_common.rst.in │ │ ├── overview_compound.rst.in │ │ ├── overview_construction.rst.in │ │ ├── overview_defines.rst.in │ │ ├── overview_enums.rst.in │ │ ├── overview_events.rst.in │ │ ├── overview_functions.rst.in │ │ ├── overview_members.rst.in │ │ ├── overview_namespaces.rst.in │ │ ├── overview_properties.rst.in │ │ ├── overview_typedefs.rst.in │ │ ├── 
overview_variables.rst.in │ │ ├── scope_class.rst.in │ │ ├── scope_namespace.rst.in │ │ └── utils.lua │ │ └── common │ │ ├── crefdb_external.py.in │ │ ├── crefdb_groups.py.in │ │ ├── crefdb_items.py.in │ │ ├── details.rst.in │ │ ├── doc.lua │ │ ├── footnotes.rst.in │ │ ├── group.rst.in │ │ ├── index_main.rst.in │ │ ├── item.lua │ │ ├── page.rst.in │ │ ├── string.lua │ │ ├── table.lua │ │ └── toc.lua ├── environment.yml ├── examples.md ├── graph │ ├── constant_tensor_cache.md │ ├── fusion_patterns │ │ ├── binary_fusion_patterns.md │ │ ├── convolution_fusion_patterns.md │ │ ├── convtranspose_fusions_patterns.md │ │ ├── gated_mlp.md │ │ ├── gqa.md │ │ ├── images │ │ │ ├── binary_pattern.png │ │ │ ├── compressed_sdpa_pattern.png │ │ │ ├── conv_bwd_pattern.png │ │ │ ├── conv_pattern.png │ │ │ ├── convtranspose_pattern.png │ │ │ ├── epilogue_subgraph_conv.png │ │ │ ├── epilogue_subgraph_general_1.png │ │ │ ├── epilogue_subgraph_general_2.png │ │ │ ├── epilogue_subgraph_matmul.png │ │ │ ├── f2f_conversion.png │ │ │ ├── f2q_conversion_general.png │ │ │ ├── f2q_conversion_quantized_conv.png │ │ │ ├── f2q_conversion_quantized_matmul.png │ │ │ ├── f2q_conversion_softmax.png │ │ │ ├── fp-gated-mlp.png │ │ │ ├── gated-mlp-swish.png │ │ │ ├── gqa.png │ │ │ ├── interpolate_pattern.png │ │ │ ├── matmul_pattern.png │ │ │ ├── norm_pattern.png │ │ │ ├── pool_pattern.png │ │ │ ├── q2f_conversion_quantized_conv_matmul.png │ │ │ ├── q2f_conversion_quantized_convtranspose.png │ │ │ ├── quantized_conv_pattern.png │ │ │ ├── quantized_convtranspose_pattern.png │ │ │ ├── quantized_matmul_pattern.png │ │ │ ├── reduction_pattern.png │ │ │ ├── sdpa-mask-1.png │ │ │ ├── sdpa-mask-2.png │ │ │ ├── sdpa-mask-3.png │ │ │ ├── sdpa-mask-4.png │ │ │ ├── sdpa-reorder.png │ │ │ ├── sdpa.png │ │ │ ├── softmax_pattern.png │ │ │ └── unary_pattern.png │ │ ├── interpolate_fusion_patterns.md │ │ ├── matmul_fusion_patterns.md │ │ ├── norm_fusion_patterns.md │ │ ├── pool_fusion_patterns.md │ │ ├── 
quantized_convolution_fusion_patterns.md │ │ ├── quantized_convtranspose_fusion_patterns.md │ │ ├── quantized_matmul_fusion_patterns.md │ │ ├── reduction_fusion_patterns.md │ │ ├── sdpa.md │ │ ├── sdpa_with_compressed_kv.md │ │ ├── softmax_fusion_patterns.md │ │ └── unary_fusion_patterns.md │ ├── graph_dump.md │ ├── operations │ │ ├── Abs.md │ │ ├── AbsBackward.md │ │ ├── Add.md │ │ ├── AvgPool.md │ │ ├── AvgPoolBackward.md │ │ ├── BatchNormForwardTraining.md │ │ ├── BatchNormInference.md │ │ ├── BatchNormTrainingBackward.md │ │ ├── BiasAdd.md │ │ ├── BiasAddBackward.md │ │ ├── Clamp.md │ │ ├── ClampBackward.md │ │ ├── Concat.md │ │ ├── ConvTranspose.md │ │ ├── ConvTransposeBackwardData.md │ │ ├── ConvTransposeBackwardWeights.md │ │ ├── Convolution.md │ │ ├── ConvolutionBackwardData.md │ │ ├── ConvolutionBackwardWeights.md │ │ ├── Dequantize.md │ │ ├── Divide.md │ │ ├── DynamicDequantize.md │ │ ├── DynamicQuantize.md │ │ ├── Elu.md │ │ ├── EluBackward.md │ │ ├── End.md │ │ ├── Exp.md │ │ ├── GELU.md │ │ ├── GELUBackward.md │ │ ├── GenIndex.md │ │ ├── GreaterEqual.md │ │ ├── GroupNorm.md │ │ ├── HardSigmoid.md │ │ ├── HardSigmoidBackward.md │ │ ├── HardSwish.md │ │ ├── HardSwishBackward.md │ │ ├── Interpolate.md │ │ ├── InterpolateBackward.md │ │ ├── LayerNorm.md │ │ ├── LayerNormBackward.md │ │ ├── LeakyReLU.md │ │ ├── Log.md │ │ ├── LogSoftmax.md │ │ ├── LogSoftmaxBackward.md │ │ ├── MatMul.md │ │ ├── MaxPool.md │ │ ├── MaxPoolBackward.md │ │ ├── Maximum.md │ │ ├── Minimum.md │ │ ├── Mish.md │ │ ├── MishBackward.md │ │ ├── Multiply.md │ │ ├── PReLU.md │ │ ├── PReLUBackward.md │ │ ├── Pow.md │ │ ├── Quantize.md │ │ ├── ReLU.md │ │ ├── ReLUBackward.md │ │ ├── Reciprocal.md │ │ ├── ReduceL1.md │ │ ├── ReduceL2.md │ │ ├── ReduceMax.md │ │ ├── ReduceMean.md │ │ ├── ReduceMin.md │ │ ├── ReduceProd.md │ │ ├── ReduceSum.md │ │ ├── Reorder.md │ │ ├── Round.md │ │ ├── Select.md │ │ ├── Sigmoid.md │ │ ├── SigmoidBackward.md │ │ ├── SoftPlus.md │ │ ├── SoftPlusBackward.md │ │ 
├── Softmax.md │ │ ├── SoftmaxBackward.md │ │ ├── Sqrt.md │ │ ├── SqrtBackward.md │ │ ├── Square.md │ │ ├── SquaredDifference.md │ │ ├── StaticReshape.md │ │ ├── StaticTranspose.md │ │ ├── Subtract.md │ │ ├── Tanh.md │ │ ├── TanhBackward.md │ │ ├── TypeCast.md │ │ └── Wildcard.md │ ├── programming_model │ │ ├── graph_basic_concepts.md │ │ ├── images │ │ │ ├── bf16_programming.jpg │ │ │ ├── img_graph_programming_model.png │ │ │ └── int8_programming.jpg │ │ └── low_precision.md │ └── rst │ │ ├── graph_fusion_patterns.rst │ │ ├── graph_programming_model.rst │ │ ├── graph_supported_operations.rst │ │ └── images │ │ └── other_pattern.png ├── legal_information.md ├── mainpage.md ├── naming_conventions.md ├── performance_considerations │ ├── benchdnn.md │ ├── cpu_isa_hints.md │ ├── dispatcher_control.md │ ├── inspecting_jit.md │ ├── perf_settings.md │ ├── profilers.md │ ├── verbose.md │ ├── verbose_table.md │ └── vtune.md ├── primitives │ ├── batch_normalization.md │ ├── binary.md │ ├── concat.md │ ├── convolution.md │ ├── eltwise.md │ ├── group_normalization.md │ ├── images │ │ └── unrolled_stack_rnn.jpg │ ├── inner_product.md │ ├── layer_normalization.md │ ├── lrn.md │ ├── matmul.md │ ├── pooling.md │ ├── prelu.md │ ├── reduction.md │ ├── reorder.md │ ├── resampling.md │ ├── rnn.md │ ├── shuffle.md │ ├── softmax.md │ └── sum.md ├── programming_model │ ├── api.md │ ├── attributes.md │ ├── attributes_accumulation_mode.md │ ├── attributes_deterministic.md │ ├── attributes_dropout.md │ ├── attributes_fpmath_mode.md │ ├── attributes_post_ops.md │ ├── attributes_quantization.md │ ├── attributes_rounding_mode.md │ ├── attributes_scratchpad.md │ ├── basic_concepts.md │ ├── data_types.md │ ├── images │ │ ├── img_depthwise_fusion.jpg │ │ ├── img_dnnl_object_snapshot.jpg │ │ ├── img_dnnl_programming_flow.jpg │ │ ├── img_overview_flow.jpg │ │ └── img_programming_model.png │ └── inference_and_training_aspects.md ├── rst │ ├── advanced_topics.rst │ ├── build_and_link.rst │ ├── 
dev_guide_examples.rst │ ├── graph_extension.rst │ ├── index.rst │ ├── interop_with_dpcpp_and_opencl.rst │ ├── orphans.rst │ ├── performance_profiling_and_inspection.rst │ ├── programming_model.rst │ ├── supported_primitives.rst │ └── ukernels.rst ├── sphinx │ ├── _static │ │ ├── dnnl.js │ │ ├── doxyrest_code_copy_button.js │ │ ├── favicons.png │ │ └── oneAPI-rgb-rev-100.png │ ├── cleanup.py │ └── conf.py ├── ukernel │ ├── operations │ │ ├── brgemm.md │ │ └── transform.md │ └── programming_model │ │ └── ukernel_basic_concepts.md └── usage_models │ ├── images │ ├── img_bf16_diagram.png │ ├── img_diagram.png │ ├── img_inference_scope.jpg │ ├── img_multiscalar.png │ ├── img_singlescalar.png │ └── img_training_inference_scope.jpg │ ├── inference.md │ ├── inference_int8.md │ ├── training.md │ └── training_bf16.md ├── examples ├── CMakeLists.txt ├── CMakeLists.txt.in ├── bnorm_u8_via_binary_postops.cpp ├── cnn_inference_f32.c ├── cnn_inference_f32.cpp ├── cnn_inference_int8.cpp ├── cnn_training_bf16.cpp ├── cnn_training_f32.cpp ├── cpu_cnn_training_f32.c ├── cpu_matmul_coo.cpp ├── cpu_matmul_csr.cpp ├── cpu_matmul_weights_compression.cpp ├── cpu_rnn_inference_f32.cpp ├── cpu_rnn_inference_int8.cpp ├── cross_engine_reorder.c ├── cross_engine_reorder.cpp ├── example_utils.h ├── example_utils.hpp ├── getting_started.cpp ├── gpu_opencl_interop.cpp ├── graph │ ├── cpu_getting_started.cpp │ ├── cpu_inference_int8.cpp │ ├── cpu_single_op_partition.cpp │ ├── gated_mlp.cpp │ ├── gated_mlp_int4.cpp │ ├── gated_mlp_wei_combined.cpp │ ├── gpu_opencl_getting_started.cpp │ ├── gqa.cpp │ ├── graph_example_utils.hpp │ ├── mqa.cpp │ ├── sdpa.cpp │ ├── sdpa_stacked_qkv.cpp │ ├── sycl_getting_started.cpp │ └── sycl_single_op_partition.cpp ├── matmul_perf.cpp ├── memory_format_propagation.cpp ├── performance_profiling.cpp ├── primitives │ ├── augru.cpp │ ├── batch_normalization.cpp │ ├── binary.cpp │ ├── concat.cpp │ ├── convolution.cpp │ ├── deconvolution.cpp │ ├── eltwise.cpp │ ├── 
group_normalization.cpp │ ├── inner_product.cpp │ ├── layer_normalization.cpp │ ├── lbr_gru.cpp │ ├── lrn.cpp │ ├── lstm.cpp │ ├── matmul.cpp │ ├── pooling.cpp │ ├── prelu.cpp │ ├── reduction.cpp │ ├── reorder.cpp │ ├── resampling.cpp │ ├── shuffle.cpp │ ├── softmax.cpp │ ├── sum.cpp │ └── vanilla_rnn.cpp ├── rnn_training_f32.cpp ├── sycl_interop_buffer.cpp ├── sycl_interop_usm.cpp ├── tutorials │ └── matmul │ │ ├── cpu_matmul_quantization.cpp │ │ ├── cpu_sgemm_and_matmul.cpp │ │ ├── inference_int8_matmul.cpp │ │ └── weights_decompression_matmul.cpp └── ukernels │ └── cpu_brgemm.cpp ├── include ├── dnnl.h ├── dnnl.hpp ├── dnnl_config.h ├── dnnl_debug.h ├── dnnl_ocl.h ├── dnnl_ocl.hpp ├── dnnl_sycl.h ├── dnnl_sycl.hpp ├── dnnl_sycl_types.h ├── dnnl_threadpool.h ├── dnnl_threadpool.hpp ├── dnnl_threadpool_iface.hpp ├── dnnl_types.h ├── dnnl_version.h └── oneapi │ └── dnnl │ ├── dnnl.h │ ├── dnnl.hpp │ ├── dnnl_common.h │ ├── dnnl_common.hpp │ ├── dnnl_common_types.h │ ├── dnnl_config.h.in │ ├── dnnl_debug.h │ ├── dnnl_graph.h │ ├── dnnl_graph.hpp │ ├── dnnl_graph_ocl.h │ ├── dnnl_graph_ocl.hpp │ ├── dnnl_graph_sycl.h │ ├── dnnl_graph_sycl.hpp │ ├── dnnl_graph_types.h │ ├── dnnl_ocl.h │ ├── dnnl_ocl.hpp │ ├── dnnl_ocl_types.h │ ├── dnnl_sycl.h │ ├── dnnl_sycl.hpp │ ├── dnnl_sycl_types.h │ ├── dnnl_threadpool.h │ ├── dnnl_threadpool.hpp │ ├── dnnl_threadpool_iface.hpp │ ├── dnnl_types.h │ ├── dnnl_ukernel.h │ ├── dnnl_ukernel.hpp │ ├── dnnl_ukernel_types.h │ ├── dnnl_version.h.in │ └── dnnl_version_hash.h.in ├── pyproject.toml ├── scripts ├── README.md ├── fix_header_guards.py ├── generate_dnnl_debug.py ├── generate_format_tags.py ├── synthdnn │ ├── README.md │ ├── matmul │ │ ├── primitive.py │ │ └── sampler.py │ └── synthdnn.py └── verbose_converter │ ├── README.md │ ├── src │ ├── __init__.py │ ├── benchdnn_generator.py │ ├── breakdown_generator.py │ ├── dnnl_parser.py │ ├── ir.py │ ├── parse.py │ └── utils.py │ ├── tests │ ├── README.md │ ├── benchdnn_test.py │ ├── 
dataset_ci │ └── dataset_simple │ └── verbose_converter.py ├── src ├── CMakeLists.txt ├── common │ ├── CMakeLists.txt │ ├── batch_normalization.cpp │ ├── batch_normalization_pd.hpp │ ├── bfloat16.cpp │ ├── bfloat16.hpp │ ├── binary.cpp │ ├── binary_pd.hpp │ ├── bit_cast.hpp │ ├── broadcast_strategy.cpp │ ├── broadcast_strategy.hpp │ ├── c_types_map.hpp │ ├── cache_blob.hpp │ ├── cache_blob_id.cpp │ ├── cache_blob_id.hpp │ ├── cache_hit_types.hpp │ ├── cache_utils.hpp │ ├── compiler_workarounds.hpp │ ├── concat.cpp │ ├── concat.hpp │ ├── concat_pd.hpp │ ├── convolution.cpp │ ├── convolution_pd.cpp │ ├── convolution_pd.hpp │ ├── counting_barrier.hpp │ ├── cpp_compat.hpp │ ├── deconvolution.cpp │ ├── deconvolution_pd.hpp │ ├── dnnl_debug.cpp │ ├── dnnl_debug_autogenerated.cpp │ ├── dnnl_thread.hpp │ ├── dnnl_thread_tbb_proxy.hpp │ ├── dnnl_threadpool.cpp │ ├── dnnl_traits.hpp │ ├── eltwise.cpp │ ├── eltwise_pd.hpp │ ├── engine.cpp │ ├── engine.hpp │ ├── engine_id.hpp │ ├── engine_impl.hpp │ ├── experimental.cpp │ ├── experimental.hpp │ ├── float16.hpp │ ├── float4.cpp │ ├── float4.hpp │ ├── float8.cpp │ ├── float8.hpp │ ├── fpmath_mode.cpp │ ├── gemm.cpp │ ├── gemm_pd.hpp │ ├── gemm_types.hpp │ ├── gemm_utils.hpp │ ├── group_normalization.cpp │ ├── group_normalization_pd.hpp │ ├── impl_list_item.hpp │ ├── impl_registration.hpp │ ├── inner_product.cpp │ ├── inner_product_pd.hpp │ ├── int4.hpp │ ├── internal_defs.hpp │ ├── ittnotify.cpp │ ├── ittnotify.hpp │ ├── kernel_cache.cpp │ ├── kernel_cache.hpp │ ├── layer_normalization.cpp │ ├── layer_normalization_pd.hpp │ ├── logging.cpp │ ├── logging.hpp │ ├── lrn.cpp │ ├── lrn_pd.hpp │ ├── math_utils.hpp │ ├── matmul.cpp │ ├── matmul_pd.hpp │ ├── memory.cpp │ ├── memory.hpp │ ├── memory_debug.cpp │ ├── memory_debug.hpp │ ├── memory_desc.cpp │ ├── memory_desc.hpp │ ├── memory_desc_wrapper.cpp │ ├── memory_desc_wrapper.hpp │ ├── memory_map_manager.hpp │ ├── memory_storage.cpp │ ├── memory_storage.hpp │ ├── memory_tracking.cpp 
│ ├── memory_tracking.hpp │ ├── memory_zero_pad.cpp │ ├── nstl.hpp │ ├── opdesc.hpp │ ├── optional.hpp │ ├── pooling.cpp │ ├── pooling_pd.hpp │ ├── prelu.cpp │ ├── prelu_pd.hpp │ ├── primitive.cpp │ ├── primitive.hpp │ ├── primitive_attr.cpp │ ├── primitive_attr.hpp │ ├── primitive_attr_quant.cpp │ ├── primitive_attr_quant.hpp │ ├── primitive_cache.cpp │ ├── primitive_cache.hpp │ ├── primitive_desc.hpp │ ├── primitive_desc_iface.cpp │ ├── primitive_desc_iface.hpp │ ├── primitive_desc_iterator.hpp │ ├── primitive_exec_types.cpp │ ├── primitive_exec_types.hpp │ ├── primitive_hashing.cpp │ ├── primitive_hashing.hpp │ ├── primitive_iface.cpp │ ├── primitive_iface.hpp │ ├── primitive_serialization.cpp │ ├── primitive_serialization.hpp │ ├── profiler.hpp │ ├── query.cpp │ ├── reduction.cpp │ ├── reduction_pd.hpp │ ├── reorder.cpp │ ├── reorder.hpp │ ├── reorder_pd.hpp │ ├── resampling.cpp │ ├── resampling_pd.hpp │ ├── resource.hpp │ ├── rnn.cpp │ ├── rnn.hpp │ ├── rnn_pd.hpp │ ├── rw_mutex.cpp │ ├── rw_mutex.hpp │ ├── scratchpad.cpp │ ├── scratchpad.hpp │ ├── scratchpad_debug.cpp │ ├── scratchpad_debug.hpp │ ├── sdpa_pd.hpp │ ├── sdpa_test_iface.cpp │ ├── sdpa_types.hpp │ ├── sdpa_utils.hpp │ ├── serialization.hpp │ ├── shuffle.cpp │ ├── shuffle_pd.hpp │ ├── softmax.cpp │ ├── softmax_pd.hpp │ ├── stack_checker.hpp │ ├── stream.cpp │ ├── stream.hpp │ ├── stream_impl.hpp │ ├── stream_profiler.cpp │ ├── stream_threadpool.cpp │ ├── sum.cpp │ ├── sum_pd.hpp │ ├── tag_traits.hpp │ ├── thread_local_storage.hpp │ ├── type_helpers.hpp │ ├── utils.cpp │ ├── utils.hpp │ ├── verbose.cpp │ ├── verbose.hpp │ ├── verbose_msg.hpp │ └── z_magic.hpp ├── cpu │ ├── CMakeLists.txt │ ├── README.md │ ├── aarch64 │ │ ├── CMakeLists.txt │ │ ├── acl_batch_normalization.cpp │ │ ├── acl_batch_normalization.hpp │ │ ├── acl_benchmark_scheduler.cpp │ │ ├── acl_benchmark_scheduler.hpp │ │ ├── acl_binary.cpp │ │ ├── acl_binary.hpp │ │ ├── acl_convolution_utils.cpp │ │ ├── acl_convolution_utils.hpp │ │ 
├── acl_deconvolution.cpp │ │ ├── acl_deconvolution.hpp │ │ ├── acl_depthwise_convolution.cpp │ │ ├── acl_depthwise_convolution.hpp │ │ ├── acl_eltwise.cpp │ │ ├── acl_eltwise.hpp │ │ ├── acl_gemm_convolution.cpp │ │ ├── acl_gemm_convolution.hpp │ │ ├── acl_indirect_gemm_convolution.cpp │ │ ├── acl_indirect_gemm_convolution.hpp │ │ ├── acl_inner_product.cpp │ │ ├── acl_inner_product.hpp │ │ ├── acl_layer_normalization.cpp │ │ ├── acl_layer_normalization.hpp │ │ ├── acl_pooling.cpp │ │ ├── acl_pooling.hpp │ │ ├── acl_post_ops.cpp │ │ ├── acl_post_ops.hpp │ │ ├── acl_prelu.cpp │ │ ├── acl_prelu.hpp │ │ ├── acl_reorder.cpp │ │ ├── acl_reorder.hpp │ │ ├── acl_softmax.cpp │ │ ├── acl_softmax.hpp │ │ ├── acl_thread.cpp │ │ ├── acl_thread.hpp │ │ ├── acl_threadpool_scheduler.cpp │ │ ├── acl_threadpool_scheduler.hpp │ │ ├── acl_utils.cpp │ │ ├── acl_utils.hpp │ │ ├── acl_winograd_convolution.cpp │ │ ├── acl_winograd_convolution.hpp │ │ ├── brgemm │ │ │ ├── brgemm.cpp │ │ │ ├── brgemm.hpp │ │ │ ├── brgemm_containers.cpp │ │ │ ├── brgemm_containers.hpp │ │ │ ├── brgemm_types.hpp │ │ │ ├── brgemm_utils.cpp │ │ │ ├── brgemm_utils.hpp │ │ │ ├── jit_brdgmm_kernel.cpp │ │ │ ├── jit_brdgmm_kernel.hpp │ │ │ └── jit_brgemm_kernel.cpp │ │ ├── cpu_barrier.cpp │ │ ├── cpu_barrier.hpp │ │ ├── cpu_isa_traits.cpp │ │ ├── cpu_isa_traits.hpp │ │ ├── cpu_reducer.cpp │ │ ├── cpu_reducer.hpp │ │ ├── injectors │ │ │ ├── injector_utils.cpp │ │ │ ├── injector_utils.hpp │ │ │ ├── jit_uni_binary_injector.cpp │ │ │ ├── jit_uni_binary_injector.hpp │ │ │ ├── jit_uni_eltwise_injector.cpp │ │ │ ├── jit_uni_eltwise_injector.hpp │ │ │ ├── jit_uni_postops_injector.cpp │ │ │ └── jit_uni_postops_injector.hpp │ │ ├── jit_brdgmm_dw_conv.cpp │ │ ├── jit_brdgmm_dw_conv.hpp │ │ ├── jit_brgemm_1x1_conv.cpp │ │ ├── jit_brgemm_1x1_conv.hpp │ │ ├── jit_brgemm_conv.cpp │ │ ├── jit_brgemm_conv.hpp │ │ ├── jit_brgemm_conv_bwd.cpp │ │ ├── jit_brgemm_conv_bwd.hpp │ │ ├── jit_brgemm_conv_comp_pad_kernel.cpp │ │ ├── 
jit_brgemm_conv_comp_pad_kernel.hpp │ │ ├── jit_brgemm_conv_trans_kernel.cpp │ │ ├── jit_brgemm_conv_trans_kernel.hpp │ │ ├── jit_brgemm_conv_utils.cpp │ │ ├── jit_brgemm_conv_utils.hpp │ │ ├── jit_brgemm_post_ops.hpp │ │ ├── jit_brgemm_primitive_conf.hpp │ │ ├── jit_brgemm_transpose_utils.cpp │ │ ├── jit_brgemm_transpose_utils.hpp │ │ ├── jit_generator.hpp │ │ ├── jit_op_imm_check.hpp │ │ ├── jit_primitive_conf.hpp │ │ ├── jit_sve_1x1_conv_kernel.cpp │ │ ├── jit_sve_1x1_conv_kernel.hpp │ │ ├── jit_sve_1x1_convolution.cpp │ │ ├── jit_sve_1x1_convolution.hpp │ │ ├── jit_sve_512_core_x8s8s32x_deconvolution.cpp │ │ ├── jit_sve_512_core_x8s8s32x_deconvolution.hpp │ │ ├── jit_sve_512_x8s8s32x_conv_kernel.cpp │ │ ├── jit_sve_512_x8s8s32x_conv_kernel.hpp │ │ ├── jit_sve_512_x8s8s32x_convolution.cpp │ │ ├── jit_sve_512_x8s8s32x_convolution.hpp │ │ ├── jit_sve_conv_kernel.cpp │ │ ├── jit_sve_conv_kernel.hpp │ │ ├── jit_sve_convolution.cpp │ │ ├── jit_sve_convolution.hpp │ │ ├── jit_uni_1x1_conv_utils.hpp │ │ ├── jit_uni_batch_normalization.cpp │ │ ├── jit_uni_batch_normalization.hpp │ │ ├── jit_uni_batch_normalization_s8.cpp │ │ ├── jit_uni_batch_normalization_s8.hpp │ │ ├── jit_uni_binary.cpp │ │ ├── jit_uni_binary.hpp │ │ ├── jit_uni_binary_kernel.cpp │ │ ├── jit_uni_binary_kernel.hpp │ │ ├── jit_uni_deconv_zp_pad_str_kernel.cpp │ │ ├── jit_uni_deconv_zp_pad_str_kernel.hpp │ │ ├── jit_uni_dw_conv_kernel_f32.cpp │ │ ├── jit_uni_dw_conv_kernel_f32.hpp │ │ ├── jit_uni_dw_conv_kernel_utils.hpp │ │ ├── jit_uni_dw_convolution.cpp │ │ ├── jit_uni_dw_convolution.hpp │ │ ├── jit_uni_eltwise.cpp │ │ ├── jit_uni_eltwise.hpp │ │ ├── jit_uni_eltwise_int.cpp │ │ ├── jit_uni_eltwise_int.hpp │ │ ├── jit_uni_i8i8_pooling.cpp │ │ ├── jit_uni_i8i8_pooling.hpp │ │ ├── jit_uni_pool_kernel.cpp │ │ ├── jit_uni_pool_kernel.hpp │ │ ├── jit_uni_pooling.cpp │ │ ├── jit_uni_pooling.hpp │ │ ├── jit_uni_reorder.cpp │ │ ├── jit_uni_reorder.hpp │ │ ├── jit_uni_reorder_utils.cpp │ │ ├── 
jit_uni_softmax.cpp │ │ ├── jit_uni_softmax.hpp │ │ ├── matmul │ │ │ ├── acl_lowp_matmul.cpp │ │ │ ├── acl_lowp_matmul.hpp │ │ │ ├── acl_lowp_matmul_sq.cpp │ │ │ ├── acl_lowp_matmul_sq.hpp │ │ │ ├── acl_matmul.cpp │ │ │ ├── acl_matmul.hpp │ │ │ ├── acl_matmul_utils.cpp │ │ │ ├── acl_matmul_utils.hpp │ │ │ ├── brgemm_matmul.cpp │ │ │ ├── brgemm_matmul.hpp │ │ │ ├── brgemm_matmul_copy_utils.cpp │ │ │ ├── brgemm_matmul_copy_utils.hpp │ │ │ ├── brgemm_matmul_reorders.cpp │ │ │ ├── brgemm_matmul_reorders.hpp │ │ │ ├── brgemm_matmul_utils.cpp │ │ │ ├── brgemm_matmul_utils.hpp │ │ │ ├── jit_int8_kernel_types.hpp │ │ │ ├── jit_int8_matmul.cpp │ │ │ ├── jit_int8_matmul.hpp │ │ │ ├── jit_int8_matmul_utils.cpp │ │ │ └── jit_int8_matmul_utils.hpp │ │ ├── shuffle │ │ │ ├── jit_uni_shuffle.cpp │ │ │ ├── jit_uni_shuffle.hpp │ │ │ ├── jit_uni_shuffle_kernel.cpp │ │ │ └── jit_uni_shuffle_kernel.hpp │ │ └── utils │ │ │ ├── jit_io_helper.cpp │ │ │ └── jit_io_helper.hpp │ ├── bfloat16.cpp │ ├── binary_injector_utils.cpp │ ├── binary_injector_utils.hpp │ ├── cpu_batch_normalization_list.cpp │ ├── cpu_batch_normalization_pd.hpp │ ├── cpu_batch_normalization_utils.cpp │ ├── cpu_batch_normalization_utils.hpp │ ├── cpu_binary_list.cpp │ ├── cpu_binary_pd.hpp │ ├── cpu_concat.cpp │ ├── cpu_concat_pd.hpp │ ├── cpu_convolution_list.cpp │ ├── cpu_convolution_pd.hpp │ ├── cpu_deconvolution_list.cpp │ ├── cpu_deconvolution_pd.hpp │ ├── cpu_eltwise_list.cpp │ ├── cpu_eltwise_pd.hpp │ ├── cpu_engine.cpp │ ├── cpu_engine.hpp │ ├── cpu_group_normalization_list.cpp │ ├── cpu_group_normalization_pd.hpp │ ├── cpu_inner_product_list.cpp │ ├── cpu_inner_product_pd.hpp │ ├── cpu_layer_normalization_list.cpp │ ├── cpu_layer_normalization_pd.hpp │ ├── cpu_lrn_list.cpp │ ├── cpu_lrn_pd.hpp │ ├── cpu_memory_storage.hpp │ ├── cpu_pooling_list.cpp │ ├── cpu_pooling_pd.hpp │ ├── cpu_prelu_list.cpp │ ├── cpu_prelu_pd.hpp │ ├── cpu_primitive.hpp │ ├── cpu_reduction_list.cpp │ ├── cpu_reduction_pd.hpp │ ├── 
cpu_resampling_list.cpp │ ├── cpu_resampling_pd.hpp │ ├── cpu_rnn_list.cpp │ ├── cpu_shuffle_list.cpp │ ├── cpu_shuffle_pd.hpp │ ├── cpu_softmax_list.cpp │ ├── cpu_softmax_pd.hpp │ ├── cpu_stream.hpp │ ├── cpu_sum.cpp │ ├── cpu_sum_pd.hpp │ ├── dw_convolution_utils.hpp │ ├── float16.cpp │ ├── gemm │ │ ├── bf16 │ │ │ ├── ref_gemm_bf16.cpp │ │ │ └── ref_gemm_bf16.hpp │ │ ├── f32 │ │ │ ├── gemm_utils_f32.cpp │ │ │ ├── gemm_utils_f32.hpp │ │ │ ├── ref_gemm_f32.cpp │ │ │ └── ref_gemm_f32.hpp │ │ ├── gemm.cpp │ │ ├── gemm.hpp │ │ ├── gemm_msan_unpoison.hpp │ │ ├── gemm_pack.cpp │ │ ├── gemm_pack.hpp │ │ ├── os_blas.hpp │ │ └── s8x8s32 │ │ │ ├── ref_gemm_s8x8s32.cpp │ │ │ ├── ref_gemm_s8x8s32.hpp │ │ │ ├── simple_gemm_s8s8s32.cpp │ │ │ └── simple_gemm_s8s8s32.hpp │ ├── gemm_convolution.cpp │ ├── gemm_convolution.hpp │ ├── gemm_convolution_utils.cpp │ ├── gemm_convolution_utils.hpp │ ├── gemm_inner_product.cpp │ ├── gemm_inner_product.hpp │ ├── gemm_inner_product_utils.cpp │ ├── gemm_inner_product_utils.hpp │ ├── gemm_x8s8s32x_conv_zp_src_pad_comp.cpp │ ├── gemm_x8s8s32x_conv_zp_src_pad_comp.hpp │ ├── gemm_x8s8s32x_convolution.cpp │ ├── gemm_x8s8s32x_convolution.hpp │ ├── gemm_x8s8s32x_convolution_utils.cpp │ ├── gemm_x8s8s32x_convolution_utils.hpp │ ├── gemm_x8s8s32x_inner_product.cpp │ ├── gemm_x8s8s32x_inner_product.hpp │ ├── jit_utils │ │ ├── jit_utils.cpp │ │ ├── jit_utils.hpp │ │ └── linux_perf │ │ │ ├── README.md │ │ │ ├── linux_perf.cpp │ │ │ └── linux_perf.hpp │ ├── matmul │ │ ├── cpu_matmul_list.cpp │ │ ├── cpu_matmul_pd.hpp │ │ ├── gemm_based_common.hpp │ │ ├── gemm_bf16_matmul.cpp │ │ ├── gemm_bf16_matmul.hpp │ │ ├── gemm_f32_matmul.cpp │ │ ├── gemm_f32_matmul.hpp │ │ ├── gemm_x8s8s32x_matmul.cpp │ │ ├── gemm_x8s8s32x_matmul.hpp │ │ ├── matmul_utils.hpp │ │ ├── ref_matmul.cpp │ │ ├── ref_matmul.hpp │ │ ├── ref_matmul_int8.cpp │ │ ├── ref_matmul_int8.hpp │ │ ├── ref_sparse_matmul.cpp │ │ └── ref_sparse_matmul.hpp │ ├── nchw_pooling.cpp │ ├── nchw_pooling.hpp │ 
├── ncsp_batch_normalization.cpp │ ├── ncsp_batch_normalization.hpp │ ├── ncsp_group_normalization.cpp │ ├── ncsp_group_normalization.hpp │ ├── nhwc_pooling.cpp │ ├── nhwc_pooling.hpp │ ├── nspc_batch_normalization.cpp │ ├── nspc_batch_normalization.hpp │ ├── platform.cpp │ ├── platform.hpp │ ├── ppc64 │ │ ├── CMakeLists.txt │ │ ├── gemm │ │ │ ├── gemm_driver.cpp │ │ │ ├── gemm_driver.hpp │ │ │ ├── gemm_info.cpp │ │ │ ├── gemm_info.hpp │ │ │ ├── gemm_pack_storage.hpp │ │ │ ├── gemm_partition.hpp │ │ │ ├── gemm_threading.hpp │ │ │ └── gemm_utils.hpp │ │ ├── ppc64_gemm_driver.hpp │ │ ├── ppc64_gemm_reorder.cpp │ │ ├── ppc64_gemm_reorder.hpp │ │ ├── ppc64_gemm_s8x8s32.cpp │ │ └── ppc64_gemm_s8x8s32.hpp │ ├── primitive_attr_postops.cpp │ ├── primitive_attr_postops.hpp │ ├── ref_batch_normalization.cpp │ ├── ref_batch_normalization.hpp │ ├── ref_binary.cpp │ ├── ref_binary.hpp │ ├── ref_concat.hpp │ ├── ref_convolution.cpp │ ├── ref_convolution.hpp │ ├── ref_convolution_int8.cpp │ ├── ref_convolution_int8.hpp │ ├── ref_convolution_utils.hpp │ ├── ref_deconvolution.cpp │ ├── ref_deconvolution.hpp │ ├── ref_eltwise.cpp │ ├── ref_eltwise.hpp │ ├── ref_fused_convolution.hpp │ ├── ref_group_normalization.cpp │ ├── ref_group_normalization.hpp │ ├── ref_inner_product.cpp │ ├── ref_inner_product.hpp │ ├── ref_inner_product_int8.cpp │ ├── ref_inner_product_int8.hpp │ ├── ref_inner_product_utils.hpp │ ├── ref_io_helper.hpp │ ├── ref_layer_normalization.cpp │ ├── ref_layer_normalization.hpp │ ├── ref_lrn.cpp │ ├── ref_lrn.hpp │ ├── ref_pooling.cpp │ ├── ref_pooling.hpp │ ├── ref_prelu.cpp │ ├── ref_prelu.hpp │ ├── ref_reduction.cpp │ ├── ref_reduction.hpp │ ├── ref_resampling.cpp │ ├── ref_resampling.hpp │ ├── ref_shuffle.cpp │ ├── ref_shuffle.hpp │ ├── ref_softmax.cpp │ ├── ref_softmax.hpp │ ├── ref_sum.hpp │ ├── reorder │ │ ├── cpu_reorder.cpp │ │ ├── cpu_reorder.hpp │ │ ├── cpu_reorder_comp_bf16_s8.cpp │ │ ├── cpu_reorder_comp_f32_s8.cpp │ │ ├── cpu_reorder_comp_s8_s8.cpp │ │ 
├── cpu_reorder_pd.hpp │ │ ├── cpu_reorder_regular_bf16.cpp │ │ ├── cpu_reorder_regular_f16.cpp │ │ ├── cpu_reorder_regular_f32_bf16.cpp │ │ ├── cpu_reorder_regular_f32_f16.cpp │ │ ├── cpu_reorder_regular_f32_f32.cpp │ │ ├── cpu_reorder_regular_f32_fp8.cpp │ │ ├── cpu_reorder_regular_f32_s32.cpp │ │ ├── cpu_reorder_regular_f32_s8.cpp │ │ ├── cpu_reorder_regular_f32_u8.cpp │ │ ├── cpu_reorder_regular_fp4.cpp │ │ ├── cpu_reorder_regular_fp8.cpp │ │ ├── cpu_reorder_regular_s32.cpp │ │ ├── cpu_reorder_regular_s4.cpp │ │ ├── cpu_reorder_regular_s8.cpp │ │ ├── cpu_reorder_regular_u4.cpp │ │ ├── cpu_reorder_regular_u8.cpp │ │ ├── simple_reorder.hpp │ │ └── simple_sparse_reorder.hpp │ ├── resampling_utils.hpp │ ├── rnn │ │ ├── brgemm_cell_common.cpp │ │ ├── cell_common.cpp │ │ ├── cell_gru.cpp │ │ ├── cell_gru_lbr.cpp │ │ ├── cpu_rnn_pd.hpp │ │ ├── postgemm_dispatcher.hpp │ │ ├── ref_postgemm_gru.cpp │ │ ├── ref_postgemm_gru_lbr.cpp │ │ ├── ref_postgemm_lstm.cpp │ │ ├── ref_postgemm_lstm_projection.cpp │ │ ├── ref_postgemm_rnn.cpp │ │ ├── ref_rnn.cpp │ │ ├── ref_rnn.hpp │ │ ├── rnn_reorders.hpp │ │ ├── rnn_utils.cpp │ │ └── rnn_utils.hpp │ ├── rv64 │ │ ├── CMakeLists.txt │ │ ├── rvv_nchw_pooling.cpp │ │ └── rvv_nchw_pooling.hpp │ ├── s390x │ │ ├── CMakeLists.txt │ │ ├── gemm.h │ │ ├── gemmu16.cpp │ │ ├── helpers.h │ │ ├── kernel_s16s16s32.hpp │ │ └── pack.hpp │ ├── scale_utils.cpp │ ├── scale_utils.hpp │ ├── simple_concat.cpp │ ├── simple_concat.hpp │ ├── simple_layer_normalization.cpp │ ├── simple_layer_normalization.hpp │ ├── simple_q10n.hpp │ ├── simple_resampling.cpp │ ├── simple_resampling.hpp │ ├── simple_sum.cpp │ ├── simple_sum.hpp │ ├── sycl │ │ ├── CMakeLists.txt │ │ ├── engine.cpp │ │ ├── engine.hpp │ │ ├── stream.cpp │ │ ├── stream.hpp │ │ ├── stream_cpu_thunk.cpp │ │ ├── stream_cpu_thunk.hpp │ │ ├── stream_submit_cpu_primitive.cpp │ │ └── stream_submit_cpu_primitive.hpp │ ├── ukernel │ │ ├── attr_params.cpp │ │ ├── brgemm.cpp │ │ ├── c_types_map.hpp │ │ └── 
transform.cpp │ ├── x64 │ │ ├── CMakeLists.txt │ │ ├── amx_tile_configure.cpp │ │ ├── amx_tile_configure.hpp │ │ ├── brgemm │ │ │ ├── brgemm.cpp │ │ │ ├── brgemm.hpp │ │ │ ├── brgemm_containers.cpp │ │ │ ├── brgemm_containers.hpp │ │ │ ├── brgemm_types.hpp │ │ │ ├── brgemm_utils.cpp │ │ │ ├── brgemm_utils.hpp │ │ │ ├── jit_brdgmm_kernel.cpp │ │ │ ├── jit_brdgmm_kernel.hpp │ │ │ ├── jit_brgemm_amx_uker.cpp │ │ │ └── jit_brgemm_kernel.cpp │ │ ├── cpu_barrier.cpp │ │ ├── cpu_barrier.hpp │ │ ├── cpu_isa_traits.cpp │ │ ├── cpu_isa_traits.hpp │ │ ├── cpu_reducer.cpp │ │ ├── cpu_reducer.hpp │ │ ├── gemm │ │ │ ├── amx │ │ │ │ ├── jit_avx512_core_amx_copy_kern.cpp │ │ │ │ ├── jit_avx512_core_amx_copy_kern.hpp │ │ │ │ ├── jit_avx512_core_amx_gemm_kern.cpp │ │ │ │ └── jit_avx512_core_amx_gemm_kern.hpp │ │ │ ├── bf16 │ │ │ │ ├── common_s16.hpp │ │ │ │ ├── jit_avx512_core_gemm_bf16bf16f32_kern.cpp │ │ │ │ ├── jit_avx512_core_gemm_bf16bf16f32_kern.hpp │ │ │ │ ├── jit_avx512_core_gemv_bf16bf16f32_kern.cpp │ │ │ │ ├── jit_avx512_core_gemv_bf16bf16f32_kern.hpp │ │ │ │ ├── jit_avx512_core_s16_24x8_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_s16_24x8_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_s16_24x8_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_s16_24x8_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_s16_48x8_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_s16_48x8_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_s16_48x8_copy_bn_kern_autogen.cpp │ │ │ │ └── jit_avx512_core_s16_48x8_copy_bt_kern_autogen.cpp │ │ │ ├── f32 │ │ │ │ ├── common_f32.hpp │ │ │ │ ├── jit_avx2_f32_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx2_f32_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx2_f32_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx2_f32_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx2_kernel_sgemm_kern.cpp │ │ │ │ ├── jit_avx2_kernel_sgemm_kern.hpp │ │ │ │ ├── jit_avx512_common_gemm_f32.cpp │ │ │ │ ├── jit_avx512_common_gemm_f32.hpp │ │ │ │ ├── 
jit_avx512_core_f32_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_f32_copy_at_kern_autogen.hpp │ │ │ │ ├── jit_avx512_core_f32_copy_at_kern_part1_autogen.cpp │ │ │ │ ├── jit_avx512_core_f32_copy_at_kern_part2_autogen.cpp │ │ │ │ ├── jit_avx512_core_f32_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_f32_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_gemm_smalln_tn_f32_kern.cpp │ │ │ │ ├── jit_avx512_core_gemm_smalln_tn_f32_kern.hpp │ │ │ │ ├── jit_avx_f32_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx_f32_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx_f32_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx_f32_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx_gemm_f32.cpp │ │ │ │ ├── jit_avx_gemm_f32.hpp │ │ │ │ ├── jit_avx_gemv_t_f32_kern.cpp │ │ │ │ ├── jit_avx_gemv_t_f32_kern.hpp │ │ │ │ ├── jit_avx_kernel_b0_sgemm_kern_autogen.hpp │ │ │ │ ├── jit_avx_kernel_b0_sgemm_kern_part1_autogen.cpp │ │ │ │ ├── jit_avx_kernel_b0_sgemm_kern_part2_autogen.cpp │ │ │ │ ├── jit_avx_kernel_sgemm_kern_autogen.hpp │ │ │ │ ├── jit_avx_kernel_sgemm_kern_part1_autogen.cpp │ │ │ │ ├── jit_avx_kernel_sgemm_kern_part2_autogen.cpp │ │ │ │ ├── jit_sse41_f32_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_sse41_f32_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_sse41_f32_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_sse41_f32_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_sse41_gemv_n_f32_kern.cpp │ │ │ │ ├── jit_sse41_gemv_n_f32_kern.hpp │ │ │ │ ├── jit_sse41_gemv_t_f32_kern.cpp │ │ │ │ ├── jit_sse41_gemv_t_f32_kern.hpp │ │ │ │ ├── jit_sse41_kernel_b0_sgemm_kern_autogen.cpp │ │ │ │ └── jit_sse41_kernel_sgemm_kern_autogen.cpp │ │ │ ├── gemm_driver.cpp │ │ │ ├── gemm_driver.hpp │ │ │ ├── gemm_info.cpp │ │ │ ├── gemm_info.hpp │ │ │ ├── gemm_pack.cpp │ │ │ ├── gemm_pack.hpp │ │ │ ├── gemm_pack_storage.hpp │ │ │ ├── gemm_partition.hpp │ │ │ ├── gemm_threading.hpp │ │ │ ├── gemm_utils.hpp │ │ │ ├── gemv_driver.cpp │ │ │ ├── gemv_driver.hpp │ │ │ └── s8x8s32 │ │ │ │ ├── common_u8.hpp │ │ │ │ ├── 
jit_avx2_gemm_s8u8s32_kern.cpp │ │ │ │ ├── jit_avx2_gemm_s8u8s32_kern.hpp │ │ │ │ ├── jit_avx2_u8_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx2_u8_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx2_u8_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx2_u8_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx2_u8_copy_sum_an_kern_autogen.cpp │ │ │ │ ├── jit_avx2_u8_copy_sum_at_kern_autogen.cpp │ │ │ │ ├── jit_avx2_u8_copy_sum_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx2_u8_copy_sum_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_sum_an_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_sum_at_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_sum_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx2_vnni_u8_copy_sum_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_gemm_s8u8s32_kern.cpp │ │ │ │ ├── jit_avx512_core_gemm_s8u8s32_kern.hpp │ │ │ │ ├── jit_avx512_core_gemv_s8x8s32.cpp │ │ │ │ ├── jit_avx512_core_gemv_s8x8s32.hpp │ │ │ │ ├── jit_avx512_core_kernel_gemv_s8x8s32_kern.cpp │ │ │ │ ├── jit_avx512_core_kernel_gemv_s8x8s32_kern.hpp │ │ │ │ ├── jit_avx512_core_u8_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_u8_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_u8_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_u8_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_u8_copy_sum_an_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_u8_copy_sum_at_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_u8_copy_sum_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx512_core_u8_copy_sum_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx_kernel_b0_b_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_avx_kernel_b0_c_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_avx_kernel_b0_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_avx_kernel_b0_r_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── 
jit_avx_kernel_b_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_avx_kernel_c_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_avx_kernel_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_avx_kernel_r_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_sum_an_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_sum_at_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_sum_bn_kern_autogen.cpp │ │ │ │ ├── jit_avx_u8_copy_sum_bt_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_b0_b_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_b0_c_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_b0_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_b0_r_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_b_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_c_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_kernel_r_gemm_s8u8s32_kern_autogen.cpp │ │ │ │ ├── jit_sse41_u8_copy_an_kern_autogen.cpp │ │ │ │ ├── jit_sse41_u8_copy_at_kern_autogen.cpp │ │ │ │ ├── jit_sse41_u8_copy_bn_kern_autogen.cpp │ │ │ │ ├── jit_sse41_u8_copy_bt_kern_autogen.cpp │ │ │ │ ├── jit_sse41_u8_copy_sum_an_kern_autogen.cpp │ │ │ │ ├── jit_sse41_u8_copy_sum_at_kern_autogen.cpp │ │ │ │ ├── jit_sse41_u8_copy_sum_bn_kern_autogen.cpp │ │ │ │ └── jit_sse41_u8_copy_sum_bt_kern_autogen.cpp │ │ ├── gemm_bf16_convolution.cpp │ │ ├── gemm_bf16_convolution.hpp │ │ ├── gemm_bf16_inner_product.cpp │ │ ├── gemm_bf16_inner_product.hpp │ │ ├── injectors │ │ │ ├── injector_utils.cpp │ │ │ ├── injector_utils.hpp │ │ │ ├── jit_uni_binary_injector.cpp │ │ │ ├── jit_uni_binary_injector.hpp │ │ │ ├── jit_uni_eltwise_injector.cpp │ │ │ ├── jit_uni_eltwise_injector.hpp │ │ │ ├── jit_uni_postops_injector.cpp │ │ │ └── jit_uni_postops_injector.hpp │ │ ├── 
ip_convolution.cpp │ │ ├── ip_convolution.hpp │ │ ├── jit_avx2_1x1_conv_kernel_f32.cpp │ │ ├── jit_avx2_1x1_conv_kernel_f32.hpp │ │ ├── jit_avx2_1x1_convolution.cpp │ │ ├── jit_avx2_1x1_convolution.hpp │ │ ├── jit_avx2_conv_kernel_f32.cpp │ │ ├── jit_avx2_conv_kernel_f32.hpp │ │ ├── jit_avx2_convolution.cpp │ │ ├── jit_avx2_convolution.hpp │ │ ├── jit_avx512_common_1x1_conv_kernel.cpp │ │ ├── jit_avx512_common_1x1_conv_kernel.hpp │ │ ├── jit_avx512_common_1x1_convolution.cpp │ │ ├── jit_avx512_common_1x1_convolution.hpp │ │ ├── jit_avx512_common_conv_kernel.cpp │ │ ├── jit_avx512_common_conv_kernel.hpp │ │ ├── jit_avx512_common_convolution.cpp │ │ ├── jit_avx512_common_convolution.hpp │ │ ├── jit_avx512_core_amx_1x1_conv_kernel.cpp │ │ ├── jit_avx512_core_amx_1x1_conv_kernel.hpp │ │ ├── jit_avx512_core_amx_1x1_convolution.cpp │ │ ├── jit_avx512_core_amx_1x1_convolution.hpp │ │ ├── jit_avx512_core_amx_conv_kernel.cpp │ │ ├── jit_avx512_core_amx_conv_kernel.hpp │ │ ├── jit_avx512_core_amx_conv_utils.hpp │ │ ├── jit_avx512_core_amx_convolution.cpp │ │ ├── jit_avx512_core_amx_convolution.hpp │ │ ├── jit_avx512_core_amx_deconvolution.cpp │ │ ├── jit_avx512_core_amx_deconvolution.hpp │ │ ├── jit_avx512_core_bf16_1x1_conv_kernel.cpp │ │ ├── jit_avx512_core_bf16_1x1_conv_kernel.hpp │ │ ├── jit_avx512_core_bf16_1x1_convolution.cpp │ │ ├── jit_avx512_core_bf16_1x1_convolution.hpp │ │ ├── jit_avx512_core_bf16_conv_kernel.cpp │ │ ├── jit_avx512_core_bf16_conv_kernel.hpp │ │ ├── jit_avx512_core_bf16_convolution.cpp │ │ ├── jit_avx512_core_bf16_convolution.hpp │ │ ├── jit_avx512_core_bf16_dw_conv_kernel.cpp │ │ ├── jit_avx512_core_bf16_dw_conv_kernel.hpp │ │ ├── jit_avx512_core_bf16cvt.hpp │ │ ├── jit_avx512_core_f16_dw_conv_kernel.cpp │ │ ├── jit_avx512_core_f16_dw_conv_kernel.hpp │ │ ├── jit_avx512_core_fp16cvt.cpp │ │ ├── jit_avx512_core_fp16cvt.hpp │ │ ├── jit_avx512_core_fp8cvt.cpp │ │ ├── jit_avx512_core_fp8cvt.hpp │ │ ├── jit_avx512_core_resampling.cpp │ │ ├── 
jit_avx512_core_resampling.hpp │ │ ├── jit_avx512_core_scale_precompute.cpp │ │ ├── jit_avx512_core_scale_precompute.hpp │ │ ├── jit_avx512_core_x8s8s32x_1x1_conv_kernel.cpp │ │ ├── jit_avx512_core_x8s8s32x_1x1_conv_kernel.hpp │ │ ├── jit_avx512_core_x8s8s32x_1x1_convolution.cpp │ │ ├── jit_avx512_core_x8s8s32x_1x1_convolution.hpp │ │ ├── jit_avx512_core_x8s8s32x_1x1_deconvolution.hpp │ │ ├── jit_avx512_core_x8s8s32x_conv_kernel.cpp │ │ ├── jit_avx512_core_x8s8s32x_conv_kernel.hpp │ │ ├── jit_avx512_core_x8s8s32x_convolution.cpp │ │ ├── jit_avx512_core_x8s8s32x_convolution.hpp │ │ ├── jit_avx512_core_x8s8s32x_deconvolution.cpp │ │ ├── jit_avx512_core_x8s8s32x_deconvolution.hpp │ │ ├── jit_avx512_sparse_decompress_kernel.cpp │ │ ├── jit_avx512_sparse_decompress_kernel.hpp │ │ ├── jit_brdgmm_dw_conv.cpp │ │ ├── jit_brdgmm_dw_conv.hpp │ │ ├── jit_brgemm_1x1_conv.cpp │ │ ├── jit_brgemm_1x1_conv.hpp │ │ ├── jit_brgemm_conv.cpp │ │ ├── jit_brgemm_conv.hpp │ │ ├── jit_brgemm_conv_bwd.cpp │ │ ├── jit_brgemm_conv_bwd.hpp │ │ ├── jit_brgemm_conv_bwd_copy_kernel.cpp │ │ ├── jit_brgemm_conv_bwd_copy_kernel.hpp │ │ ├── jit_brgemm_conv_bwd_strided.cpp │ │ ├── jit_brgemm_conv_bwd_strided.hpp │ │ ├── jit_brgemm_conv_bwd_trans_kernel.cpp │ │ ├── jit_brgemm_conv_bwd_trans_kernel.hpp │ │ ├── jit_brgemm_conv_bwd_utils.cpp │ │ ├── jit_brgemm_conv_bwd_utils.hpp │ │ ├── jit_brgemm_conv_bwd_w.cpp │ │ ├── jit_brgemm_conv_bwd_w.hpp │ │ ├── jit_brgemm_conv_comp_pad_kernel.cpp │ │ ├── jit_brgemm_conv_comp_pad_kernel.hpp │ │ ├── jit_brgemm_conv_trans_kernel.cpp │ │ ├── jit_brgemm_conv_trans_kernel.hpp │ │ ├── jit_brgemm_conv_utils.cpp │ │ ├── jit_brgemm_conv_utils.hpp │ │ ├── jit_brgemm_deconv.cpp │ │ ├── jit_brgemm_deconv.hpp │ │ ├── jit_brgemm_inner_product.cpp │ │ ├── jit_brgemm_inner_product.hpp │ │ ├── jit_brgemm_inner_product_utils.cpp │ │ ├── jit_brgemm_inner_product_utils.hpp │ │ ├── jit_brgemm_post_ops.cpp │ │ ├── jit_brgemm_post_ops.hpp │ │ ├── jit_brgemm_primitive_conf.cpp │ │ ├── 
jit_brgemm_primitive_conf.hpp │ │ ├── jit_brgemm_transpose_utils.cpp │ │ ├── jit_brgemm_transpose_utils.hpp │ │ ├── jit_gemm_inner_product_utils.cpp │ │ ├── jit_gemm_inner_product_utils.hpp │ │ ├── jit_gemm_x8s8s32x_conv_zp_src_pad_comp.cpp │ │ ├── jit_gemm_x8s8s32x_conv_zp_src_pad_comp.hpp │ │ ├── jit_gemm_x8s8s32x_convolution_utils.cpp │ │ ├── jit_gemm_x8s8s32x_convolution_utils.hpp │ │ ├── jit_generator.cpp │ │ ├── jit_generator.hpp │ │ ├── jit_primitive_conf.hpp │ │ ├── jit_sse41_1x1_conv_kernel_f32.cpp │ │ ├── jit_sse41_1x1_conv_kernel_f32.hpp │ │ ├── jit_sse41_1x1_convolution.cpp │ │ ├── jit_sse41_1x1_convolution.hpp │ │ ├── jit_sse41_conv_kernel_f32.cpp │ │ ├── jit_sse41_conv_kernel_f32.hpp │ │ ├── jit_sse41_convolution.cpp │ │ ├── jit_sse41_convolution.hpp │ │ ├── jit_transpose_utils.cpp │ │ ├── jit_transpose_utils.hpp │ │ ├── jit_uni_1x1_conv_utils.hpp │ │ ├── jit_uni_batch_normalization.cpp │ │ ├── jit_uni_batch_normalization.hpp │ │ ├── jit_uni_batch_normalization_s8.cpp │ │ ├── jit_uni_batch_normalization_s8.hpp │ │ ├── jit_uni_binary.cpp │ │ ├── jit_uni_binary.hpp │ │ ├── jit_uni_binary_kernel.cpp │ │ ├── jit_uni_binary_kernel.hpp │ │ ├── jit_uni_convert_xf16.cpp │ │ ├── jit_uni_convert_xf16.hpp │ │ ├── jit_uni_deconv_zp_pad_str_kernel.cpp │ │ ├── jit_uni_deconv_zp_pad_str_kernel.hpp │ │ ├── jit_uni_dw_conv_kernel_f32.cpp │ │ ├── jit_uni_dw_conv_kernel_f32.hpp │ │ ├── jit_uni_dw_conv_kernel_utils.cpp │ │ ├── jit_uni_dw_conv_kernel_utils.hpp │ │ ├── jit_uni_dw_convolution.cpp │ │ ├── jit_uni_dw_convolution.hpp │ │ ├── jit_uni_eltwise.cpp │ │ ├── jit_uni_eltwise.hpp │ │ ├── jit_uni_eltwise_int.cpp │ │ ├── jit_uni_eltwise_int.hpp │ │ ├── jit_uni_group_normalization.cpp │ │ ├── jit_uni_group_normalization.hpp │ │ ├── jit_uni_i8i8_pooling.cpp │ │ ├── jit_uni_i8i8_pooling.hpp │ │ ├── jit_uni_instance_normalization.cpp │ │ ├── jit_uni_instance_normalization.hpp │ │ ├── jit_uni_layer_normalization.cpp │ │ ├── jit_uni_layer_normalization.hpp │ │ ├── 
jit_uni_ncsp_convolution.cpp │ │ ├── jit_uni_ncsp_convolution.hpp │ │ ├── jit_uni_pool_kernel.cpp │ │ ├── jit_uni_pool_kernel.hpp │ │ ├── jit_uni_pooling.cpp │ │ ├── jit_uni_pooling.hpp │ │ ├── jit_uni_reduction.cpp │ │ ├── jit_uni_reduction.hpp │ │ ├── jit_uni_reduction_kernel.cpp │ │ ├── jit_uni_reduction_kernel.hpp │ │ ├── jit_uni_reorder.cpp │ │ ├── jit_uni_reorder.hpp │ │ ├── jit_uni_reorder_direct_copy.cpp │ │ ├── jit_uni_reorder_direct_copy.hpp │ │ ├── jit_uni_reorder_utils.cpp │ │ ├── jit_uni_resampling.cpp │ │ ├── jit_uni_resampling.hpp │ │ ├── jit_uni_resampling_kernel.cpp │ │ ├── jit_uni_resampling_kernel.hpp │ │ ├── jit_uni_softmax.cpp │ │ ├── jit_uni_softmax.hpp │ │ ├── jit_uni_tbb_batch_normalization.cpp │ │ ├── jit_uni_tbb_batch_normalization.hpp │ │ ├── jit_uni_x8s8s32x_1x1_conv_kernel.cpp │ │ ├── jit_uni_x8s8s32x_1x1_conv_kernel.hpp │ │ ├── jit_uni_x8s8s32x_1x1_convolution.cpp │ │ ├── jit_uni_x8s8s32x_1x1_convolution.hpp │ │ ├── jit_uni_x8s8s32x_1x1_deconvolution.hpp │ │ ├── jit_uni_x8s8s32x_conv_kernel.cpp │ │ ├── jit_uni_x8s8s32x_conv_kernel.hpp │ │ ├── jit_uni_x8s8s32x_convolution.cpp │ │ ├── jit_uni_x8s8s32x_convolution.hpp │ │ ├── jit_uni_x8s8s32x_deconvolution.cpp │ │ ├── jit_uni_x8s8s32x_deconvolution.hpp │ │ ├── jit_uni_xf16_sum.cpp │ │ ├── jit_uni_xf16_sum.hpp │ │ ├── lrn │ │ │ ├── jit_avx512_common_lrn.cpp │ │ │ ├── jit_avx512_common_lrn.hpp │ │ │ ├── jit_avx512_common_lrn_bwd_base.cpp │ │ │ ├── jit_avx512_common_lrn_bwd_base.hpp │ │ │ ├── jit_avx512_common_lrn_bwd_blocked.cpp │ │ │ ├── jit_avx512_common_lrn_bwd_blocked.hpp │ │ │ ├── jit_avx512_common_lrn_bwd_nhwc.cpp │ │ │ ├── jit_avx512_common_lrn_bwd_nhwc.hpp │ │ │ ├── jit_avx512_common_lrn_fwd_base.cpp │ │ │ ├── jit_avx512_common_lrn_fwd_base.hpp │ │ │ ├── jit_avx512_common_lrn_fwd_blocked.cpp │ │ │ ├── jit_avx512_common_lrn_fwd_blocked.hpp │ │ │ ├── jit_avx512_common_lrn_fwd_nhwc.cpp │ │ │ ├── jit_avx512_common_lrn_fwd_nhwc.hpp │ │ │ ├── jit_avx512_common_lrn_utils.hpp │ │ │ ├── 
jit_uni_lrn.cpp │ │ │ ├── jit_uni_lrn.hpp │ │ │ ├── jit_uni_lrn_kernel.cpp │ │ │ ├── jit_uni_lrn_kernel.hpp │ │ │ ├── lrn_avx512_blocked_executor.hpp │ │ │ ├── lrn_avx512_nhwc_executor.hpp │ │ │ ├── lrn_executor.hpp │ │ │ └── lrn_executor_factory.hpp │ │ ├── matmul │ │ │ ├── amx_blocking_heuristics.cpp │ │ │ ├── amx_blocking_heuristics.hpp │ │ │ ├── brgemm_matmul.cpp │ │ │ ├── brgemm_matmul.hpp │ │ │ ├── brgemm_matmul_copy_utils.cpp │ │ │ ├── brgemm_matmul_copy_utils.hpp │ │ │ ├── brgemm_matmul_reorders.cpp │ │ │ ├── brgemm_matmul_reorders.hpp │ │ │ ├── brgemm_matmul_utils.cpp │ │ │ ├── brgemm_matmul_utils.hpp │ │ │ ├── jit_uni_sparse_matmul.cpp │ │ │ └── jit_uni_sparse_matmul.hpp │ │ ├── matmul_inner_product.cpp │ │ ├── matmul_inner_product.hpp │ │ ├── prelu │ │ │ ├── jit_prelu_backward.cpp │ │ │ ├── jit_prelu_backward.hpp │ │ │ ├── jit_prelu_base_kernel.cpp │ │ │ ├── jit_prelu_base_kernel.hpp │ │ │ ├── jit_prelu_forward.cpp │ │ │ ├── jit_prelu_forward.hpp │ │ │ ├── jit_prelu_reduction_kernel.cpp │ │ │ ├── jit_prelu_reduction_kernel.hpp │ │ │ ├── jit_prelu_utils.cpp │ │ │ ├── jit_prelu_utils.hpp │ │ │ ├── jit_uni_prelu_backward_kernel.cpp │ │ │ ├── jit_uni_prelu_backward_kernel.hpp │ │ │ ├── jit_uni_prelu_forward_kernel.cpp │ │ │ └── jit_uni_prelu_forward_kernel.hpp │ │ ├── rnn │ │ │ ├── brgemm_cell_common_bwd.cpp │ │ │ ├── brgemm_cell_common_bwd.hpp │ │ │ ├── brgemm_cell_common_fwd.cpp │ │ │ ├── brgemm_cell_common_fwd.hpp │ │ │ ├── brgemm_cell_common_reorders.cpp │ │ │ ├── brgemm_cell_common_reorders.hpp │ │ │ ├── brgemm_cell_common_utils.cpp │ │ │ ├── brgemm_cell_common_utils.hpp │ │ │ ├── jit_brgemm_transpose_single_row.cpp │ │ │ ├── jit_brgemm_transpose_single_row.hpp │ │ │ ├── jit_diff_weights_peephole.cpp │ │ │ ├── jit_diff_weights_peephole.hpp │ │ │ ├── jit_gates_reduction.cpp │ │ │ ├── jit_gates_reduction.hpp │ │ │ ├── jit_uni_gru_cell_postgemm_1_bwd.hpp │ │ │ ├── jit_uni_gru_cell_postgemm_1_fwd.hpp │ │ │ ├── jit_uni_gru_cell_postgemm_2_bwd.hpp │ │ │ ├── 
jit_uni_gru_cell_postgemm_2_fwd.hpp │ │ │ ├── jit_uni_gru_lbr_cell_postgemm_bwd.hpp │ │ │ ├── jit_uni_gru_lbr_cell_postgemm_fwd.hpp │ │ │ ├── jit_uni_lstm_cell_postgemm.hpp │ │ │ ├── jit_uni_lstm_cell_postgemm_bwd.hpp │ │ │ ├── jit_uni_lstm_cell_postgemm_fwd.hpp │ │ │ ├── jit_uni_lstm_cell_projection_postgemm_fwd.hpp │ │ │ ├── jit_uni_rnn_cell_postgemm_bwd.hpp │ │ │ ├── jit_uni_rnn_cell_postgemm_fwd.hpp │ │ │ ├── jit_uni_rnn_common_postgemm.hpp │ │ │ ├── rnn_brgemm_utils.cpp │ │ │ └── rnn_brgemm_utils.hpp │ │ ├── shuffle │ │ │ ├── jit_uni_shuffle.cpp │ │ │ ├── jit_uni_shuffle.hpp │ │ │ ├── jit_uni_shuffle_kernel.cpp │ │ │ └── jit_uni_shuffle_kernel.hpp │ │ ├── ukernel │ │ │ ├── attr_params.cpp │ │ │ ├── attr_params.hpp │ │ │ ├── brgemm.cpp │ │ │ ├── brgemm.hpp │ │ │ ├── transform.cpp │ │ │ └── transform.hpp │ │ └── utils │ │ │ ├── jit_io_helper.cpp │ │ │ └── jit_io_helper.hpp │ ├── zero_point_utils.cpp │ └── zero_point_utils.hpp ├── gpu │ ├── CMakeLists.txt │ ├── README.md │ ├── amd │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── engine.cpp │ │ ├── engine.hpp │ │ ├── miopen_batch_normalization.cpp │ │ ├── miopen_batch_normalization.hpp │ │ ├── miopen_batch_normalization_executor.hpp │ │ ├── miopen_batch_normalization_impl.hpp │ │ ├── miopen_binary.cpp │ │ ├── miopen_binary.hpp │ │ ├── miopen_binary_impl.hpp │ │ ├── miopen_conv_filter_adjustment_base.hpp │ │ ├── miopen_convolution.cpp │ │ ├── miopen_convolution.hpp │ │ ├── miopen_convolution_impl.hpp │ │ ├── miopen_convolution_pd.hpp │ │ ├── miopen_deconvolution.cpp │ │ ├── miopen_deconvolution.hpp │ │ ├── miopen_deconvolution_impl.hpp │ │ ├── miopen_eltwise.cpp │ │ ├── miopen_eltwise.hpp │ │ ├── miopen_eltwise_impl.hpp │ │ ├── miopen_gemm_inner_product.hpp │ │ ├── miopen_gemm_inner_product_impl.hpp │ │ ├── miopen_inner_product.cpp │ │ ├── miopen_inner_product.hpp │ │ ├── miopen_inner_product_impl.hpp │ │ ├── miopen_lrn.cpp │ │ ├── miopen_lrn.hpp │ │ ├── miopen_lrn_impl.hpp │ │ ├── miopen_matmul.cpp │ │ ├── 
miopen_matmul.hpp │ │ ├── miopen_matmul_executor.hpp │ │ ├── miopen_matmul_impl.hpp │ │ ├── miopen_pooling.cpp │ │ ├── miopen_pooling.hpp │ │ ├── miopen_pooling_impl.hpp │ │ ├── miopen_reduction.cpp │ │ ├── miopen_reduction.hpp │ │ ├── miopen_reduction_impl.hpp │ │ ├── miopen_reorder.cpp │ │ ├── miopen_reorder.hpp │ │ ├── miopen_reorder_impl.hpp │ │ ├── miopen_softmax.cpp │ │ ├── miopen_softmax.hpp │ │ ├── miopen_softmax_impl.hpp │ │ ├── stream.cpp │ │ ├── stream.hpp │ │ ├── sycl_hip_compat.cpp │ │ ├── sycl_hip_compat.hpp │ │ ├── sycl_hip_scoped_context.cpp │ │ ├── sycl_hip_scoped_context.hpp │ │ ├── sycl_hip_utils.cpp │ │ └── sycl_hip_utils.hpp │ ├── generic │ │ ├── CMakeLists.txt │ │ ├── convolution_deconvolution.hpp │ │ ├── cross_engine_reorder.cpp │ │ ├── cross_engine_reorder.hpp │ │ ├── direct_copy.hpp │ │ ├── ref_concat.hpp │ │ ├── ref_sum.hpp │ │ └── sycl │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── batch_normalizations_kernels.hpp │ │ │ ├── binary_kernels.hpp │ │ │ ├── convolution_kernels.hpp │ │ │ ├── eltwise_kernels.hpp │ │ │ ├── engine.cpp │ │ │ ├── engine.hpp │ │ │ ├── group_normalization_kernel.hpp │ │ │ ├── layer_normalizations_kernels.hpp │ │ │ ├── lrn_kernels.hpp │ │ │ ├── matmul_kernels.hpp │ │ │ ├── pooling_kernels.hpp │ │ │ ├── prelu_kernels.hpp │ │ │ ├── reduction_kernels.hpp │ │ │ ├── ref_batch_normalization.cpp │ │ │ ├── ref_batch_normalization.hpp │ │ │ ├── ref_binary.cpp │ │ │ ├── ref_binary.hpp │ │ │ ├── ref_convolution.cpp │ │ │ ├── ref_convolution.hpp │ │ │ ├── ref_deconvolution.cpp │ │ │ ├── ref_deconvolution.hpp │ │ │ ├── ref_eltwise.cpp │ │ │ ├── ref_eltwise.hpp │ │ │ ├── ref_group_normalization.cpp │ │ │ ├── ref_group_normalization.hpp │ │ │ ├── ref_inner_product.cpp │ │ │ ├── ref_inner_product.hpp │ │ │ ├── ref_layer_normalizations.cpp │ │ │ ├── ref_layer_normalizations.hpp │ │ │ ├── ref_lrn.cpp │ │ │ ├── ref_lrn.hpp │ │ │ ├── ref_matmul.cpp │ │ │ ├── ref_matmul.hpp │ │ │ ├── ref_pooling.cpp │ │ │ ├── ref_pooling.hpp │ │ │ 
├── ref_prelu.cpp │ │ │ ├── ref_prelu.hpp │ │ │ ├── ref_reduction.cpp │ │ │ ├── ref_reduction.hpp │ │ │ ├── ref_reorder.cpp │ │ │ ├── ref_reorder.hpp │ │ │ ├── ref_resampling.cpp │ │ │ ├── ref_resampling.hpp │ │ │ ├── ref_shuffle.cpp │ │ │ ├── ref_shuffle.hpp │ │ │ ├── ref_softmax.cpp │ │ │ ├── ref_softmax.hpp │ │ │ ├── ref_sum.cpp │ │ │ ├── ref_sum.hpp │ │ │ ├── ref_sum_many_inputs.cpp │ │ │ ├── ref_sum_many_inputs.hpp │ │ │ ├── reorder_kernels.hpp │ │ │ ├── resampling_kernels.hpp │ │ │ ├── resampling_utils.hpp │ │ │ ├── rnn │ │ │ ├── cell_common.cpp │ │ │ ├── ref_rnn.cpp │ │ │ ├── ref_rnn.hpp │ │ │ ├── rnn_kernels.hpp │ │ │ ├── rnn_utils.cpp │ │ │ └── rnn_utils.hpp │ │ │ ├── shuffle_kernels.hpp │ │ │ ├── simple_reduction.cpp │ │ │ ├── simple_reduction.hpp │ │ │ ├── simple_reduction_kernels.hpp │ │ │ ├── softmax_kernels.hpp │ │ │ ├── stream.cpp │ │ │ ├── stream.hpp │ │ │ ├── sum_kernels.hpp │ │ │ ├── sycl_gpu_kernel.cpp │ │ │ ├── sycl_gpu_kernel.hpp │ │ │ ├── sycl_gpu_primitive.hpp │ │ │ ├── sycl_io_helper.hpp │ │ │ ├── sycl_math_utils.hpp │ │ │ ├── sycl_post_ops.hpp │ │ │ ├── sycl_primitive_conf.hpp │ │ │ ├── sycl_q10n.hpp │ │ │ └── sycl_utils.hpp │ ├── gpu_batch_normalization_list.cpp │ ├── gpu_batch_normalization_pd.hpp │ ├── gpu_binary_list.cpp │ ├── gpu_binary_pd.hpp │ ├── gpu_concat_list.cpp │ ├── gpu_concat_pd.hpp │ ├── gpu_convolution_list.cpp │ ├── gpu_convolution_pd.hpp │ ├── gpu_deconvolution_list.cpp │ ├── gpu_deconvolution_pd.hpp │ ├── gpu_eltwise_list.cpp │ ├── gpu_eltwise_pd.hpp │ ├── gpu_engine.hpp │ ├── gpu_gemm_list.cpp │ ├── gpu_gemm_pd.hpp │ ├── gpu_group_normalization_list.cpp │ ├── gpu_impl_list.cpp │ ├── gpu_impl_list.hpp │ ├── gpu_inner_product_list.cpp │ ├── gpu_inner_product_pd.hpp │ ├── gpu_layer_normalization_list.cpp │ ├── gpu_layer_normalization_pd.hpp │ ├── gpu_lrn_list.cpp │ ├── gpu_lrn_pd.hpp │ ├── gpu_matmul_list.cpp │ ├── gpu_matmul_pd.hpp │ ├── gpu_pooling_list.cpp │ ├── gpu_pooling_pd.hpp │ ├── gpu_prelu_list.cpp │ ├── 
gpu_prelu_pd.hpp │ ├── gpu_primitive.hpp │ ├── gpu_reduction_list.cpp │ ├── gpu_reduction_pd.hpp │ ├── gpu_reorder_list.cpp │ ├── gpu_reorder_pd.cpp │ ├── gpu_reorder_pd.hpp │ ├── gpu_resampling_list.cpp │ ├── gpu_resampling_pd.hpp │ ├── gpu_resource.hpp │ ├── gpu_rnn_list.cpp │ ├── gpu_rnn_pd.hpp │ ├── gpu_sdpa_list.cpp │ ├── gpu_shuffle_list.cpp │ ├── gpu_shuffle_pd.hpp │ ├── gpu_softmax_list.cpp │ ├── gpu_softmax_pd.hpp │ ├── gpu_stream.hpp │ ├── gpu_sum_list.cpp │ ├── gpu_sum_pd.hpp │ ├── gpu_utils.hpp │ ├── gpu_zero_pad_list.cpp │ ├── gpu_zero_pad_pd.hpp │ ├── gpu_zero_points_conv.cpp │ ├── gpu_zero_points_conv.hpp │ ├── intel │ │ ├── CMakeLists.txt │ │ ├── block_structure.cpp │ │ ├── block_structure.hpp │ │ ├── compute │ │ │ ├── CMakeLists.txt │ │ │ ├── block_manipulation.cpp │ │ │ ├── block_manipulation.hpp │ │ │ ├── compute_engine.cpp │ │ │ ├── compute_engine.hpp │ │ │ ├── compute_stream.cpp │ │ │ ├── compute_stream.hpp │ │ │ ├── data_type_converter.hpp │ │ │ ├── device_info.cpp │ │ │ ├── device_info.hpp │ │ │ ├── dispatch.cpp │ │ │ ├── dispatch.hpp │ │ │ ├── dispatch_reusable.cpp │ │ │ ├── dispatch_reusable.hpp │ │ │ ├── kernel.hpp │ │ │ ├── kernel_arg_list.hpp │ │ │ ├── kernel_ctx.cpp │ │ │ ├── kernel_ctx.hpp │ │ │ ├── utils.hpp │ │ │ ├── zero_pool.cpp │ │ │ └── zero_pool.hpp │ │ ├── config.hpp │ │ ├── gemm │ │ │ ├── gpu_gemm.hpp │ │ │ └── gpu_gemm_exec_types.hpp │ │ ├── gpu_post_ops.hpp │ │ ├── gpu_primitive.hpp │ │ ├── gpu_primitive_attr.hpp │ │ ├── jit │ │ │ ├── CMakeLists.txt │ │ │ ├── README.md │ │ │ ├── binary_format.cpp │ │ │ ├── binary_format.hpp │ │ │ ├── codegen │ │ │ │ ├── bank_conflict_allocation.cpp │ │ │ │ ├── bank_conflict_allocation.hpp │ │ │ │ ├── codegen.cpp │ │ │ │ ├── codegen.hpp │ │ │ │ ├── kernel.hpp │ │ │ │ ├── ngen_helpers.hpp │ │ │ │ ├── operand.cpp │ │ │ │ ├── operand.hpp │ │ │ │ ├── reduce.hpp │ │ │ │ ├── reg_buf.hpp │ │ │ │ ├── register_allocator.hpp │ │ │ │ ├── register_scope.hpp │ │ │ │ ├── reorder.hpp │ │ │ │ └── send.hpp │ 
│ │ ├── config │ │ │ │ ├── gemmstone_config.cpp │ │ │ │ ├── gemmstone_config.hpp │ │ │ │ └── ngen_config.hpp │ │ │ ├── conv │ │ │ │ ├── README.md │ │ │ │ ├── config.cpp │ │ │ │ ├── config.hpp │ │ │ │ ├── conv_kernel.hpp │ │ │ │ ├── gen_convolution.cpp │ │ │ │ ├── gen_convolution.hpp │ │ │ │ ├── grf_usage.cpp │ │ │ │ ├── grf_usage.hpp │ │ │ │ ├── ir_builder.cpp │ │ │ │ ├── ir_builder.hpp │ │ │ │ ├── key.cpp │ │ │ │ ├── key.hpp │ │ │ │ ├── lookup_table.cpp │ │ │ │ ├── lookup_table.hpp │ │ │ │ ├── lookup_table_data.cpp │ │ │ │ ├── message_patterns.hpp │ │ │ │ ├── model.hpp │ │ │ │ ├── model_bridge.cpp │ │ │ │ ├── model_bridge.hpp │ │ │ │ ├── model_data.hpp │ │ │ │ ├── model_xehpc_common_data.cpp │ │ │ │ ├── model_xehpc_dw_data.cpp │ │ │ │ ├── model_xehpg_common_data.cpp │ │ │ │ ├── model_xehpg_dw_data.cpp │ │ │ │ ├── normalization.cpp │ │ │ │ ├── normalization.hpp │ │ │ │ ├── pipeline.cpp │ │ │ │ ├── pipeline.hpp │ │ │ │ ├── plan.cpp │ │ │ │ ├── plan.hpp │ │ │ │ ├── plan_utils.hpp │ │ │ │ ├── problem.cpp │ │ │ │ ├── problem.hpp │ │ │ │ ├── tiler.cpp │ │ │ │ ├── tiler.hpp │ │ │ │ ├── zero_out.cpp │ │ │ │ ├── zero_out.hpp │ │ │ │ ├── zp_plan.cpp │ │ │ │ └── zp_plan.hpp │ │ │ ├── eltwise_injector.cpp │ │ │ ├── eltwise_injector.hpp │ │ │ ├── emulated_generator.cpp │ │ │ ├── emulated_generator.hpp │ │ │ ├── gemm │ │ │ │ ├── .clang-tidy │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── gemm_walk_orders.hpp │ │ │ │ ├── gen_gemm.cpp │ │ │ │ ├── gen_gemm.hpp │ │ │ │ ├── gen_gemm_kernel.cpp │ │ │ │ ├── gen_gemm_kernel.hpp │ │ │ │ ├── gen_gemm_kernel_db.cpp │ │ │ │ ├── gen_gemm_kernel_db.hpp │ │ │ │ ├── generator │ │ │ │ │ ├── .clang-format │ │ │ │ │ ├── generator.cpp │ │ │ │ │ ├── microkernel_provider.cpp │ │ │ │ │ ├── pieces │ │ │ │ │ │ ├── address_setup.cxx │ │ │ │ │ │ ├── alloc_utils.cpp │ │ │ │ │ │ ├── alloc_utils.hpp │ │ │ │ │ │ ├── allocators.cpp │ │ │ │ │ │ ├── allocators.hpp │ │ │ │ │ │ ├── asm_helpers.cxx │ │ │ │ │ │ ├── atomic_fusions.cxx │ │ │ │ │ │ ├── atomic_fusions.hpp │ │ │ 
│ │ │ ├── c_update.cxx │ │ │ │ │ │ ├── common.cxx │ │ │ │ │ │ ├── compute_utils.hpp │ │ │ │ │ │ ├── cooperative_split.cpp │ │ │ │ │ │ ├── cooperative_split.hpp │ │ │ │ │ │ ├── copy.cxx │ │ │ │ │ │ ├── copy_plan.cpp │ │ │ │ │ │ ├── copy_plan.hpp │ │ │ │ │ │ ├── driver_info.cxx │ │ │ │ │ │ ├── emulation.cxx │ │ │ │ │ │ ├── gemm.cxx │ │ │ │ │ │ ├── gemm_microkernel.cxx │ │ │ │ │ │ ├── gemm_setup.cxx │ │ │ │ │ │ ├── grf_multirange.hpp │ │ │ │ │ │ ├── hw_template_instantiations.cxx │ │ │ │ │ │ ├── hw_utils.hpp │ │ │ │ │ │ ├── invert.hpp │ │ │ │ │ │ ├── k_loop.cxx │ │ │ │ │ │ ├── k_loop_setup.cxx │ │ │ │ │ │ ├── kernel_queries.cpp │ │ │ │ │ │ ├── kernel_queries.hpp │ │ │ │ │ │ ├── l3_prefetch.cxx │ │ │ │ │ │ ├── layout_setup.cxx │ │ │ │ │ │ ├── layout_utils.cpp │ │ │ │ │ │ ├── layout_utils.hpp │ │ │ │ │ │ ├── loop_sequencer.cpp │ │ │ │ │ │ ├── loop_sequencer.hpp │ │ │ │ │ │ ├── map.hpp │ │ │ │ │ │ ├── masks.cxx │ │ │ │ │ │ ├── math_helpers.cxx │ │ │ │ │ │ ├── matrix_access.cxx │ │ │ │ │ │ ├── matrix_multiply.cxx │ │ │ │ │ │ ├── monolithic_k_loop_dpasw.cxx │ │ │ │ │ │ ├── ngen_object_helpers.cpp │ │ │ │ │ │ ├── ngen_object_helpers.hpp │ │ │ │ │ │ ├── post_ops.cxx │ │ │ │ │ │ ├── problem_utils.cpp │ │ │ │ │ │ ├── quantization.cpp │ │ │ │ │ │ ├── quantization.cxx │ │ │ │ │ │ ├── quantization.hpp │ │ │ │ │ │ ├── register_allocation.cxx │ │ │ │ │ │ ├── register_layout.cpp │ │ │ │ │ │ ├── register_layout.hpp │ │ │ │ │ │ ├── remask.cpp │ │ │ │ │ │ ├── remask.cxx │ │ │ │ │ │ ├── remask.hpp │ │ │ │ │ │ ├── row_column_sums.cxx │ │ │ │ │ │ ├── state.cpp │ │ │ │ │ │ ├── state.hpp │ │ │ │ │ │ ├── state_debug.cpp │ │ │ │ │ │ ├── state_utils.cpp │ │ │ │ │ │ ├── state_utils.cxx │ │ │ │ │ │ ├── state_utils.hpp │ │ │ │ │ │ ├── stream_k.cxx │ │ │ │ │ │ ├── tlb_warmup.cxx │ │ │ │ │ │ ├── token_alloc_utils.cpp │ │ │ │ │ │ ├── token_alloc_utils.hpp │ │ │ │ │ │ └── walk_orders.cxx │ │ │ │ │ ├── strategy.cpp │ │ │ │ │ └── strategy_parser.cpp │ │ │ │ ├── include │ │ │ │ │ ├── .clang-format │ │ │ 
│ │ ├── gemmstone │ │ │ │ │ │ ├── config.hpp │ │ │ │ │ │ ├── driver_info.hpp │ │ │ │ │ │ ├── generator.hpp │ │ │ │ │ │ ├── kernel_catalog.hpp │ │ │ │ │ │ ├── kernel_evaluator.hpp │ │ │ │ │ │ ├── kernel_selector.hpp │ │ │ │ │ │ ├── microkernel_provider.hpp │ │ │ │ │ │ ├── problem.hpp │ │ │ │ │ │ ├── strategy.hpp │ │ │ │ │ │ ├── strategy_parser.hpp │ │ │ │ │ │ └── type.hpp │ │ │ │ │ └── internal │ │ │ │ │ │ ├── generator_inline.hxx │ │ │ │ │ │ ├── namespace_end.hxx │ │ │ │ │ │ ├── namespace_start.hxx │ │ │ │ │ │ ├── ngen_includes.hpp │ │ │ │ │ │ └── utils.hpp │ │ │ │ ├── jit_gemm_pd.cpp │ │ │ │ ├── jit_gemm_pd.hpp │ │ │ │ ├── selector │ │ │ │ │ ├── .clang-format │ │ │ │ │ ├── db │ │ │ │ │ │ ├── kernel.db │ │ │ │ │ │ ├── ukernel_lmr.db │ │ │ │ │ │ ├── ukernel_mlr.db │ │ │ │ │ │ └── ukernel_mmr.db │ │ │ │ │ ├── kernel_evaluator.cpp │ │ │ │ │ └── kernel_selector.cpp │ │ │ │ ├── xe_hp_systolic_gemm.cpp │ │ │ │ └── xe_hp_systolic_gemm.hpp │ │ │ ├── generator.cpp │ │ │ ├── generator.hpp │ │ │ ├── generator_base.hpp │ │ │ ├── ir │ │ │ │ ├── README.md │ │ │ │ ├── block_2d_utils.hpp │ │ │ │ ├── blocking.cpp │ │ │ │ ├── blocking.hpp │ │ │ │ ├── config.hpp │ │ │ │ ├── core.cpp │ │ │ │ ├── core.hpp │ │ │ │ ├── eltwise.hpp │ │ │ │ ├── epilogue.cpp │ │ │ │ ├── epilogue.hpp │ │ │ │ ├── fma.cpp │ │ │ │ ├── fma.hpp │ │ │ │ ├── gemm_schedule.cpp │ │ │ │ ├── gemm_schedule.hpp │ │ │ │ ├── grf_permutation.hpp │ │ │ │ ├── hw.hpp │ │ │ │ ├── ir.cpp │ │ │ │ ├── ir.hpp │ │ │ │ ├── ir_builder.cpp │ │ │ │ ├── ir_builder.hpp │ │ │ │ ├── kernel_desc.hpp │ │ │ │ ├── kernel_info.hpp │ │ │ │ ├── linear_expr.cpp │ │ │ │ ├── linear_expr.hpp │ │ │ │ ├── message.cpp │ │ │ │ ├── message.hpp │ │ │ │ ├── message_patterns.hpp │ │ │ │ ├── post_ops.cpp │ │ │ │ ├── post_ops.hpp │ │ │ │ ├── primitive_plan.cpp │ │ │ │ ├── primitive_plan.hpp │ │ │ │ ├── problem.cpp │ │ │ │ ├── problem.hpp │ │ │ │ ├── reduce.cpp │ │ │ │ ├── reduce.hpp │ │ │ │ ├── reorder.hpp │ │ │ │ ├── send_plan.cpp │ │ │ │ ├── send_plan.hpp │ │ 
│ │ ├── slm_reduce_builder.cpp │ │ │ │ ├── slm_reduce_builder.hpp │ │ │ │ ├── tensor.cpp │ │ │ │ ├── tensor.hpp │ │ │ │ ├── tensor_config.cpp │ │ │ │ ├── tensor_config.hpp │ │ │ │ └── walk_order.hpp │ │ │ ├── pass │ │ │ │ ├── alloc.cpp │ │ │ │ ├── alloc.hpp │ │ │ │ ├── bank_conflict.cpp │ │ │ │ ├── bank_conflict.hpp │ │ │ │ ├── barrier.cpp │ │ │ │ ├── barrier.hpp │ │ │ │ ├── cse.cpp │ │ │ │ ├── cse.hpp │ │ │ │ ├── dp4a.cpp │ │ │ │ ├── dp4a.hpp │ │ │ │ ├── dpas.cpp │ │ │ │ ├── dpas.hpp │ │ │ │ ├── dpasw.cpp │ │ │ │ ├── dpasw.hpp │ │ │ │ ├── expr_scalarizer.hpp │ │ │ │ ├── hoist.cpp │ │ │ │ ├── hoist.hpp │ │ │ │ ├── overflow.cpp │ │ │ │ ├── overflow.hpp │ │ │ │ ├── pass.cpp │ │ │ │ ├── pass.hpp │ │ │ │ ├── peephole.cpp │ │ │ │ ├── peephole.hpp │ │ │ │ ├── send.cpp │ │ │ │ ├── send.hpp │ │ │ │ ├── shuffle_splitter.cpp │ │ │ │ ├── shuffle_splitter.hpp │ │ │ │ ├── simplify.cpp │ │ │ │ ├── simplify.hpp │ │ │ │ ├── slm.cpp │ │ │ │ ├── slm.hpp │ │ │ │ ├── strength_reduce.cpp │ │ │ │ ├── strength_reduce.hpp │ │ │ │ ├── unroll.cpp │ │ │ │ └── unroll.hpp │ │ │ ├── pooling │ │ │ │ ├── config.hpp │ │ │ │ ├── gen_pooling.cpp │ │ │ │ ├── gen_pooling.hpp │ │ │ │ ├── ir_builder.cpp │ │ │ │ ├── ir_builder.hpp │ │ │ │ └── pooling_kernel.hpp │ │ │ ├── post_op_injector.cpp │ │ │ ├── post_op_injector.hpp │ │ │ ├── reduction.cpp │ │ │ ├── reduction.hpp │ │ │ ├── reduction_generator.hpp │ │ │ ├── reduction_injector.cpp │ │ │ ├── reduction_injector.hpp │ │ │ ├── reorder │ │ │ │ ├── config.cpp │ │ │ │ ├── config.hpp │ │ │ │ ├── gen_reorder.cpp │ │ │ │ ├── gen_reorder.hpp │ │ │ │ ├── ir_builder.cpp │ │ │ │ ├── ir_builder.hpp │ │ │ │ ├── normalization.cpp │ │ │ │ ├── normalization.hpp │ │ │ │ ├── reorder_kernel.hpp │ │ │ │ ├── tiler.cpp │ │ │ │ └── tiler.hpp │ │ │ ├── utils │ │ │ │ ├── iterator.hpp │ │ │ │ ├── ngen_type_bridge.hpp │ │ │ │ ├── range.hpp │ │ │ │ ├── trace.cpp │ │ │ │ ├── trace.hpp │ │ │ │ ├── utils.cpp │ │ │ │ └── utils.hpp │ │ │ └── v2 │ │ │ │ ├── conv │ │ │ │ ├── README.md │ 
│ │ │ ├── bench_data.cpp │ │ │ │ ├── bench_data.hpp │ │ │ │ ├── bridge.hpp │ │ │ │ ├── builder.cpp │ │ │ │ ├── builder.hpp │ │ │ │ ├── debug.cpp │ │ │ │ ├── debug.hpp │ │ │ │ ├── gen_convolution.cpp │ │ │ │ ├── gen_convolution.hpp │ │ │ │ ├── kernel.hpp │ │ │ │ ├── kernel_desc.cpp │ │ │ │ ├── kernel_desc.hpp │ │ │ │ ├── kernel_desc_2d_reqs.cpp │ │ │ │ ├── model.cpp │ │ │ │ ├── model.hpp │ │ │ │ ├── plan.cpp │ │ │ │ ├── plan.hpp │ │ │ │ ├── plan_registry.cpp │ │ │ │ ├── plan_registry.hpp │ │ │ │ ├── plan_registry_data.cpp │ │ │ │ ├── planner │ │ │ │ │ ├── CMakeLists.txt │ │ │ │ │ ├── bench.cpp │ │ │ │ │ ├── bench.hpp │ │ │ │ │ ├── model_fit.cpp │ │ │ │ │ ├── model_fit.hpp │ │ │ │ │ ├── planner.cpp │ │ │ │ │ ├── planner.hpp │ │ │ │ │ ├── planner_main.cpp │ │ │ │ │ ├── search.cpp │ │ │ │ │ └── search.hpp │ │ │ │ ├── problem.cpp │ │ │ │ ├── problem.hpp │ │ │ │ ├── tensor_utils.cpp │ │ │ │ └── tensor_utils.hpp │ │ │ │ └── ir │ │ │ │ ├── bridge.hpp │ │ │ │ ├── builder.cpp │ │ │ │ ├── builder.hpp │ │ │ │ ├── plan.hpp │ │ │ │ ├── plan_utils.hpp │ │ │ │ ├── reqs.cpp │ │ │ │ ├── reqs.hpp │ │ │ │ ├── send.cpp │ │ │ │ ├── send.hpp │ │ │ │ ├── tensor.cpp │ │ │ │ └── tensor.hpp │ │ ├── kernel_cache.cpp │ │ ├── kernel_cache.hpp │ │ ├── logging.hpp │ │ ├── microkernels │ │ │ ├── .clang-tidy │ │ │ ├── CMakeLists.txt │ │ │ ├── elf.hpp │ │ │ ├── entrance_agent.cpp │ │ │ ├── entrance_agent.hpp │ │ │ ├── fuser.cpp │ │ │ ├── fuser.hpp │ │ │ ├── internal_utilities.hpp │ │ │ ├── package.hpp │ │ │ ├── protocol.cpp │ │ │ ├── protocol.hpp │ │ │ ├── shim.cpp │ │ │ └── shim.hpp │ │ ├── ocl │ │ │ ├── CMakeLists.txt │ │ │ ├── binary_common.h │ │ │ ├── bnorm │ │ │ │ ├── lookup_table.cpp │ │ │ │ ├── lookup_table.hpp │ │ │ │ ├── model.cpp │ │ │ │ ├── model.hpp │ │ │ │ ├── nhwc_batch_normalization.cpp │ │ │ │ ├── nhwc_batch_normalization.hpp │ │ │ │ ├── nhwc_reusable.cl │ │ │ │ ├── nhwc_reusable.cpp │ │ │ │ ├── nhwc_reusable.h │ │ │ │ ├── nhwc_reusable.hpp │ │ │ │ ├── ref_batch_normalization.cpp │ │ 
│ │ ├── ref_batch_normalization.hpp │ │ │ │ ├── ref_bnorm.cl │ │ │ │ ├── reusable_bnorm.cl │ │ │ │ ├── reusable_bnorm.cpp │ │ │ │ ├── reusable_bnorm.hpp │ │ │ │ ├── simple_bnorm.cl │ │ │ │ ├── simple_bnorm.cpp │ │ │ │ ├── simple_bnorm.hpp │ │ │ │ ├── utils.cpp │ │ │ │ ├── utils.hpp │ │ │ │ ├── xe_batch_normalization.cpp │ │ │ │ ├── xe_batch_normalization.hpp │ │ │ │ ├── xe_bnorm.h │ │ │ │ ├── xe_bnorm_bwd.cl │ │ │ │ ├── xe_bnorm_fwd.cl │ │ │ │ ├── xe_bnorm_nhwc_bwd.cl │ │ │ │ ├── xe_bnorm_nhwc_fwd.cl │ │ │ │ └── xe_bnorm_reduce.h │ │ │ ├── concat_common.h │ │ │ ├── concat_utils.hpp │ │ │ ├── convolution_deconvolution.hpp │ │ │ ├── convolution_inner_product.cpp │ │ │ ├── convolution_inner_product.hpp │ │ │ ├── custom_reorder.cl │ │ │ ├── custom_reorder.cpp │ │ │ ├── custom_reorder.hpp │ │ │ ├── deconv_backward_bias.cl │ │ │ ├── device_info.cpp │ │ │ ├── device_info.hpp │ │ │ ├── dispatch.h │ │ │ ├── engine.cpp │ │ │ ├── engine.hpp │ │ │ ├── gemm │ │ │ │ ├── conv_gemm.hpp │ │ │ │ ├── gemm_with_post_ops.cl │ │ │ │ ├── gemm_with_post_ops.cpp │ │ │ │ ├── gemm_with_post_ops.hpp │ │ │ │ ├── ocl_gemm_attrs.h │ │ │ │ ├── ref_gemm.cl │ │ │ │ ├── ref_gemm.cpp │ │ │ │ ├── ref_gemm.hpp │ │ │ │ ├── xe_hp_systolic_gemm_copy.cl │ │ │ │ ├── xe_hpc_systolic_gemm_copy.cl │ │ │ │ └── xe_systolic_gemm_copy_kernel.hpp │ │ │ ├── gemm_inner_product.cpp │ │ │ ├── gemm_inner_product.hpp │ │ │ ├── gemm_matmul.cpp │ │ │ ├── gemm_matmul.hpp │ │ │ ├── generic_reorder.cl │ │ │ ├── generic_reorder.cpp │ │ │ ├── generic_reorder.hpp │ │ │ ├── graph │ │ │ │ └── gen_index.cl │ │ │ ├── hw_info.cpp │ │ │ ├── hw_info.hpp │ │ │ ├── kernel.cpp │ │ │ ├── kernel.hpp │ │ │ ├── layer_norm_common.h │ │ │ ├── lnorm_utils.hpp │ │ │ ├── many_inputs_sum.cl │ │ │ ├── many_inputs_sum.cpp │ │ │ ├── many_inputs_sum.hpp │ │ │ ├── mdapi_utils.cpp │ │ │ ├── mdapi_utils.hpp │ │ │ ├── micro_sdpa.cl │ │ │ ├── micro_sdpa.cpp │ │ │ ├── micro_sdpa.hpp │ │ │ ├── micro_sdpa_configs.cpp │ │ │ ├── micro_sdpa_configs.hpp │ │ │ ├── 
multi_concat.hpp │ │ │ ├── multi_po_reorder_binary.hpp │ │ │ ├── multi_po_reorder_sum.hpp │ │ │ ├── ocl_conversion.h │ │ │ ├── ocl_custom_types.h │ │ │ ├── ocl_eltwise.h │ │ │ ├── ocl_generic_vector_ops.h │ │ │ ├── ocl_io.h │ │ │ ├── ocl_kernel_list.cpp.in │ │ │ ├── ocl_math_utils.h │ │ │ ├── ocl_overrides.md │ │ │ ├── ocl_philox.h │ │ │ ├── ocl_post_ops.h │ │ │ ├── ocl_scales.h │ │ │ ├── ocl_types.h │ │ │ ├── ocl_types_specific.h │ │ │ ├── ocl_utils.h │ │ │ ├── offsets.h │ │ │ ├── reduction │ │ │ │ ├── atomic_reduction.cl │ │ │ │ ├── atomic_reduction.cpp │ │ │ │ ├── atomic_reduction.hpp │ │ │ │ ├── combined_reduction.cl │ │ │ │ ├── combined_reduction.cpp │ │ │ │ ├── combined_reduction.hpp │ │ │ │ ├── ocl_reduction.h │ │ │ │ ├── ref_reduction.cl │ │ │ │ ├── ref_reduction.cpp │ │ │ │ ├── ref_reduction.hpp │ │ │ │ ├── reusable_ref_reduction.cl │ │ │ │ ├── reusable_ref_reduction.cpp │ │ │ │ ├── reusable_ref_reduction.hpp │ │ │ │ ├── utils.cpp │ │ │ │ └── utils.hpp │ │ │ ├── ref_convolution.cl │ │ │ ├── ref_convolution.cpp │ │ │ ├── ref_convolution.hpp │ │ │ ├── ref_eltwise.cl │ │ │ ├── ref_eltwise.cpp │ │ │ ├── ref_eltwise.hpp │ │ │ ├── ref_group_normalization.cl │ │ │ ├── ref_group_normalization.cpp │ │ │ ├── ref_group_normalization.hpp │ │ │ ├── ref_inner_product.cl │ │ │ ├── ref_inner_product.cpp │ │ │ ├── ref_inner_product.hpp │ │ │ ├── ref_layer_normalization.cl │ │ │ ├── ref_layer_normalization.cpp │ │ │ ├── ref_layer_normalization.hpp │ │ │ ├── ref_lrn.cl │ │ │ ├── ref_lrn.cpp │ │ │ ├── ref_lrn.hpp │ │ │ ├── ref_matmul.cl │ │ │ ├── ref_matmul.cpp │ │ │ ├── ref_matmul.hpp │ │ │ ├── ref_pooling.cl │ │ │ ├── ref_pooling.cpp │ │ │ ├── ref_pooling.hpp │ │ │ ├── ref_prelu.cl │ │ │ ├── ref_prelu.cpp │ │ │ ├── ref_prelu.hpp │ │ │ ├── ref_reorder.cl │ │ │ ├── ref_reorder.cpp │ │ │ ├── ref_reorder.hpp │ │ │ ├── ref_resampling.cl │ │ │ ├── ref_resampling.cpp │ │ │ ├── ref_resampling.hpp │ │ │ ├── ref_sdpa.cl │ │ │ ├── ref_sdpa.cpp │ │ │ ├── ref_sdpa.hpp │ │ │ ├── 
ref_shuffle.cl │ │ │ ├── ref_shuffle.cpp │ │ │ ├── ref_shuffle.hpp │ │ │ ├── ref_sparse_matmul.cl │ │ │ ├── ref_sparse_matmul.cpp │ │ │ ├── ref_sparse_matmul.hpp │ │ │ ├── reorder_common.h │ │ │ ├── reusable_lnorm.cl │ │ │ ├── reusable_lnorm.cpp │ │ │ ├── reusable_lnorm.hpp │ │ │ ├── reusable_simple_concat.cl │ │ │ ├── reusable_simple_concat.cpp │ │ │ ├── reusable_simple_concat.hpp │ │ │ ├── reusable_softmax.cl │ │ │ ├── reusable_softmax.cpp │ │ │ ├── reusable_softmax.hpp │ │ │ ├── reusable_vectorized_lnorm.cl │ │ │ ├── reusable_vectorized_lnorm.cpp │ │ │ ├── reusable_vectorized_lnorm.hpp │ │ │ ├── rnn │ │ │ │ ├── cell_common.cpp │ │ │ │ ├── cell_compute.h │ │ │ │ ├── cell_gru.cpp │ │ │ │ ├── cell_gru_lbr.cpp │ │ │ │ ├── cell_kind_utility.h │ │ │ │ ├── common.h │ │ │ │ ├── grid.cl │ │ │ │ ├── grid.cpp │ │ │ │ ├── grid.hpp │ │ │ │ ├── reorders.cpp │ │ │ │ ├── reorders.hpp │ │ │ │ ├── rnn_reorder.cl │ │ │ │ ├── simple_cell_fusion.cpp │ │ │ │ ├── simple_cell_fusion.hpp │ │ │ │ ├── simple_postgemm.cpp │ │ │ │ ├── utils.cpp │ │ │ │ └── utils.hpp │ │ │ ├── sdpa_utils.h │ │ │ ├── shuffle_by_reorder.hpp │ │ │ ├── simple_binary.cl │ │ │ ├── simple_binary.cpp │ │ │ ├── simple_binary.hpp │ │ │ ├── simple_layer_normalization.cl │ │ │ ├── simple_layer_normalization.cpp │ │ │ ├── simple_layer_normalization.hpp │ │ │ ├── simple_softmax.cl │ │ │ ├── simple_softmax.cpp │ │ │ ├── simple_softmax.h │ │ │ ├── simple_softmax.hpp │ │ │ ├── simple_sum.cl │ │ │ ├── simple_sum.cpp │ │ │ ├── simple_sum.hpp │ │ │ ├── simple_zero_pad.cl │ │ │ ├── simple_zero_pad.cpp │ │ │ ├── simple_zero_pad.hpp │ │ │ ├── stream.cpp │ │ │ ├── stream.hpp │ │ │ ├── subbyte_pack.cl │ │ │ ├── tile_ops.h │ │ │ ├── types_interop.h │ │ │ ├── types_interop.hpp │ │ │ ├── usm_utils.cpp │ │ │ ├── usm_utils.hpp │ │ │ ├── utils.cpp │ │ │ ├── utils.hpp │ │ │ ├── vectorized_lnorm.cl │ │ │ ├── vectorized_lnorm.cpp │ │ │ ├── vectorized_lnorm.hpp │ │ │ ├── vectorized_lnorm_fused.cl │ │ │ ├── vectorized_resampling.cl │ │ │ ├── 
vectorized_resampling.cpp │ │ │ ├── vectorized_resampling.hpp │ │ │ ├── xe_binary.cl │ │ │ ├── xe_binary.cpp │ │ │ ├── xe_binary.hpp │ │ │ ├── xe_concat.cl │ │ │ ├── xe_concat.cpp │ │ │ ├── xe_concat.hpp │ │ │ ├── xe_eltwise.cl │ │ │ ├── xe_eltwise.cpp │ │ │ ├── xe_eltwise.hpp │ │ │ ├── xe_global_pooling.cl │ │ │ ├── xe_global_pooling.cpp │ │ │ ├── xe_global_pooling.hpp │ │ │ ├── xe_pooling.cl │ │ │ ├── xe_pooling.cpp │ │ │ ├── xe_pooling.hpp │ │ │ ├── xe_softmax.cl │ │ │ ├── xe_softmax.cpp │ │ │ ├── xe_softmax.hpp │ │ │ ├── xe_sum.cl │ │ │ ├── xe_sum.cpp │ │ │ ├── xe_sum.hpp │ │ │ ├── xe_wino_conv_fwd_data_2x3.cl │ │ │ ├── xe_wino_conv_fwd_data_fused.cl │ │ │ ├── xe_wino_convolution.cpp │ │ │ └── xe_wino_convolution.hpp │ │ ├── primitive_conf.cpp │ │ ├── primitive_conf.hpp │ │ ├── sycl │ │ │ ├── CMakeLists.txt │ │ │ ├── compat.cpp │ │ │ ├── compat.hpp │ │ │ ├── device_info.cpp │ │ │ ├── device_info.hpp │ │ │ ├── engine.cpp │ │ │ ├── engine.hpp │ │ │ ├── interop_kernel.cpp │ │ │ ├── interop_kernel.hpp │ │ │ ├── l0 │ │ │ │ ├── utils.cpp │ │ │ │ └── utils.hpp │ │ │ ├── stream.cpp │ │ │ ├── stream.hpp │ │ │ ├── utils.cpp │ │ │ └── utils.hpp │ │ ├── utils.cpp │ │ └── utils.hpp │ └── nvidia │ │ ├── CMakeLists.txt │ │ ├── README.md │ │ ├── cudnn_batch_normalization.cpp │ │ ├── cudnn_batch_normalization.hpp │ │ ├── cudnn_batch_normalization_executor.hpp │ │ ├── cudnn_batch_normalization_impl.hpp │ │ ├── cudnn_binary.cpp │ │ ├── cudnn_binary.hpp │ │ ├── cudnn_binary_impl.hpp │ │ ├── cudnn_conv_filter_adjustment_base.hpp │ │ ├── cudnn_conv_inner_product.hpp │ │ ├── cudnn_conv_inner_product_impl.hpp │ │ ├── cudnn_convolution.cpp │ │ ├── cudnn_convolution.hpp │ │ ├── cudnn_convolution_impl.hpp │ │ ├── cudnn_convolution_pd.hpp │ │ ├── cudnn_deconvolution.cpp │ │ ├── cudnn_deconvolution.hpp │ │ ├── cudnn_deconvolution_impl.hpp │ │ ├── cudnn_eltwise.cpp │ │ ├── cudnn_eltwise.hpp │ │ ├── cudnn_eltwise_impl.hpp │ │ ├── cudnn_gemm_inner_product.hpp │ │ ├── 
cudnn_gemm_inner_product_impl.hpp │ │ ├── cudnn_inner_product.cpp │ │ ├── cudnn_inner_product.hpp │ │ ├── cudnn_inner_product_impl.hpp │ │ ├── cudnn_lrn.cpp │ │ ├── cudnn_lrn.hpp │ │ ├── cudnn_lrn_impl.hpp │ │ ├── cudnn_matmul.cpp │ │ ├── cudnn_matmul.hpp │ │ ├── cudnn_matmul_base_impl.hpp │ │ ├── cudnn_matmul_executor.hpp │ │ ├── cudnn_matmul_impl.hpp │ │ ├── cudnn_matmul_lt.hpp │ │ ├── cudnn_matmul_lt_impl.hpp │ │ ├── cudnn_pooling.cpp │ │ ├── cudnn_pooling.hpp │ │ ├── cudnn_pooling_impl.hpp │ │ ├── cudnn_reduction.cpp │ │ ├── cudnn_reduction.hpp │ │ ├── cudnn_reduction_impl.hpp │ │ ├── cudnn_reorder.cpp │ │ ├── cudnn_reorder.hpp │ │ ├── cudnn_reorder_impl.hpp │ │ ├── cudnn_reorder_lt.cpp │ │ ├── cudnn_reorder_lt.hpp │ │ ├── cudnn_reorder_lt_impl.hpp │ │ ├── cudnn_softmax.cpp │ │ ├── cudnn_softmax.hpp │ │ ├── cudnn_softmax_impl.hpp │ │ ├── cudnn_sum.hpp │ │ ├── engine.cpp │ │ ├── engine.hpp │ │ ├── stream.cpp │ │ ├── stream.hpp │ │ ├── sycl_cuda_compat.cpp │ │ ├── sycl_cuda_compat.hpp │ │ ├── sycl_cuda_scoped_context.cpp │ │ ├── sycl_cuda_scoped_context.hpp │ │ ├── sycl_cuda_stream_utils.hpp │ │ ├── sycl_cuda_utils.cpp │ │ └── sycl_cuda_utils.hpp ├── graph │ ├── CMakeLists.txt │ ├── backend │ │ ├── CMakeLists.txt │ │ ├── dnnl │ │ │ ├── CMakeLists.txt │ │ │ ├── common.cpp │ │ │ ├── common.hpp │ │ │ ├── dnnl_backend.cpp │ │ │ ├── dnnl_backend.hpp │ │ │ ├── dnnl_constant_tensor_cache.hpp │ │ │ ├── dnnl_op_def.hpp │ │ │ ├── dnnl_opset.hpp │ │ │ ├── dnnl_partition_impl.cpp │ │ │ ├── dnnl_partition_impl.hpp │ │ │ ├── dnnl_shape_infer.cpp │ │ │ ├── dnnl_shape_infer.hpp │ │ │ ├── fusion_info.cpp │ │ │ ├── fusion_info.hpp │ │ │ ├── internal_attrs.hpp │ │ │ ├── internal_ops.hpp │ │ │ ├── kernels │ │ │ │ ├── batch_norm.cpp │ │ │ │ ├── batch_norm.hpp │ │ │ │ ├── binary.cpp │ │ │ │ ├── binary.hpp │ │ │ │ ├── concat.cpp │ │ │ │ ├── concat.hpp │ │ │ │ ├── conv.cpp │ │ │ │ ├── conv.hpp │ │ │ │ ├── conv_base.cpp │ │ │ │ ├── conv_base.hpp │ │ │ │ ├── conv_transpose.cpp │ │ │ │ ├── 
conv_transpose.hpp │ │ │ │ ├── dummy.cpp │ │ │ │ ├── dummy.hpp │ │ │ │ ├── eltwise.cpp │ │ │ │ ├── eltwise.hpp │ │ │ │ ├── gen_index.cpp │ │ │ │ ├── gen_index.hpp │ │ │ │ ├── group_norm.cpp │ │ │ │ ├── group_norm.hpp │ │ │ │ ├── kernel_base.cpp │ │ │ │ ├── kernel_base.hpp │ │ │ │ ├── kernels.hpp │ │ │ │ ├── large_partition.cpp │ │ │ │ ├── large_partition.hpp │ │ │ │ ├── layer_norm.cpp │ │ │ │ ├── layer_norm.hpp │ │ │ │ ├── log_softmax.cpp │ │ │ │ ├── log_softmax.hpp │ │ │ │ ├── matmul.cpp │ │ │ │ ├── matmul.hpp │ │ │ │ ├── mqa.hpp │ │ │ │ ├── mqa_decomp.cpp │ │ │ │ ├── mqa_decomp.hpp │ │ │ │ ├── mqa_decomp_config.cpp │ │ │ │ ├── mqa_decomp_config.hpp │ │ │ │ ├── pool.cpp │ │ │ │ ├── pool.hpp │ │ │ │ ├── prelu.cpp │ │ │ │ ├── prelu.hpp │ │ │ │ ├── quantize.cpp │ │ │ │ ├── quantize.hpp │ │ │ │ ├── reduction.cpp │ │ │ │ ├── reduction.hpp │ │ │ │ ├── reorder.cpp │ │ │ │ ├── reorder.hpp │ │ │ │ ├── resampling.cpp │ │ │ │ ├── resampling.hpp │ │ │ │ ├── sdp.hpp │ │ │ │ ├── sdp_decomp.cpp │ │ │ │ ├── sdp_decomp.hpp │ │ │ │ ├── sdp_decomp_config.cpp │ │ │ │ ├── sdp_decomp_config.hpp │ │ │ │ ├── sdp_primitive.cpp │ │ │ │ ├── sdp_primitive.hpp │ │ │ │ ├── sdp_primitive_config.cpp │ │ │ │ ├── sdp_primitive_config.hpp │ │ │ │ ├── sdp_primitive_v1.cpp │ │ │ │ ├── sdp_primitive_v1.hpp │ │ │ │ ├── shuffle.cpp │ │ │ │ ├── shuffle.hpp │ │ │ │ ├── softmax.cpp │ │ │ │ ├── softmax.hpp │ │ │ │ ├── sum.cpp │ │ │ │ └── sum.hpp │ │ │ ├── layout_id_mgr.cpp │ │ │ ├── layout_id_mgr.hpp │ │ │ ├── layout_propagator.cpp │ │ │ ├── layout_propagator.hpp │ │ │ ├── op_executable.cpp │ │ │ ├── op_executable.hpp │ │ │ ├── passes │ │ │ │ ├── compile_ops.cpp │ │ │ │ ├── compile_ops.hpp │ │ │ │ ├── constant_propagation.cpp │ │ │ │ ├── constant_propagation.hpp │ │ │ │ ├── insert_ops.cpp │ │ │ │ ├── insert_ops.hpp │ │ │ │ ├── layout_propagation.cpp │ │ │ │ ├── layout_propagation.hpp │ │ │ │ ├── lower.cpp │ │ │ │ ├── lower.hpp │ │ │ │ ├── memory_planning.cpp │ │ │ │ ├── memory_planning.hpp │ │ │ │ ├── 
transform.cpp │ │ │ │ ├── transform.hpp │ │ │ │ ├── utils.cpp │ │ │ │ └── utils.hpp │ │ │ ├── patterns │ │ │ │ ├── binary_fusion.cpp │ │ │ │ ├── bn_fusion.cpp │ │ │ │ ├── concat_fusion.cpp │ │ │ │ ├── conv_block_fusion.cpp │ │ │ │ ├── conv_post_ops.cpp │ │ │ │ ├── convtranspose_fusion.cpp │ │ │ │ ├── data_type_check_pass.hpp │ │ │ │ ├── eltwise_fusion.cpp │ │ │ │ ├── fusions.hpp │ │ │ │ ├── groupnorm_fusion.cpp │ │ │ │ ├── interpolate_fusion.cpp │ │ │ │ ├── layernorm_fusion.cpp │ │ │ │ ├── matmul_post_ops.cpp │ │ │ │ ├── mlp.cpp │ │ │ │ ├── pattern_matcher_pass.hpp │ │ │ │ ├── pool_post_ops.cpp │ │ │ │ ├── quantize_fusion.cpp │ │ │ │ ├── reduction_fusion.cpp │ │ │ │ ├── reorder_fusion.cpp │ │ │ │ ├── sdp.cpp │ │ │ │ ├── shuffle_fusion.cpp │ │ │ │ ├── single_op_pattern.cpp │ │ │ │ ├── softmax_post_ops.cpp │ │ │ │ ├── sum_fusion.cpp │ │ │ │ └── utils.hpp │ │ │ ├── platform.cpp │ │ │ ├── platform.hpp │ │ │ ├── scratchpad.hpp │ │ │ ├── subgraph.cpp │ │ │ ├── subgraph.hpp │ │ │ ├── thread_local_cache.hpp │ │ │ └── utils.hpp │ │ └── fake │ │ │ ├── CMakeLists.txt │ │ │ ├── fake_backend.cpp │ │ │ ├── fake_backend.hpp │ │ │ ├── fake_partition_impl.hpp │ │ │ ├── pattern_utils.hpp │ │ │ ├── single_op_pass.hpp │ │ │ └── transformation_pass.hpp │ ├── interface │ │ ├── CMakeLists.txt │ │ ├── allocator.cpp │ │ ├── allocator.hpp │ │ ├── backend.cpp │ │ ├── backend.hpp │ │ ├── c_types_map.hpp │ │ ├── constant_tensor_cache.cpp │ │ ├── constant_tensor_cache.hpp │ │ ├── graph.cpp │ │ ├── graph.hpp │ │ ├── graph_attr.hpp │ │ ├── logical_tensor.cpp │ │ ├── logical_tensor.hpp │ │ ├── op.cpp │ │ ├── op.hpp │ │ ├── op_def.hpp │ │ ├── op_def_constraint.cpp │ │ ├── op_def_constraint.hpp │ │ ├── op_schema.cpp │ │ ├── op_schema.hpp │ │ ├── opset.hpp │ │ ├── partition.cpp │ │ ├── partition.hpp │ │ ├── partition_cache.cpp │ │ ├── partition_cache.hpp │ │ ├── partition_hashing.cpp │ │ ├── partition_hashing.hpp │ │ ├── partition_impl.cpp │ │ ├── partition_impl.hpp │ │ ├── shape_infer.cpp │ │ ├── 
shape_infer.hpp │ │ ├── tensor.cpp │ │ ├── tensor.hpp │ │ ├── value.cpp │ │ └── value.hpp │ └── utils │ │ ├── CMakeLists.txt │ │ ├── alloc.cpp │ │ ├── alloc.hpp │ │ ├── any.hpp │ │ ├── attribute_value.hpp │ │ ├── debug.cpp │ │ ├── debug.hpp │ │ ├── id.cpp │ │ ├── id.hpp │ │ ├── json.hpp │ │ ├── ocl_check.hpp │ │ ├── ocl_usm_utils.cpp │ │ ├── ocl_usm_utils.hpp │ │ ├── pm │ │ ├── dag_check_pass.hpp │ │ ├── nested_matcher.cpp │ │ ├── nested_matcher.hpp │ │ ├── op_depth_check_pass.hpp │ │ ├── pass_base.cpp │ │ ├── pass_base.hpp │ │ ├── pass_manager.cpp │ │ ├── pass_manager.hpp │ │ ├── pbuilder.cpp │ │ └── pbuilder.hpp │ │ ├── utils.cpp │ │ ├── utils.hpp │ │ ├── verbose.cpp │ │ └── verbose.hpp └── xpu │ ├── CMakeLists.txt │ ├── README.md │ ├── context.hpp │ ├── ocl │ ├── CMakeLists.txt │ ├── buffer_memory_storage.cpp │ ├── buffer_memory_storage.hpp │ ├── c_types_map.hpp │ ├── capi │ │ ├── engine.cpp │ │ ├── memory.cpp │ │ ├── primitive.cpp │ │ └── stream.cpp │ ├── context.hpp │ ├── engine_factory.hpp │ ├── engine_id.hpp │ ├── engine_impl.cpp │ ├── engine_impl.hpp │ ├── memory_storage.hpp │ ├── memory_storage_base.hpp │ ├── stream_impl.cpp │ ├── stream_impl.hpp │ ├── stream_profiler.cpp │ ├── stream_profiler.hpp │ ├── usm_memory_storage.cpp │ ├── usm_memory_storage.hpp │ ├── usm_utils.cpp │ ├── usm_utils.hpp │ ├── utils.cpp │ ├── utils.hpp │ └── verbose.hpp │ ├── stream_profiler.hpp │ ├── sycl │ ├── CMakeLists.txt │ ├── buffer_memory_storage.cpp │ ├── buffer_memory_storage.hpp │ ├── c_types_map.hpp │ ├── capi │ │ ├── capi_engine.cpp │ │ ├── capi_memory.cpp │ │ ├── capi_primitive.cpp │ │ └── capi_stream.cpp │ ├── compat.cpp │ ├── compat.hpp │ ├── context.hpp │ ├── engine_factory.cpp │ ├── engine_factory.hpp │ ├── engine_id.hpp │ ├── engine_impl.cpp │ ├── engine_impl.hpp │ ├── memory_storage.hpp │ ├── memory_storage_base.cpp │ ├── memory_storage_base.hpp │ ├── memory_storage_helper.hpp │ ├── stream_impl.cpp │ ├── stream_impl.hpp │ ├── stream_profiler.cpp │ ├── 
stream_profiler.hpp │ ├── types.hpp │ ├── usm_memory_storage.cpp │ ├── usm_memory_storage.hpp │ ├── utils.cpp │ ├── utils.hpp │ └── verbose.hpp │ ├── utils.cpp │ └── utils.hpp ├── tests ├── CMakeLists.txt ├── api.c ├── benchdnn │ ├── CMakeLists.txt │ ├── README.md │ ├── benchdnn.cpp │ ├── binary │ │ ├── bench_binary.cpp │ │ ├── binary.cpp │ │ ├── binary.hpp │ │ ├── binary_aux.cpp │ │ └── ref_binary.cpp │ ├── bnorm │ │ ├── bench_bnorm.cpp │ │ ├── bnorm.cpp │ │ ├── bnorm.hpp │ │ ├── bnorm_aux.cpp │ │ └── ref_bnorm.cpp │ ├── brgemm │ │ ├── bench_brgemm.cpp │ │ ├── brgemm.cpp │ │ ├── brgemm.hpp │ │ ├── brgemm_aux.cpp │ │ ├── cfg.cpp │ │ └── ref_brgemm.cpp │ ├── common.cpp │ ├── common.hpp │ ├── concat │ │ ├── bench_concat.cpp │ │ ├── concat.cpp │ │ ├── concat.hpp │ │ ├── concat_aux.cpp │ │ └── ref_concat.cpp │ ├── conv │ │ ├── bench_conv.cpp │ │ ├── cfg.cpp │ │ ├── conv.cpp │ │ ├── conv.hpp │ │ ├── conv_aux.cpp │ │ ├── conv_dw_fusion.cpp │ │ ├── conv_dw_fusion.hpp │ │ ├── ref_conv.cpp │ │ ├── ref_conv.hpp │ │ └── ref_wino.cpp │ ├── deconv │ │ ├── bench_deconv.cpp │ │ ├── cfg.cpp │ │ ├── deconv.cpp │ │ ├── deconv.hpp │ │ ├── deconv_aux.cpp │ │ ├── ref_deconv.cpp │ │ ├── ref_deconv.hpp │ │ └── ref_wino.cpp │ ├── dnn_types.cpp │ ├── dnn_types.hpp │ ├── dnnl_common.cpp │ ├── dnnl_common.hpp │ ├── dnnl_debug.hpp │ ├── dnnl_debug_autogenerated.cpp │ ├── dnnl_memory.cpp │ ├── dnnl_memory.hpp │ ├── doc │ │ ├── benchdnn_general_info.md │ │ ├── benchdnn_input_files_naming_convention.md │ │ ├── driver_binary.md │ │ ├── driver_bnorm.md │ │ ├── driver_brgemm.md │ │ ├── driver_concat.md │ │ ├── driver_conv.md │ │ ├── driver_eltwise.md │ │ ├── driver_gnorm.md │ │ ├── driver_graph.md │ │ ├── driver_ip.md │ │ ├── driver_lnorm.md │ │ ├── driver_lrn.md │ │ ├── driver_matmul.md │ │ ├── driver_pool.md │ │ ├── driver_prelu.md │ │ ├── driver_reduction.md │ │ ├── driver_reorder.md │ │ ├── driver_resampling.md │ │ ├── driver_rnn.md │ │ ├── driver_shuffle.md │ │ ├── driver_softmax.md │ │ ├── 
driver_sum.md │ │ ├── driver_zeropad.md │ │ ├── knob_cold_cache.md │ │ ├── knob_impl_filter.md │ │ ├── knob_strides.md │ │ ├── knob_summary.md │ │ ├── knob_use_fast_ref.md │ │ ├── knobs_attr.md │ │ ├── knobs_common.md │ │ ├── knobs_desc.md │ │ ├── knobs_dir.md │ │ ├── knobs_dt.md │ │ ├── knobs_encoding.md │ │ ├── knobs_perf_report.md │ │ ├── knobs_tag.md │ │ └── knobs_verbose.md │ ├── eltwise │ │ ├── bench_eltwise.cpp │ │ ├── eltwise.cpp │ │ ├── eltwise.hpp │ │ ├── eltwise_aux.cpp │ │ └── ref_eltwise.cpp │ ├── gnorm │ │ ├── bench_gnorm.cpp │ │ ├── gnorm.cpp │ │ ├── gnorm.hpp │ │ ├── gnorm_aux.cpp │ │ └── ref_gnorm.cpp │ ├── graph │ │ ├── allocator.cpp │ │ ├── allocator.hpp │ │ ├── bench_graph.cpp │ │ ├── custom_driver.cpp │ │ ├── custom_driver.hpp │ │ ├── deserialize.cpp │ │ ├── deserialize.hpp │ │ ├── flex_rewrite.cpp │ │ ├── flex_rewrite.hpp │ │ ├── graph.cpp │ │ ├── graph.hpp │ │ ├── graph_memory.cpp │ │ ├── graph_memory.hpp │ │ ├── input_displacer.cpp │ │ ├── input_displacer.hpp │ │ ├── memory_pool.hpp │ │ ├── parser.cpp │ │ ├── parser.hpp │ │ ├── ref_partition.cpp │ │ ├── ref_partition.hpp │ │ ├── ref_primitive.cpp │ │ ├── ref_primitive.hpp │ │ ├── setting_handler.cpp │ │ ├── setting_handler.hpp │ │ ├── utils.cpp │ │ └── utils.hpp │ ├── inputs │ │ ├── binary │ │ │ ├── harness_binary_bf16 │ │ │ ├── harness_binary_different_dt │ │ │ ├── harness_binary_f16 │ │ │ ├── harness_binary_f32 │ │ │ ├── harness_binary_i8 │ │ │ ├── harness_binary_regression │ │ │ ├── option_set_all │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── option_set_minimal │ │ │ ├── option_set_src0_bcast │ │ │ ├── perf_binary_gpu │ │ │ ├── shapes_ci │ │ │ ├── shapes_perf_1st_conv │ │ │ ├── shapes_perf_scaleshift │ │ │ ├── test_binary_all │ │ │ ├── test_binary_bfloat16 │ │ │ ├── test_binary_ci │ │ │ ├── test_binary_different_dt_ci │ │ │ ├── test_binary_float16 │ │ │ ├── test_binary_gpu │ │ │ └── test_binary_smoke │ │ ├── bnorm │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── 
option_set_fwks_key_gpu │ │ │ ├── perf_bnorm_gpu │ │ │ ├── set_nd │ │ │ ├── shapes_1d │ │ │ ├── shapes_2d │ │ │ ├── shapes_3d │ │ │ ├── shapes_ci │ │ │ ├── shapes_densenet_121 │ │ │ ├── shapes_googlenet_v2 │ │ │ ├── shapes_googlenet_v3 │ │ │ ├── shapes_large │ │ │ ├── shapes_regressions │ │ │ ├── shapes_resnet_50 │ │ │ ├── shapes_topologies_small │ │ │ ├── test_bnorm_all_blocked │ │ │ ├── test_bnorm_all_plain │ │ │ ├── test_bnorm_bfloat16_blocked │ │ │ ├── test_bnorm_bfloat16_plain │ │ │ ├── test_bnorm_ci │ │ │ ├── test_bnorm_float16_plain │ │ │ ├── test_bnorm_gpu │ │ │ ├── test_bnorm_regressions │ │ │ ├── test_bnorm_regressions_large │ │ │ └── test_bnorm_smoke │ │ ├── brgemm │ │ │ ├── harness_brgemm_f32 │ │ │ ├── harness_brgemm_f8 │ │ │ ├── harness_brgemm_fpmath │ │ │ ├── option_set_bf16 │ │ │ ├── option_set_f32 │ │ │ ├── option_set_int8 │ │ │ ├── shapes_2d_big_k_bf16 │ │ │ ├── shapes_2d_big_k_f32 │ │ │ ├── shapes_2d_big_k_int8 │ │ │ ├── shapes_2d_big_k_tail_n_bf16 │ │ │ ├── shapes_2d_big_k_tail_n_f32 │ │ │ ├── shapes_2d_big_k_tail_n_int8 │ │ │ ├── shapes_2d_no_tail_bf16 │ │ │ ├── shapes_2d_no_tail_f32 │ │ │ ├── shapes_2d_no_tail_int8 │ │ │ ├── shapes_2d_tail_k_bf16 │ │ │ ├── shapes_2d_tail_k_f32 │ │ │ ├── shapes_2d_tail_k_int8 │ │ │ ├── shapes_2d_tail_k_tail_n_bf16 │ │ │ ├── shapes_2d_tail_k_tail_n_f32 │ │ │ ├── shapes_2d_tail_k_tail_n_int8 │ │ │ ├── shapes_2d_tail_n_bf16 │ │ │ ├── shapes_2d_tail_n_f32 │ │ │ ├── shapes_2d_tail_n_int8 │ │ │ ├── test_brgemm_all │ │ │ ├── test_brgemm_bf16 │ │ │ ├── test_brgemm_ci │ │ │ ├── test_brgemm_f16 │ │ │ ├── test_brgemm_f32 │ │ │ ├── test_brgemm_f8 │ │ │ ├── test_brgemm_int8 │ │ │ ├── test_brgemm_regression │ │ │ └── test_brgemm_smoke │ │ ├── concat │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── option_set_xe_gpu │ │ │ ├── test_concat_all │ │ │ ├── test_concat_bfloat16 │ │ │ ├── test_concat_ci │ │ │ ├── test_concat_float16 │ │ │ ├── test_concat_gpu │ │ │ ├── test_concat_large_gpu │ │ │ └── 
test_concat_smoke │ │ ├── conv │ │ │ ├── harness_conv_arbitrary_dst │ │ │ ├── harness_conv_attrs_gpu │ │ │ ├── harness_conv_attrs_int8 │ │ │ ├── harness_conv_attrs_int8_asymmetric │ │ │ ├── harness_conv_auto │ │ │ ├── harness_conv_deepbench │ │ │ ├── harness_conv_depthwise_int8 │ │ │ ├── harness_conv_dilated_3d │ │ │ ├── harness_conv_dilated_int8 │ │ │ ├── harness_conv_dw_bfloat16 │ │ │ ├── harness_conv_dw_bfloat16_nxc │ │ │ ├── harness_conv_dw_float16_nxc │ │ │ ├── harness_conv_dw_fp8_nxc │ │ │ ├── harness_conv_f32 │ │ │ ├── harness_conv_f32_plain │ │ │ ├── harness_conv_fused_depthwise │ │ │ ├── harness_conv_int8 │ │ │ ├── harness_conv_output_striding │ │ │ ├── harness_conv_regression_general │ │ │ ├── harness_conv_saturation_int8 │ │ │ ├── harness_conv_smoke_ref │ │ │ ├── harness_conv_tags │ │ │ ├── harness_conv_zero_points │ │ │ ├── option_gpu_ci │ │ │ ├── option_set_all_eltwise_postops │ │ │ ├── option_set_combined_postops │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── option_set_v2 │ │ │ ├── perf_conv_bdw_1sock │ │ │ ├── perf_conv_clx_1sock │ │ │ ├── perf_conv_skx_1sock │ │ │ ├── perf_conv_xe │ │ │ ├── perf_conv_xe_hp │ │ │ ├── perf_conv_xe_lp │ │ │ ├── set_all_topologies │ │ │ ├── set_conv_3d │ │ │ ├── set_conv_all │ │ │ ├── set_conv_dw │ │ │ ├── set_dilated-conv │ │ │ ├── set_dilated-conv_1st │ │ │ ├── set_dilated-conv_3d │ │ │ ├── set_fastrcnn │ │ │ ├── set_gpu │ │ │ ├── set_maskrcnn │ │ │ ├── set_perf_cpu_all_mb │ │ │ ├── set_perf_cpu_inference_only │ │ │ ├── set_perf_cpu_large_mb │ │ │ ├── set_perf_cpu_small_mb │ │ │ ├── set_perf_gpu_all_mb │ │ │ ├── set_perf_gpu_large_mb │ │ │ ├── set_perf_gpu_small_mb │ │ │ ├── set_topologies_inference_only │ │ │ ├── shapes_1d │ │ │ ├── shapes_1d_wavenet │ │ │ ├── shapes_1x1 │ │ │ ├── shapes_3d │ │ │ ├── shapes_3d_1st_strided_padding │ │ │ ├── shapes_3d_1x1_strided_no-padding │ │ │ ├── shapes_3d_1x1_strided_padding │ │ │ ├── shapes_3d_1x1_unit-stride_no-padding │ │ │ ├── 
shapes_3d_1x1_unit-stride_padding │ │ │ ├── shapes_3d_2d_strided_padding │ │ │ ├── shapes_3d_gpu │ │ │ ├── shapes_3d_i3d │ │ │ ├── shapes_3d_resnext101 │ │ │ ├── shapes_3d_strided_no-padding │ │ │ ├── shapes_3d_strided_padding │ │ │ ├── shapes_3d_unet │ │ │ ├── shapes_3d_unit-stride_no-padding │ │ │ ├── shapes_3d_unit-stride_padding │ │ │ ├── shapes_4bit │ │ │ ├── shapes_a3c │ │ │ ├── shapes_alexnet │ │ │ ├── shapes_auto │ │ │ ├── shapes_basic │ │ │ ├── shapes_ci_gpu │ │ │ ├── shapes_cosmictagger │ │ │ ├── shapes_deepbench_inference_device │ │ │ ├── shapes_deepbench_inference_server │ │ │ ├── shapes_deepbench_training │ │ │ ├── shapes_densnet │ │ │ ├── shapes_dilated │ │ │ ├── shapes_dilated_1d_1st_strided_padding │ │ │ ├── shapes_dilated_1d_strided_no-padding │ │ │ ├── shapes_dilated_1d_strided_padding │ │ │ ├── shapes_dilated_1d_unit-stride_no-padding │ │ │ ├── shapes_dilated_1d_unit-stride_padding │ │ │ ├── shapes_dilated_2d_1st_strided_padding │ │ │ ├── shapes_dilated_2d_strided_no-padding │ │ │ ├── shapes_dilated_2d_strided_padding │ │ │ ├── shapes_dilated_2d_unit-stride_no-padding │ │ │ ├── shapes_dilated_2d_unit-stride_padding │ │ │ ├── shapes_dilated_3d_strided_no-padding │ │ │ ├── shapes_dilated_3d_strided_padding │ │ │ ├── shapes_dilated_3d_unit-stride_no-padding │ │ │ ├── shapes_dilated_3d_unit-stride_padding │ │ │ ├── shapes_dilated_rfcn │ │ │ ├── shapes_dw_1d_stride_no-padding │ │ │ ├── shapes_dw_1d_unit-stride_no-padding │ │ │ ├── shapes_dw_1d_unit-stride_padding │ │ │ ├── shapes_dw_2d_1d_strided_padding │ │ │ ├── shapes_dw_2d_strided_no-padding │ │ │ ├── shapes_dw_2d_strided_padding │ │ │ ├── shapes_dw_2d_unit-stride_no-padding │ │ │ ├── shapes_dw_2d_unit-stride_padding │ │ │ ├── shapes_dw_3d_strided_no-padding │ │ │ ├── shapes_dw_3d_strided_padding │ │ │ ├── shapes_dw_3d_unit-stride_no-padding │ │ │ ├── shapes_dw_3d_unit-stride_padding │ │ │ ├── shapes_dw_minibatch_2d-spatial │ │ │ ├── shapes_dw_minibatch_channel_2d-spatial │ │ │ ├── 
shapes_efficientdet │ │ │ ├── shapes_fastrcnn_p1 │ │ │ ├── shapes_fastrcnn_p2 │ │ │ ├── shapes_fastrcnn_p3 │ │ │ ├── shapes_ffn │ │ │ ├── shapes_fused_large_src │ │ │ ├── shapes_fused_mobilenet_stride_1 │ │ │ ├── shapes_fused_mobilenet_stride_2 │ │ │ ├── shapes_gemm │ │ │ ├── shapes_googlenet_v1 │ │ │ ├── shapes_googlenet_v2 │ │ │ ├── shapes_googlenet_v3 │ │ │ ├── shapes_large_conv │ │ │ ├── shapes_large_padding │ │ │ ├── shapes_maskrcnn_p1 │ │ │ ├── shapes_maskrcnn_p2 │ │ │ ├── shapes_mem_strided │ │ │ ├── shapes_mobilenet │ │ │ ├── shapes_mobilenet_dw │ │ │ ├── shapes_movinet_dw │ │ │ ├── shapes_pointnet │ │ │ ├── shapes_regression_1x1 │ │ │ ├── shapes_regression_dw │ │ │ ├── shapes_regression_gemm │ │ │ ├── shapes_regression_padding │ │ │ ├── shapes_regression_small_spatial │ │ │ ├── shapes_resnet_50 │ │ │ ├── shapes_resnet_50_sparse │ │ │ ├── shapes_resnet_50_v1_5 │ │ │ ├── shapes_resnext_101 │ │ │ ├── shapes_segnet │ │ │ ├── shapes_src-transpose_padding │ │ │ ├── shapes_ssd_300_voc0712 │ │ │ ├── shapes_ssd_mobilenet │ │ │ ├── shapes_ssd_resnet34_inference │ │ │ ├── shapes_ssd_resnet34_training │ │ │ ├── shapes_tails │ │ │ ├── shapes_tails_gpu │ │ │ ├── shapes_unet │ │ │ ├── shapes_vgg_11 │ │ │ ├── shapes_vgg_19 │ │ │ ├── shapes_x3d_dw │ │ │ ├── shapes_xception │ │ │ ├── shapes_yolov2 │ │ │ ├── test_conv_3d │ │ │ ├── test_conv_3d_f32_plain │ │ │ ├── test_conv_all │ │ │ ├── test_conv_all_topologies │ │ │ ├── test_conv_all_topologies_f32_plain │ │ │ ├── test_conv_attrs │ │ │ ├── test_conv_attrs_f32_plain │ │ │ ├── test_conv_bfloat16 │ │ │ ├── test_conv_bfloat16_nxc │ │ │ ├── test_conv_bfloat16_ymm │ │ │ ├── test_conv_ci │ │ │ ├── test_conv_depthwise │ │ │ ├── test_conv_dilated │ │ │ ├── test_conv_dilated_f32_plain │ │ │ ├── test_conv_dt │ │ │ ├── test_conv_dt_plain │ │ │ ├── test_conv_float16_nxc │ │ │ ├── test_conv_fp4 │ │ │ ├── test_conv_fp8_nxc │ │ │ ├── test_conv_function │ │ │ ├── test_conv_gemm_bfloat16 │ │ │ ├── test_conv_gemm_bfloat16_nxc │ │ │ ├── 
test_conv_gemm_dt │ │ │ ├── test_conv_gemm_dt_nxc │ │ │ ├── test_conv_gemm_int8 │ │ │ ├── test_conv_gpu │ │ │ ├── test_conv_gpu_ci │ │ │ ├── test_conv_int8 │ │ │ ├── test_conv_large_gpu │ │ │ ├── test_conv_regression │ │ │ ├── test_conv_regression_gpu │ │ │ ├── test_conv_smoke │ │ │ ├── test_conv_wino_f32 │ │ │ └── test_conv_wino_gpu │ │ ├── deconv │ │ │ ├── harness_deconv_attrs_int8 │ │ │ ├── harness_deconv_attrs_int8_asymmetric │ │ │ ├── harness_deconv_regression_general_f32 │ │ │ ├── harness_deconv_regression_general_int8 │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── set_all │ │ │ ├── shapes_1d │ │ │ ├── shapes_1x1 │ │ │ ├── shapes_2d │ │ │ ├── shapes_3d │ │ │ ├── shapes_ci │ │ │ ├── shapes_dilated │ │ │ ├── test_deconv_all │ │ │ ├── test_deconv_all_f32_nxc │ │ │ ├── test_deconv_bfloat16 │ │ │ ├── test_deconv_bfloat16_nxc │ │ │ ├── test_deconv_bfloat16_ymm │ │ │ ├── test_deconv_ci │ │ │ ├── test_deconv_float16_nxc │ │ │ ├── test_deconv_fp8_nxc │ │ │ ├── test_deconv_gpu │ │ │ ├── test_deconv_int8 │ │ │ └── test_deconv_smoke │ │ ├── eltwise │ │ │ ├── harness_eltwise_large_buffer │ │ │ ├── harness_eltwise_regression │ │ │ ├── harness_eltwise_saturation │ │ │ ├── option_set_all_algs │ │ │ ├── option_set_all_algs_ci │ │ │ ├── option_set_all_algs_int8 │ │ │ ├── option_set_all_algs_int8_ci │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── shapes_ci │ │ │ ├── shapes_eltwise │ │ │ ├── shapes_large_buffer │ │ │ ├── test_eltwise_all │ │ │ ├── test_eltwise_bfloat16 │ │ │ ├── test_eltwise_ci │ │ │ ├── test_eltwise_float16 │ │ │ ├── test_eltwise_float8 │ │ │ ├── test_eltwise_gpu │ │ │ └── test_eltwise_smoke │ │ ├── gnorm │ │ │ ├── shapes_all │ │ │ ├── shapes_ci │ │ │ ├── shapes_sd │ │ │ ├── test_gnorm_all │ │ │ └── test_gnorm_ci │ │ ├── graph │ │ │ ├── complex_fusion │ │ │ │ ├── harness_mha_all │ │ │ │ ├── harness_mha_ci │ │ │ │ ├── harness_mlp_all │ │ │ │ ├── harness_mlp_ci │ │ │ │ ├── mha │ │ │ │ │ ├── 
GQA-fp16-v2.json │ │ │ │ │ ├── GQA-fp16.json │ │ │ │ │ ├── JAX-MHA-inf-fp32.json │ │ │ │ │ ├── JAX-MQA-inf-fp32.json │ │ │ │ │ ├── MHA-GPT-inf-fp32-bs1.json │ │ │ │ │ ├── MHA-GPT-inf-int8-bs1.json │ │ │ │ │ ├── MHA-bert_large-inf-fp32-bs1.json │ │ │ │ │ ├── MHA-bert_large-inf-int8-bs1.json │ │ │ │ │ ├── MHA-distill_bert-inf-fp32-bs1.json │ │ │ │ │ ├── MHA-distill_bert-inf-int8-bs1.json │ │ │ │ │ ├── MHA-stable_diffusion-inf-fp32-bs1.json │ │ │ │ │ ├── codegemma-bf16-f32.json │ │ │ │ │ ├── gemma2-bf16-f32.json │ │ │ │ │ ├── gqa-plain-bottom-right-implicit-causal-mask-f16-f32.json │ │ │ │ │ ├── gqa-plain-implicit-causal-mask-fp32-bs1.json │ │ │ │ │ ├── sdpa-compressed-k-int8-gs32.json │ │ │ │ │ ├── sdpa-compressed-kv-implicit-causal-mask-int8-gs128.json │ │ │ │ │ ├── sdpa-compressed-kv-int4-gs32.json │ │ │ │ │ ├── sdpa-compressed-v-int8-gs32.json │ │ │ │ │ ├── sdpa-plain-bottom-right-implicit-causal-mask-f16-f32.json │ │ │ │ │ ├── sdpa-plain-implicit-causal-mask-fp32-bs1.json │ │ │ │ │ ├── sdpa-plain-simplified-f16-f32.json │ │ │ │ │ ├── sdpa-plain-simplified-f16.json │ │ │ │ │ ├── sdpa-plain-wo-mask-f16.json │ │ │ │ │ ├── sdpa-plain-wo-scale-f16-bs1.json │ │ │ │ │ └── sdpa-plain-wo-scale-int8-bs1.json │ │ │ │ └── mlp │ │ │ │ │ ├── gated-mlp-f32.json │ │ │ │ │ └── gated-mlp-int4.json │ │ │ ├── op │ │ │ │ ├── bf16 │ │ │ │ │ ├── dynamicdq_s4.json │ │ │ │ │ ├── dynamicdq_u4.json │ │ │ │ │ └── typecast.json │ │ │ │ ├── f16 │ │ │ │ │ ├── dynamicdq_s4.json │ │ │ │ │ ├── dynamicdq_u4.json │ │ │ │ │ └── typecast.json │ │ │ │ ├── f32 │ │ │ │ │ ├── abs.json │ │ │ │ │ ├── abs_bwd.json │ │ │ │ │ ├── add.json │ │ │ │ │ ├── avgpool.json │ │ │ │ │ ├── avgpool_bwd.json │ │ │ │ │ ├── biasadd.json │ │ │ │ │ ├── biasadd_bwd.json │ │ │ │ │ ├── bnorm.json │ │ │ │ │ ├── bnorm_bwd.json │ │ │ │ │ ├── bnorm_fwd_d.json │ │ │ │ │ ├── clamp.json │ │ │ │ │ ├── clamp_bwd.json │ │ │ │ │ ├── concat.json │ │ │ │ │ ├── concat_2.json │ │ │ │ │ ├── concat_3.json │ │ │ │ │ ├── conv_2d.json │ │ │ │ │ ├── 
conv_3d.json │ │ │ │ │ ├── conv_bwd_d_2d.json │ │ │ │ │ ├── conv_bwd_d_3d.json │ │ │ │ │ ├── conv_bwd_w_2d.json │ │ │ │ │ ├── deconv.json │ │ │ │ │ ├── deconv_bwd_d.json │ │ │ │ │ ├── deconv_bwd_w.json │ │ │ │ │ ├── dequantize_f8_e4m3.json │ │ │ │ │ ├── dequantize_f8_e5m2.json │ │ │ │ │ ├── dequantize_s8.json │ │ │ │ │ ├── dequantize_u8.json │ │ │ │ │ ├── dynamicdq.json │ │ │ │ │ ├── dynamicdq_s4.json │ │ │ │ │ ├── dynamicdq_u4.json │ │ │ │ │ ├── dynamicq.json │ │ │ │ │ ├── dynamicq_s4.json │ │ │ │ │ ├── elu.json │ │ │ │ │ ├── elu_bwd.json │ │ │ │ │ ├── gelu.json │ │ │ │ │ ├── genindex.json │ │ │ │ │ ├── gnorm.json │ │ │ │ │ ├── greaterequal.json │ │ │ │ │ ├── hardsigmoid.json │ │ │ │ │ ├── hardsigmoid_bwd.json │ │ │ │ │ ├── interpolate.json │ │ │ │ │ ├── interpolate_3d.json │ │ │ │ │ ├── interpolate_bwd.json │ │ │ │ │ ├── interpolate_bwd_2d.json │ │ │ │ │ ├── lnorm.json │ │ │ │ │ ├── lnorm_3d.json │ │ │ │ │ ├── lnorm_3d_bwd.json │ │ │ │ │ ├── lnorm_bwd.json │ │ │ │ │ ├── lnorm_ks.json │ │ │ │ │ ├── logsoftmax.json │ │ │ │ │ ├── logsoftmax_bwd.json │ │ │ │ │ ├── matmul_2d_4d.json │ │ │ │ │ ├── maxpool.json │ │ │ │ │ ├── maxpool_bwd.json │ │ │ │ │ ├── prelu.json │ │ │ │ │ ├── prelu_bwd.json │ │ │ │ │ ├── prelu_bwd_dw_5d.json │ │ │ │ │ ├── quantize.json │ │ │ │ │ ├── quantize_f8_e4m3.json │ │ │ │ │ ├── quantize_f8_e5m2.json │ │ │ │ │ ├── reciprocal.json │ │ │ │ │ ├── reducel1.json │ │ │ │ │ ├── reducel2.json │ │ │ │ │ ├── reducemax.json │ │ │ │ │ ├── reducemean.json │ │ │ │ │ ├── reducemin.json │ │ │ │ │ ├── reduceprod.json │ │ │ │ │ ├── reducesum.json │ │ │ │ │ ├── relu.json │ │ │ │ │ ├── relu_bwd.json │ │ │ │ │ ├── reorder.json │ │ │ │ │ ├── select.json │ │ │ │ │ ├── softmax.json │ │ │ │ │ ├── softmax_bwd.json │ │ │ │ │ ├── softmax_bwd_d_3d.json │ │ │ │ │ ├── softplus_bwd.json │ │ │ │ │ ├── static_reshape.json │ │ │ │ │ ├── static_transpose.json │ │ │ │ │ └── typecast.json │ │ │ │ ├── harness_bf16_all │ │ │ │ ├── harness_bf16_ci │ │ │ │ ├── harness_f16_all │ │ │ │ 
├── harness_f16_ci │ │ │ │ ├── harness_f32_all │ │ │ │ └── harness_f32_ci │ │ │ ├── pattern │ │ │ │ ├── f32 │ │ │ │ │ ├── avgpool_3d_chain_fusion.json │ │ │ │ │ ├── binary_2d_post_ops_relu_fusion.json │ │ │ │ │ ├── binary_2d_post_ops_sum_fusion.json │ │ │ │ │ ├── binary_3d_post_ops_add_fusion.json │ │ │ │ │ ├── binary_4d_post_ops_relu_fusion.json │ │ │ │ │ ├── binary_4d_post_ops_sum_fusion.json │ │ │ │ │ ├── binary_post_ops_chain_fusion.json │ │ │ │ │ ├── binary_post_ops_fusion.json │ │ │ │ │ ├── binary_post_ops_logistic_fusion.json │ │ │ │ │ ├── bn_bwd_relu_bwd_fusion.json │ │ │ │ │ ├── bn_relu_fusion.json │ │ │ │ │ ├── conv_add_sigmoid_multiply_relu_fusion.json │ │ │ │ │ ├── conv_add_swish_relu_fusion.json │ │ │ │ │ ├── conv_bias_add_fusion.json │ │ │ │ │ ├── conv_bias_mul_mul_depthwise_bias_swish_fusion_cpu.json │ │ │ │ │ ├── conv_bias_post_ops_chain_fusion.json │ │ │ │ │ ├── conv_bias_post_ops_fusion.json │ │ │ │ │ ├── conv_bias_relu_depthwise_bias_relu_fusion_cpu.json │ │ │ │ │ ├── conv_bias_sum_fusion.json │ │ │ │ │ ├── conv_bias_sum_fusion_2.json │ │ │ │ │ ├── conv_bias_swish_fusion.json │ │ │ │ │ ├── conv_depthwise_fusion_cpu.json │ │ │ │ │ ├── conv_post_ops_fusion.json │ │ │ │ │ ├── convtranspose_post_ops_fusion.json │ │ │ │ │ ├── interpolate_post_ops_chain_fusion.json │ │ │ │ │ ├── interpolate_post_ops_chain_fusion_2.json │ │ │ │ │ ├── interpolate_post_ops_chain_fusion_3.json │ │ │ │ │ ├── interpolate_post_ops_chain_fusion_4.json │ │ │ │ │ ├── lnorm_gelu.json │ │ │ │ │ ├── matmul_bias_post_ops_chain_fusion.json │ │ │ │ │ ├── matmul_bias_post_ops_clip_fusion.json │ │ │ │ │ ├── matmul_bias_post_ops_elu_fusion.json │ │ │ │ │ ├── matmul_post_ops_add_add_fusion.json │ │ │ │ │ ├── matmul_post_ops_chain_fusion.json │ │ │ │ │ ├── matmul_post_ops_clip_fusion.json │ │ │ │ │ ├── matmul_post_ops_relu_add_fusion.json │ │ │ │ │ ├── matmul_post_ops_sum_logistic_fusion.json │ │ │ │ │ ├── matmul_post_ops_sum_relu_fusion.json │ │ │ │ │ ├── matmul_post_ops_swish_fusion.json 
│ │ │ │ │ ├── matmul_select.json │ │ │ │ │ ├── maxpool_chain_fusion.json │ │ │ │ │ ├── maxpool_sum_relu_fusion.json │ │ │ │ │ ├── reciprocal_multiply_fusion.json │ │ │ │ │ ├── reduction_post_ops_l1_chain_fusion.json │ │ │ │ │ ├── reduction_post_ops_l2_fusion.json │ │ │ │ │ ├── reduction_post_ops_max_chain_fusion.json │ │ │ │ │ ├── reduction_post_ops_mean_fusion.json │ │ │ │ │ ├── reduction_post_ops_min_chain_fusion.json │ │ │ │ │ ├── reduction_post_ops_prod_chain_fusion.json │ │ │ │ │ ├── reduction_post_ops_sum_chain_fusion.json │ │ │ │ │ ├── shuffle_fusion.json │ │ │ │ │ ├── softmax_post_ops_binary_fusion.json │ │ │ │ │ ├── softmax_post_ops_unary_fusion.json │ │ │ │ │ ├── unary_post_ops_elu_fusion.json │ │ │ │ │ ├── unary_post_ops_gelu_fusion.json │ │ │ │ │ ├── unary_post_ops_hardswish_fusion.json │ │ │ │ │ ├── unary_post_ops_log_fusion.json │ │ │ │ │ ├── unary_post_ops_round_fusion.json │ │ │ │ │ ├── unary_post_ops_sqrt_fusion.json │ │ │ │ │ ├── unary_post_ops_square_fusion.json │ │ │ │ │ └── unary_post_ops_tanh_fusion.json │ │ │ │ ├── f8 │ │ │ │ │ ├── f8_bf16_matmul_add_fusion.json │ │ │ │ │ ├── f8_bf16_matmul_sum_add_mul_relu.json │ │ │ │ │ ├── f8_conv_add_add_fusion.json │ │ │ │ │ ├── f8_conv_bias_relu_fusion.json │ │ │ │ │ ├── f8_conv_fwd.json │ │ │ │ │ ├── f8_conv_post_ops_fusion.json │ │ │ │ │ ├── f8_conv_post_ops_int8_add_fusion.json │ │ │ │ │ ├── f8_f32_matmul_mul_add_fusion.json │ │ │ │ │ ├── f8_matmul.json │ │ │ │ │ └── f8_matmul_sum_add_mul_relu.json │ │ │ │ ├── harness_bf16_all │ │ │ │ ├── harness_bf16_ci │ │ │ │ ├── harness_f16_all │ │ │ │ ├── harness_f16_ci │ │ │ │ ├── harness_f32_all │ │ │ │ ├── harness_f32_ci │ │ │ │ ├── harness_f8_all │ │ │ │ ├── harness_f8_ci │ │ │ │ ├── harness_int8_all │ │ │ │ ├── harness_int8_ci │ │ │ │ └── int8 │ │ │ │ │ ├── int8_avgpool_reshape_fusion.json │ │ │ │ │ ├── int8_avgpool_transpose_fusion.json │ │ │ │ │ ├── int8_bf16_conv_add_fusion.json │ │ │ │ │ ├── int8_bf16_conv_add_relu_mul.json │ │ │ │ │ ├── 
int8_bf16_conv_binary_add_fusion.json │ │ │ │ │ ├── int8_bf16_conv_binary_add_fusion_2.json │ │ │ │ │ ├── int8_bf16_gnorm_add_fusion.json │ │ │ │ │ ├── int8_bf16_gnorm_relu_fusion.json │ │ │ │ │ ├── int8_bf16_matmul.json │ │ │ │ │ ├── int8_bf16_matmul_add_fusion.json │ │ │ │ │ ├── int8_bf16_matmul_add_mul_relu.json │ │ │ │ │ ├── int8_bf16_matmul_mul_add_fusion.json │ │ │ │ │ ├── int8_bf16_matmul_mul_add_fusion_2.json │ │ │ │ │ ├── int8_bf16_matmul_mul_w_smooth_quant_fusion.json │ │ │ │ │ ├── int8_bf16_matmul_post_ops_fusion.json │ │ │ │ │ ├── int8_bf16_matmul_relu_w_smooth_quant_fusion.json │ │ │ │ │ ├── int8_bf16_matmul_sum_add_mul_relu.json │ │ │ │ │ ├── int8_bf16_matmul_tc_add_fusion.json │ │ │ │ │ ├── int8_bf16_matmul_tc_add_quant_fusion.json │ │ │ │ │ ├── int8_bnorm_relu_fusion.json │ │ │ │ │ ├── int8_concat_fusion.json │ │ │ │ │ ├── int8_concat_fusion_3.json │ │ │ │ │ ├── int8_conv_2d_fusion.json │ │ │ │ │ ├── int8_conv_2d_fusion_2.json │ │ │ │ │ ├── int8_conv_2d_fwd_i_fusion.json │ │ │ │ │ ├── int8_conv_add_add_fusion.json │ │ │ │ │ ├── int8_conv_add_mul_fusion.json │ │ │ │ │ ├── int8_conv_bias_fusion.json │ │ │ │ │ ├── int8_conv_bias_mish_fusion.json │ │ │ │ │ ├── int8_conv_bias_relu_fusion.json │ │ │ │ │ ├── int8_conv_bias_relu_fusion_2.json │ │ │ │ │ ├── int8_conv_bias_relu_fusion_3.json │ │ │ │ │ ├── int8_conv_post_ops_fusion.json │ │ │ │ │ ├── int8_conv_post_ops_int8_add_fusion.json │ │ │ │ │ ├── int8_conv_relu_fusion.json │ │ │ │ │ ├── int8_convtranspose_post_ops_add_fusion.json │ │ │ │ │ ├── int8_convtranspose_post_ops_chain_fusion.json │ │ │ │ │ ├── int8_convtranspose_post_ops_fusion.json │ │ │ │ │ ├── int8_convtranspose_post_ops_square_fusion.json │ │ │ │ │ ├── int8_convtranspose_post_ops_sum_fusion.json │ │ │ │ │ ├── int8_convtranspose_post_ops_sum_fusion_2.json │ │ │ │ │ ├── int8_depthwise_conv_fusion.json │ │ │ │ │ ├── int8_f32_matmul_mul_add_fusion.json │ │ │ │ │ ├── int8_f32_matmul_mul_add_fusion_2.json │ │ │ │ │ ├── 
int8_lnorm_gelu_quantize.json │ │ │ │ │ ├── int8_lnorm_multiply_quantize.json │ │ │ │ │ ├── int8_lnorm_tc_multiply_quantize.json │ │ │ │ │ ├── int8_matmul_add_mul_fusion.json │ │ │ │ │ ├── int8_matmul_add_mul_relu.json │ │ │ │ │ ├── int8_matmul_bia_relu_fusion.json │ │ │ │ │ ├── int8_matmul_bias_sum_fusion.json │ │ │ │ │ ├── int8_matmul_logistic_fusion.json │ │ │ │ │ ├── int8_matmul_mul_add_mul_fusion.json │ │ │ │ │ ├── int8_matmul_post_ops_fusion.json │ │ │ │ │ ├── int8_matmul_sum_add_mul_relu.json │ │ │ │ │ ├── int8_maxpool_add_mul_fusion.json │ │ │ │ │ ├── int8_reorder_fusion.json │ │ │ │ │ ├── int8_reorder_fusion_2.json │ │ │ │ │ ├── int8_reorder_fusion_3.json │ │ │ │ │ └── int8_softmax_add.json │ │ │ ├── test_graph_all │ │ │ ├── test_graph_bf16 │ │ │ ├── test_graph_ci │ │ │ ├── test_graph_f16 │ │ │ ├── test_graph_f32 │ │ │ ├── test_graph_f8 │ │ │ ├── test_graph_fusions │ │ │ ├── test_graph_fusions_gpu │ │ │ ├── test_graph_int8 │ │ │ ├── test_graph_op_gpu │ │ │ └── test_graph_pattern_gpu │ │ ├── ip │ │ │ ├── harness_ip_gpt-j_2016-32_inf_lb_bfloat16 │ │ │ ├── harness_ip_gpt-j_2016-32_inf_lb_f32 │ │ │ ├── harness_ip_gpt-j_2016-32_inf_lb_float16 │ │ │ ├── harness_ip_gpt-j_2016-32_inf_sb_bfloat16 │ │ │ ├── harness_ip_gpt-j_2016-32_inf_sb_f32 │ │ │ ├── harness_ip_gpt-j_2016-32_inf_sb_float16 │ │ │ ├── harness_ip_gpt-j_32-32_inf_lb_bfloat16 │ │ │ ├── harness_ip_gpt-j_32-32_inf_lb_f32 │ │ │ ├── harness_ip_gpt-j_32-32_inf_lb_float16 │ │ │ ├── harness_ip_gpt-j_32-32_inf_sb_bfloat16 │ │ │ ├── harness_ip_gpt-j_32-32_inf_sb_f32 │ │ │ ├── harness_ip_gpt-j_32-32_inf_sb_float16 │ │ │ ├── harness_ip_regression │ │ │ ├── harness_ip_sanitizers │ │ │ ├── harness_ip_saturation │ │ │ ├── harness_ip_smoke_ref │ │ │ ├── harness_ip_tag │ │ │ ├── harness_ip_tag_gpu │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── option_set_fwks_key_perf_gpu │ │ │ ├── option_set_fwks_llm_gpu │ │ │ ├── perf_ip_cpu │ │ │ ├── perf_ip_inference_lb │ │ │ ├── perf_ip_inference_sb 
│ │ │ ├── perf_ip_knx │ │ │ ├── perf_ip_training │ │ │ ├── perf_ip_xe │ │ │ ├── perf_ip_xe_hp │ │ │ ├── perf_ip_xe_lp │ │ │ ├── set_all │ │ │ ├── set_gpu │ │ │ ├── set_topologies │ │ │ ├── shapes_0d │ │ │ ├── shapes_0d_gpu │ │ │ ├── shapes_1d │ │ │ ├── shapes_3d │ │ │ ├── shapes_alexnet │ │ │ ├── shapes_bert │ │ │ ├── shapes_bert_large │ │ │ ├── shapes_ci │ │ │ ├── shapes_dien_sb │ │ │ ├── shapes_dlrm │ │ │ ├── shapes_gnmt │ │ │ ├── shapes_googlenet_v1 │ │ │ ├── shapes_googlenet_v3 │ │ │ ├── shapes_maskrcnn │ │ │ ├── shapes_ncf │ │ │ ├── shapes_regression │ │ │ ├── shapes_resnet_50 │ │ │ ├── shapes_resnet_50_sparse │ │ │ ├── shapes_rnn_t │ │ │ ├── shapes_transformer_lt │ │ │ ├── shapes_vgg16 │ │ │ ├── shapes_wd │ │ │ ├── test_ip_acl │ │ │ ├── test_ip_all │ │ │ ├── test_ip_bf32_bfloat16 │ │ │ ├── test_ip_bfloat16 │ │ │ ├── test_ip_bfloat16_ymm │ │ │ ├── test_ip_ci │ │ │ ├── test_ip_float16 │ │ │ ├── test_ip_fp8 │ │ │ ├── test_ip_gpu │ │ │ ├── test_ip_int8 │ │ │ ├── test_ip_large_gpu │ │ │ └── test_ip_smoke │ │ ├── lnorm │ │ │ ├── option_set_all │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── shapes_ci │ │ │ ├── test_lnorm_all │ │ │ ├── test_lnorm_bfloat16 │ │ │ ├── test_lnorm_ci │ │ │ ├── test_lnorm_float16 │ │ │ ├── test_lnorm_gpu │ │ │ ├── test_lnorm_int8 │ │ │ └── test_lnorm_smoke │ │ ├── lrn │ │ │ ├── set_all │ │ │ ├── shapes_0d │ │ │ ├── shapes_2d │ │ │ ├── shapes_3d │ │ │ ├── shapes_ci │ │ │ ├── shapes_topologies │ │ │ ├── test_lrn_all │ │ │ ├── test_lrn_bfloat16 │ │ │ ├── test_lrn_ci │ │ │ ├── test_lrn_float16 │ │ │ ├── test_lrn_gpu │ │ │ └── test_lrn_smoke │ │ ├── matmul │ │ │ ├── harness_matmul_3d_bcast │ │ │ ├── harness_matmul_bert_inf_lb_bfloat16 │ │ │ ├── harness_matmul_bert_inf_lb_int8 │ │ │ ├── harness_matmul_bert_inf_sb_bfloat16 │ │ │ ├── harness_matmul_bert_inf_sb_int8 │ │ │ ├── harness_matmul_bert_tr_bfloat16 │ │ │ ├── harness_matmul_bert_tr_float16 │ │ │ ├── harness_matmul_data_tags │ │ │ ├── 
harness_matmul_decompression │ │ │ ├── harness_matmul_dropout │ │ │ ├── harness_matmul_generated_ci │ │ │ ├── harness_matmul_gpt-j_2016-32_inf_lb_bfloat16 │ │ │ ├── harness_matmul_gpt-j_2016-32_inf_lb_f32 │ │ │ ├── harness_matmul_gpt-j_2016-32_inf_lb_float16 │ │ │ ├── harness_matmul_gpt-j_2016-32_inf_sb_bfloat16 │ │ │ ├── harness_matmul_gpt-j_2016-32_inf_sb_f32 │ │ │ ├── harness_matmul_gpt-j_2016-32_inf_sb_float16 │ │ │ ├── harness_matmul_gpt-j_32-32_inf_lb_bfloat16 │ │ │ ├── harness_matmul_gpt-j_32-32_inf_lb_f32 │ │ │ ├── harness_matmul_gpt-j_32-32_inf_lb_float16 │ │ │ ├── harness_matmul_gpt-j_32-32_inf_sb_bfloat16 │ │ │ ├── harness_matmul_gpt-j_32-32_inf_sb_f32 │ │ │ ├── harness_matmul_gpt-j_32-32_inf_sb_float16 │ │ │ ├── harness_matmul_regression_bf16 │ │ │ ├── harness_matmul_regression_f32 │ │ │ ├── harness_matmul_regression_float16 │ │ │ ├── harness_matmul_regression_int8 │ │ │ ├── harness_matmul_runtime_f32 │ │ │ ├── harness_matmul_runtime_int8 │ │ │ ├── harness_matmul_smoke_ref │ │ │ ├── harness_matmul_strides │ │ │ ├── harness_matmul_transformer_lt_inf_lb_bfloat16 │ │ │ ├── harness_matmul_transformer_lt_inf_lb_int8 │ │ │ ├── harness_matmul_transformer_lt_inf_sb_bfloat16 │ │ │ ├── harness_matmul_transformer_lt_inf_sb_int8 │ │ │ ├── harness_matmul_transformer_lt_tr_bfloat16 │ │ │ ├── option_set_fp8_mixed │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── option_set_fwks_key_gpu_tf32 │ │ │ ├── option_set_fwks_key_perf_gpu │ │ │ ├── option_set_fwks_llm_gpu │ │ │ ├── perf_matmul_inference_batched │ │ │ ├── perf_matmul_inference_lb │ │ │ ├── perf_matmul_training │ │ │ ├── shapes_2d │ │ │ ├── shapes_2d_ci │ │ │ ├── shapes_3d │ │ │ ├── shapes_4bit │ │ │ ├── shapes_4d │ │ │ ├── shapes_bert │ │ │ ├── shapes_bert_large │ │ │ ├── shapes_converted_ip_inf_lb_alexnet │ │ │ ├── shapes_converted_ip_inf_lb_dlrm │ │ │ ├── shapes_converted_ip_inf_lb_gmnt │ │ │ ├── shapes_converted_ip_inf_lb_googlenet │ │ │ ├── shapes_converted_ip_inf_lb_maskrcnn │ │ 
│ ├── shapes_converted_ip_inf_lb_ncf │ │ │ ├── shapes_converted_ip_inf_lb_resnet │ │ │ ├── shapes_converted_ip_inf_lb_rnn_t │ │ │ ├── shapes_converted_ip_inf_lb_vgg16 │ │ │ ├── shapes_converted_ip_inf_lb_wd │ │ │ ├── shapes_converted_ip_inf_sb_dien │ │ │ ├── shapes_converted_ip_tr_alexnet_bwd_d │ │ │ ├── shapes_converted_ip_tr_alexnet_bwd_w │ │ │ ├── shapes_converted_ip_tr_alexnet_fwd │ │ │ ├── shapes_converted_ip_tr_dlrm_bwd_d │ │ │ ├── shapes_converted_ip_tr_dlrm_bwd_w │ │ │ ├── shapes_converted_ip_tr_dlrm_fwd │ │ │ ├── shapes_converted_ip_tr_gmnt_bwd_d │ │ │ ├── shapes_converted_ip_tr_gmnt_bwd_w │ │ │ ├── shapes_converted_ip_tr_gmnt_fwd │ │ │ ├── shapes_converted_ip_tr_googlenet_bwd_d │ │ │ ├── shapes_converted_ip_tr_googlenet_bwd_w │ │ │ ├── shapes_converted_ip_tr_googlenet_fwd │ │ │ ├── shapes_converted_ip_tr_maskrcnn_bwd_d │ │ │ ├── shapes_converted_ip_tr_maskrcnn_bwd_w │ │ │ ├── shapes_converted_ip_tr_maskrcnn_fwd │ │ │ ├── shapes_converted_ip_tr_ncf_bwd_d │ │ │ ├── shapes_converted_ip_tr_ncf_bwd_w │ │ │ ├── shapes_converted_ip_tr_ncf_fwd │ │ │ ├── shapes_converted_ip_tr_resnet_bwd_d │ │ │ ├── shapes_converted_ip_tr_resnet_bwd_w │ │ │ ├── shapes_converted_ip_tr_resnet_fwd │ │ │ ├── shapes_converted_ip_tr_rnn_t_bwd_d │ │ │ ├── shapes_converted_ip_tr_rnn_t_bwd_w │ │ │ ├── shapes_converted_ip_tr_rnn_t_fwd │ │ │ ├── shapes_converted_ip_tr_vgg16_bwd_d │ │ │ ├── shapes_converted_ip_tr_vgg16_bwd_w │ │ │ ├── shapes_converted_ip_tr_vgg16_fwd │ │ │ ├── shapes_converted_ip_tr_wd_bwd_d │ │ │ ├── shapes_converted_ip_tr_wd_bwd_w │ │ │ ├── shapes_converted_ip_tr_wd_fwd │ │ │ ├── shapes_mem_strided │ │ │ ├── shapes_multidim │ │ │ ├── shapes_sparse │ │ │ ├── shapes_sparse_packed │ │ │ ├── shapes_transformer │ │ │ ├── test_matmul_all │ │ │ ├── test_matmul_bf32_bf16 │ │ │ ├── test_matmul_bfloat16 │ │ │ ├── test_matmul_bfloat16_ymm │ │ │ ├── test_matmul_ci │ │ │ ├── test_matmul_float16 │ │ │ ├── test_matmul_fp4 │ │ │ ├── test_matmul_fp8 │ │ │ ├── test_matmul_gpu │ │ │ ├── 
test_matmul_int8 │ │ │ ├── test_matmul_large_gpu │ │ │ ├── test_matmul_llm_gpu │ │ │ ├── test_matmul_multidims │ │ │ ├── test_matmul_smoke │ │ │ ├── test_matmul_sparse │ │ │ ├── test_matmul_sparse_ci │ │ │ └── test_matmul_sparse_gpu │ │ ├── pool │ │ │ ├── harness_pool_regression │ │ │ ├── harness_pool_smoke_ref │ │ │ ├── harness_pooling_different_dt │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── perf_pool_gpu │ │ │ ├── set_all │ │ │ ├── set_all_small │ │ │ ├── set_topologies │ │ │ ├── set_topologies_gpu │ │ │ ├── shapes_1d │ │ │ ├── shapes_2d │ │ │ ├── shapes_2d_small │ │ │ ├── shapes_3d │ │ │ ├── shapes_3d_small │ │ │ ├── shapes_3d_unet │ │ │ ├── shapes_alexnet │ │ │ ├── shapes_basic │ │ │ ├── shapes_global_pooling │ │ │ ├── shapes_googlenet_v1 │ │ │ ├── shapes_googlenet_v3 │ │ │ ├── shapes_i3d_resnet50_v1 │ │ │ ├── shapes_large_pool │ │ │ ├── shapes_resnet_50 │ │ │ ├── test_pool_all │ │ │ ├── test_pool_bfloat16 │ │ │ ├── test_pool_ci │ │ │ ├── test_pool_float16 │ │ │ ├── test_pool_fp8 │ │ │ ├── test_pool_gpu │ │ │ ├── test_pool_large_gpu │ │ │ └── test_pool_smoke │ │ ├── prelu │ │ │ ├── option_set_all │ │ │ ├── shapes_all │ │ │ ├── shapes_ci │ │ │ ├── test_prelu_all │ │ │ ├── test_prelu_bfloat16 │ │ │ ├── test_prelu_ci │ │ │ ├── test_prelu_float16 │ │ │ ├── test_prelu_gpu │ │ │ └── test_prelu_smoke │ │ ├── reduction │ │ │ ├── harness_reduction_bf16 │ │ │ ├── harness_reduction_f16 │ │ │ ├── harness_reduction_f32 │ │ │ ├── harness_reduction_i8 │ │ │ ├── option_set_all │ │ │ ├── option_set_all_algs │ │ │ ├── option_set_all_algs_ci │ │ │ ├── option_set_all_algs_int8 │ │ │ ├── option_set_all_algs_int8_ci │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── perf_reduction_gpu │ │ │ ├── shapes_ci │ │ │ ├── shapes_gpu_all │ │ │ ├── shapes_nested_gpu │ │ │ ├── test_reduction_all │ │ │ ├── test_reduction_bfloat16 │ │ │ ├── test_reduction_ci │ │ │ ├── test_reduction_float16 │ │ │ ├── test_reduction_gpu │ │ │ └── 
test_reduction_smoke │ │ ├── reorder │ │ │ ├── harness_conv_reorders_gpu │ │ │ ├── harness_reorder_amx │ │ │ ├── harness_reorder_compensation │ │ │ ├── harness_reorder_cross_engine_gpu │ │ │ ├── harness_reorder_decompression │ │ │ ├── harness_reorder_large │ │ │ ├── harness_reorder_regression │ │ │ ├── harness_reorder_runtime │ │ │ ├── harness_reorder_saturation │ │ │ ├── harness_reorder_scales │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── test_reorder_all │ │ │ ├── test_reorder_bfloat16 │ │ │ ├── test_reorder_ci │ │ │ ├── test_reorder_float16 │ │ │ ├── test_reorder_float8 │ │ │ ├── test_reorder_fp4 │ │ │ ├── test_reorder_gpu │ │ │ ├── test_reorder_int4 │ │ │ └── test_reorder_smoke │ │ ├── resampling │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── set_all │ │ │ ├── shapes_1d │ │ │ ├── shapes_2d │ │ │ ├── shapes_3d │ │ │ ├── shapes_ci │ │ │ ├── shapes_maskrcnn │ │ │ ├── test_resampling_all │ │ │ ├── test_resampling_bfloat16 │ │ │ ├── test_resampling_ci │ │ │ ├── test_resampling_float16 │ │ │ ├── test_resampling_gpu │ │ │ └── test_resampling_smoke │ │ ├── rnn │ │ │ ├── harness_augru_bf32 │ │ │ ├── harness_augru_bfloat16 │ │ │ ├── harness_augru_float16 │ │ │ ├── harness_gru_bf32 │ │ │ ├── harness_gru_bfloat16 │ │ │ ├── harness_gru_f32 │ │ │ ├── harness_gru_float16 │ │ │ ├── harness_gru_int8 │ │ │ ├── harness_gru_regression │ │ │ ├── harness_lstm_bf32 │ │ │ ├── harness_lstm_bfloat16 │ │ │ ├── harness_lstm_f32 │ │ │ ├── harness_lstm_float16 │ │ │ ├── harness_lstm_int8 │ │ │ ├── harness_rnn_bf32 │ │ │ ├── harness_rnn_bfloat16 │ │ │ ├── harness_rnn_f32 │ │ │ ├── harness_rnn_float16 │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── option_set_gnmt_decoder │ │ │ ├── option_set_gnmt_encoder │ │ │ ├── option_set_large │ │ │ ├── option_set_lstmp_large │ │ │ ├── option_set_lstmp_small │ │ │ ├── option_set_perf_inference_lb │ │ │ ├── option_set_perf_inference_sb │ │ │ ├── option_set_perf_training │ │ │ ├── option_set_rnnt │ │ │ ├── option_set_small │ │ │ 
├── perf_rnn_cpu │ │ │ ├── perf_rnn_inference_lb │ │ │ ├── perf_rnn_inference_sb │ │ │ ├── perf_rnn_knx │ │ │ ├── perf_rnn_training │ │ │ ├── perf_rnn_xe │ │ │ ├── perf_rnn_xe_hp │ │ │ ├── perf_rnn_xe_lp │ │ │ ├── shapes_deepspeech_2 │ │ │ ├── shapes_inference │ │ │ ├── shapes_large │ │ │ ├── shapes_large_gru │ │ │ ├── shapes_lstmp_large │ │ │ ├── shapes_lstmp_small │ │ │ ├── shapes_rnn_t │ │ │ ├── shapes_small │ │ │ ├── shapes_small_gru │ │ │ ├── shapes_training │ │ │ ├── test_augru_all │ │ │ ├── test_augru_bf32_bfloat16 │ │ │ ├── test_augru_bfloat16 │ │ │ ├── test_augru_ci │ │ │ ├── test_augru_float16 │ │ │ ├── test_gru_all │ │ │ ├── test_gru_bf32_bfloat16 │ │ │ ├── test_gru_bfloat16 │ │ │ ├── test_gru_ci │ │ │ ├── test_gru_float16 │ │ │ ├── test_gru_int8 │ │ │ ├── test_lstm_all │ │ │ ├── test_lstm_bf32_bfloat16 │ │ │ ├── test_lstm_bfloat16 │ │ │ ├── test_lstm_bfloat16_ymm │ │ │ ├── test_lstm_ci │ │ │ ├── test_lstm_f32 │ │ │ ├── test_lstm_float16 │ │ │ ├── test_lstm_int8 │ │ │ ├── test_rnn_all │ │ │ ├── test_rnn_bf32_bfloat16 │ │ │ ├── test_rnn_bfloat16 │ │ │ ├── test_rnn_ci │ │ │ ├── test_rnn_float16 │ │ │ └── test_rnn_gpu │ │ ├── self │ │ │ ├── test_self_ci │ │ │ ├── test_self_f32 │ │ │ └── test_self_smoke │ │ ├── shuffle │ │ │ ├── option_set_all │ │ │ ├── option_set_min │ │ │ ├── option_set_perf │ │ │ ├── perf_shuffle_cpu │ │ │ ├── test_shuffle_all │ │ │ ├── test_shuffle_bfloat16 │ │ │ ├── test_shuffle_ci │ │ │ ├── test_shuffle_float16 │ │ │ ├── test_shuffle_gpu │ │ │ └── test_shuffle_smoke │ │ ├── softmax │ │ │ ├── harness_softmax_regression │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── set_0d │ │ │ ├── shapes_0d │ │ │ ├── shapes_2d │ │ │ ├── shapes_3d │ │ │ ├── shapes_ci │ │ │ ├── shapes_large │ │ │ ├── shapes_large_axis │ │ │ ├── shapes_nlp │ │ │ ├── test_softmax_acl │ │ │ ├── test_softmax_all │ │ │ ├── test_softmax_bfloat16 │ │ │ ├── test_softmax_ci │ │ │ ├── test_softmax_float16 │ │ │ ├── test_softmax_gpu │ │ │ └── 
test_softmax_smoke │ │ ├── sum │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── test_sum_all │ │ │ ├── test_sum_bfloat16 │ │ │ ├── test_sum_ci │ │ │ ├── test_sum_float16 │ │ │ ├── test_sum_gpu │ │ │ └── test_sum_smoke │ │ └── zeropad │ │ │ ├── option_set_fwks_ext_gpu │ │ │ ├── option_set_fwks_key_gpu │ │ │ ├── set_dim1_block_3d │ │ │ ├── set_dim1dim2_block_2d │ │ │ ├── set_dim1dim2_block_3d │ │ │ ├── set_dim2_block_3d │ │ │ ├── set_dim2dim3_block_4d │ │ │ ├── shapes_dim1_block_3d │ │ │ ├── shapes_dim1dim2_block_2d │ │ │ ├── shapes_dim1dim2_block_3d │ │ │ ├── shapes_dim2_block_3d │ │ │ ├── shapes_dim2dim3_block_4d │ │ │ ├── test_zeropad_ci │ │ │ └── test_zeropad_gpu │ ├── ip │ │ ├── bench_ip.cpp │ │ ├── cfg.cpp │ │ ├── ip.cpp │ │ ├── ip.hpp │ │ ├── ip_aux.cpp │ │ └── ref_ip.cpp │ ├── lnorm │ │ ├── bench_lnorm.cpp │ │ ├── lnorm.cpp │ │ ├── lnorm.hpp │ │ ├── lnorm_aux.cpp │ │ └── ref_lnorm.cpp │ ├── lrn │ │ ├── bench_lrn.cpp │ │ ├── lrn.cpp │ │ ├── lrn.hpp │ │ ├── lrn_aux.cpp │ │ └── ref_lrn.cpp │ ├── matmul │ │ ├── bench_matmul.cpp │ │ ├── cfg.cpp │ │ ├── matmul.cpp │ │ ├── matmul.hpp │ │ ├── matmul_aux.cpp │ │ └── ref_matmul.cpp │ ├── pool │ │ ├── bench_pool.cpp │ │ ├── cfg.cpp │ │ ├── pool.cpp │ │ ├── pool.hpp │ │ ├── pool_aux.cpp │ │ └── ref_pool.cpp │ ├── prelu │ │ ├── bench_prelu.cpp │ │ ├── prelu.cpp │ │ ├── prelu.hpp │ │ ├── prelu_aux.cpp │ │ └── ref_prelu.cpp │ ├── reduction │ │ ├── bench_reduction.cpp │ │ ├── reduction.cpp │ │ ├── reduction.hpp │ │ ├── reduction_aux.cpp │ │ └── ref_reduction.cpp │ ├── reorder │ │ ├── bench_reorder.cpp │ │ ├── cfg.cpp │ │ ├── ref_reorder.cpp │ │ ├── reorder.cpp │ │ ├── reorder.hpp │ │ └── reorder_aux.cpp │ ├── resampling │ │ ├── bench_resampling.cpp │ │ ├── ref_resampling.cpp │ │ ├── resampling.cpp │ │ ├── resampling.hpp │ │ └── resampling_aux.cpp │ ├── rnn │ │ ├── bench_rnn.cpp │ │ ├── cells.hpp │ │ ├── cfg.cpp │ │ ├── gru_cell.cpp │ │ ├── lbr_gru_cell.cpp │ │ ├── lstm_cell.cpp │ │ ├── 
ref_rnn_bwd.cpp │ │ ├── ref_rnn_fwd.cpp │ │ ├── rnn.cpp │ │ ├── rnn.hpp │ │ ├── rnn_aux.cpp │ │ ├── rnn_aux.hpp │ │ ├── rnn_cell.cpp │ │ ├── rnn_task.hpp │ │ ├── rnn_task_executor.hpp │ │ └── rnn_utils.cpp │ ├── self │ │ ├── bnorm.cpp │ │ ├── common.cpp │ │ ├── compare.cpp │ │ ├── conv.cpp │ │ ├── graph_example.cpp │ │ ├── memory.cpp │ │ ├── norm.cpp │ │ ├── res.cpp │ │ ├── self.cpp │ │ └── self.hpp │ ├── shuffle │ │ ├── bench_shuffle.cpp │ │ ├── ref_shuffle.cpp │ │ ├── shuffle.cpp │ │ ├── shuffle.hpp │ │ └── shuffle_aux.cpp │ ├── softmax │ │ ├── bench_softmax.cpp │ │ ├── ref_softmax.cpp │ │ ├── softmax.cpp │ │ ├── softmax.hpp │ │ └── softmax_aux.cpp │ ├── sum │ │ ├── bench_sum.cpp │ │ ├── ref_sum.cpp │ │ ├── sum.cpp │ │ ├── sum.hpp │ │ └── sum_aux.cpp │ ├── utils │ │ ├── bench_mode.cpp │ │ ├── bench_mode.hpp │ │ ├── cfg.hpp │ │ ├── cold_cache.cpp │ │ ├── cold_cache.hpp │ │ ├── compare.cpp │ │ ├── compare.hpp │ │ ├── data_kind.cpp │ │ ├── data_kind.hpp │ │ ├── dims.cpp │ │ ├── dims.hpp │ │ ├── dnnl_query.cpp │ │ ├── dnnl_query.hpp │ │ ├── fill.cpp │ │ ├── fill.hpp │ │ ├── impl_filter.cpp │ │ ├── impl_filter.hpp │ │ ├── norm.hpp │ │ ├── numeric.cpp │ │ ├── numeric.hpp │ │ ├── parallel.cpp │ │ ├── parallel.hpp │ │ ├── parser.cpp │ │ ├── parser.hpp │ │ ├── perf_report.cpp │ │ ├── perf_report.hpp │ │ ├── res.hpp │ │ ├── settings.hpp │ │ ├── stream_kind.cpp │ │ ├── stream_kind.hpp │ │ ├── task.hpp │ │ ├── task_executor.hpp │ │ ├── timer.cpp │ │ ├── timer.hpp │ │ └── wrapper.hpp │ └── zeropad │ │ ├── bench_zeropad.cpp │ │ ├── zeropad.cpp │ │ ├── zeropad.hpp │ │ └── zeropad_aux.cpp ├── generate_c_symbols_refs.sh ├── gtests │ ├── CMakeLists.txt │ ├── api │ │ ├── CMakeLists.txt │ │ ├── test_engine.cpp │ │ ├── test_memory.cpp │ │ ├── test_memory_creation.cpp │ │ ├── test_memory_desc.cpp │ │ ├── test_memory_desc_ops.cpp │ │ ├── test_memory_map.cpp │ │ ├── test_namespace.cpp │ │ ├── test_stream.cpp │ │ └── test_submemory.cpp │ ├── convolution_common.h │ ├── 
dnnl_test_common.hpp │ ├── dnnl_test_common_ocl.hpp │ ├── dnnl_test_macros.hpp │ ├── graph │ │ ├── CMakeLists.txt │ │ ├── api │ │ │ ├── CMakeLists.txt │ │ │ ├── api_test_main.cpp │ │ │ ├── ocl │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── test_cpp_api_compiled_partition.cpp │ │ │ │ ├── test_cpp_api_engine.cpp │ │ │ │ └── test_cpp_api_tensor.cpp │ │ │ ├── sycl │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── test_cpp_api_compiled_partition.cpp │ │ │ │ ├── test_cpp_api_engine.cpp │ │ │ │ └── test_cpp_api_tensor.cpp │ │ │ ├── test_api_common.cpp │ │ │ ├── test_api_common.h │ │ │ ├── test_api_common.hpp │ │ │ ├── test_c_api_add_op.cpp │ │ │ ├── test_c_api_compile.cpp │ │ │ ├── test_c_api_compile_parametrized.cpp │ │ │ ├── test_c_api_constant_cache.cpp │ │ │ ├── test_c_api_filter.cpp │ │ │ ├── test_c_api_graph.cpp │ │ │ ├── test_c_api_logical_tensor.cpp │ │ │ ├── test_c_api_op.cpp │ │ │ ├── test_cpp_api_compile.cpp │ │ │ ├── test_cpp_api_constant_cache.cpp │ │ │ ├── test_cpp_api_engine.cpp │ │ │ ├── test_cpp_api_graph.cpp │ │ │ ├── test_cpp_api_logical_tensor.cpp │ │ │ ├── test_cpp_api_op.cpp │ │ │ ├── test_cpp_api_partition.cpp │ │ │ └── test_cpp_api_tensor.cpp │ │ ├── test_allocator.cpp │ │ ├── test_allocator.hpp │ │ └── unit │ │ │ ├── CMakeLists.txt │ │ │ ├── backend │ │ │ ├── CMakeLists.txt │ │ │ ├── dnnl │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── dnnl_test_common.hpp │ │ │ │ ├── ref_func.hpp │ │ │ │ ├── test_batch_norm.cpp │ │ │ │ ├── test_binary_op.cpp │ │ │ │ ├── test_bmm.cpp │ │ │ │ ├── test_common.cpp │ │ │ │ ├── test_compiled_partition.cpp │ │ │ │ ├── test_concat.cpp │ │ │ │ ├── test_constant_cache.cpp │ │ │ │ ├── test_convolution.cpp │ │ │ │ ├── test_convtranspose.cpp │ │ │ │ ├── test_dequantize.cpp │ │ │ │ ├── test_dnnl_infer_shape_cpu.cpp │ │ │ │ ├── test_dnnl_utils_cpu.cpp │ │ │ │ ├── test_eltwise.cpp │ │ │ │ ├── test_fusion_info_cpu.cpp │ │ │ │ ├── test_graph_cpu.cpp │ │ │ │ ├── test_group_norm.cpp │ │ │ │ ├── test_insert_ops_cpu.cpp │ │ │ │ ├── test_internal_attrs_cpu.cpp │ 
│ │ │ ├── test_interpolate.cpp │ │ │ │ ├── test_large_partition.cpp │ │ │ │ ├── test_layer_norm.cpp │ │ │ │ ├── test_layout_id_cpu.cpp │ │ │ │ ├── test_layout_propagator_cpu.cpp │ │ │ │ ├── test_logical_tensor_cpu.cpp │ │ │ │ ├── test_matmul.cpp │ │ │ │ ├── test_memory_planning_cpu.cpp │ │ │ │ ├── test_mqa_decomp.cpp │ │ │ │ ├── test_op_executable.cpp │ │ │ │ ├── test_op_schema_cpu.cpp │ │ │ │ ├── test_partition_cpu.cpp │ │ │ │ ├── test_pass.cpp │ │ │ │ ├── test_pool.cpp │ │ │ │ ├── test_prelu.cpp │ │ │ │ ├── test_quantize.cpp │ │ │ │ ├── test_reduce.cpp │ │ │ │ ├── test_reorder.cpp │ │ │ │ ├── test_scratchpad.cpp │ │ │ │ ├── test_sdp_decomp.cpp │ │ │ │ ├── test_select.cpp │ │ │ │ ├── test_softmax.cpp │ │ │ │ ├── test_subgraph_pass.cpp │ │ │ │ ├── test_thread_local_cache_cpu.cpp │ │ │ │ └── test_typecast.cpp │ │ │ └── fake │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── test_compiled_partition.cpp │ │ │ │ ├── test_fake_backend.cpp │ │ │ │ ├── test_graph.cpp │ │ │ │ ├── test_partition.cpp │ │ │ │ └── test_pass.cpp │ │ │ ├── interface │ │ │ ├── CMakeLists.txt │ │ │ ├── sycl │ │ │ │ ├── CMakeLists.txt │ │ │ │ └── test_allocator.cpp │ │ │ ├── test_allocator.cpp │ │ │ ├── test_backend_cpu.cpp │ │ │ ├── test_compiled_partition.cpp │ │ │ ├── test_graph_cpu.cpp │ │ │ ├── test_logical_tensor_cpu.cpp │ │ │ ├── test_op_cpu.cpp │ │ │ ├── test_op_def_constraint_cpu.cpp │ │ │ ├── test_op_schema_cpu.cpp │ │ │ ├── test_partition_hashing.cpp │ │ │ ├── test_shape_infer_cpu.cpp │ │ │ ├── test_tensor.cpp │ │ │ └── test_value_cpu.cpp │ │ │ ├── unit_test_common.cpp │ │ │ ├── unit_test_common.hpp │ │ │ ├── unit_test_main.cpp │ │ │ ├── utils.hpp │ │ │ └── utils │ │ │ ├── CMakeLists.txt │ │ │ ├── test_allocator.cpp │ │ │ ├── test_attribute_value_cpu.cpp │ │ │ ├── test_debug_cpu.cpp │ │ │ ├── test_json_cpu.cpp │ │ │ ├── test_pattern_matcher_cpu.cpp │ │ │ └── test_utils_cpu.cpp │ ├── in │ │ ├── convolution_attr.h │ │ ├── convolution_simple.h │ │ ├── gemm_in.h │ │ └── layer_normalization.h │ ├── 
internals │ │ ├── CMakeLists.txt │ │ ├── sdpa_internal.hpp │ │ ├── test_bcast_strategy.cpp │ │ ├── test_bfloat16.cpp │ │ ├── test_brgemm.cpp │ │ ├── test_comparison_operators.cpp │ │ ├── test_dnnl_threading.cpp │ │ ├── test_env_vars_dnnl.cpp │ │ ├── test_env_vars_onednn.cpp │ │ ├── test_float8.cpp │ │ ├── test_nibble.cpp │ │ ├── test_sdpa.cpp │ │ ├── test_utils.cpp │ │ └── test_utils.hpp │ ├── main.cpp │ ├── ocl │ │ ├── CMakeLists.txt │ │ └── api │ │ │ ├── CMakeLists.txt │ │ │ ├── test_engine.cpp │ │ │ ├── test_memory_buffer.cpp │ │ │ ├── test_memory_usm.cpp │ │ │ └── test_stream.cpp │ ├── regression │ │ ├── CMakeLists.txt │ │ └── test_binary_stride.cpp │ ├── sycl │ │ ├── CMakeLists.txt │ │ └── api │ │ │ ├── CMakeLists.txt │ │ │ ├── test_engine.cpp │ │ │ ├── test_memory_buffer.cpp │ │ │ ├── test_memory_usm.cpp │ │ │ └── test_stream.cpp │ ├── test_batch_normalization.cpp │ ├── test_binary.cpp │ ├── test_concat.cpp │ ├── test_concurrency.cpp │ ├── test_convolution_backward_data_common.hpp │ ├── test_convolution_backward_data_f32.cpp │ ├── test_convolution_backward_weights_common.hpp │ ├── test_convolution_backward_weights_f32.cpp │ ├── test_convolution_eltwise_forward_common.hpp │ ├── test_convolution_eltwise_forward_f32.cpp │ ├── test_convolution_eltwise_forward_x8s8f32s32.cpp │ ├── test_convolution_format_any.cpp │ ├── test_convolution_forward_common.hpp │ ├── test_convolution_forward_f32.cpp │ ├── test_convolution_forward_u8s8fp.cpp │ ├── test_convolution_forward_u8s8s32.cpp │ ├── test_cross_engine_reorder.cpp │ ├── test_deconvolution.cpp │ ├── test_eltwise.cpp │ ├── test_gemm_bf16bf16bf16.cpp │ ├── test_gemm_bf16bf16f32.cpp │ ├── test_gemm_common.hpp │ ├── test_gemm_data_preparation.hpp │ ├── test_gemm_f16.cpp │ ├── test_gemm_f16f16f32.cpp │ ├── test_gemm_f32.cpp │ ├── test_gemm_params.hpp │ ├── test_gemm_s8s8s32.cpp │ ├── test_gemm_s8u8s32.cpp │ ├── test_gemm_u8s8s32.cpp │ ├── test_gemm_u8u8s32.cpp │ ├── test_gemm_validation.hpp │ ├── test_global_scratchpad.cpp 
│ ├── test_group_normalization.cpp │ ├── test_iface_attr.cpp │ ├── test_iface_attr_quantization.cpp │ ├── test_iface_binary_bcast.cpp │ ├── test_iface_gpu_only.cpp │ ├── test_iface_handle.cpp │ ├── test_iface_pd.cpp │ ├── test_iface_pd_iter.cpp │ ├── test_iface_primitive_cache.cpp │ ├── test_iface_runtime_dims.cpp │ ├── test_iface_sparse.cpp │ ├── test_iface_threadpool.cpp │ ├── test_iface_weights_format.cpp │ ├── test_iface_wino_convolution.cpp │ ├── test_inner_product_backward_data.cpp │ ├── test_inner_product_backward_weights.cpp │ ├── test_inner_product_forward.cpp │ ├── test_ip_formats.cpp │ ├── test_isa_hints.cpp │ ├── test_isa_iface.cpp │ ├── test_isa_mask.cpp │ ├── test_layer_normalization.cpp │ ├── test_lrn.cpp │ ├── test_malloc.cpp │ ├── test_malloc.hpp │ ├── test_matmul.cpp │ ├── test_persistent_cache_api.cpp │ ├── test_pooling_backward.cpp │ ├── test_pooling_forward.cpp │ ├── test_prelu.cpp │ ├── test_primitive_cache_mt.cpp │ ├── test_reduction.cpp │ ├── test_reorder.cpp │ ├── test_reorder_common.hpp │ ├── test_reorder_formats.cpp │ ├── test_resampling.cpp │ ├── test_rnn_forward.cpp │ ├── test_shuffle.cpp │ ├── test_softmax.cpp │ └── test_sum.cpp ├── noexcept │ ├── CMakeLists.txt │ └── main.cpp ├── other │ └── subproject │ │ ├── CMakeLists.txt │ │ └── main.c ├── test_isa_common.hpp ├── test_thread.cpp └── test_thread.hpp └── third_party ├── .clang-format ├── .clang-tidy ├── gtest ├── CMakeLists.txt ├── LICENSE ├── gtest-death-test.h ├── gtest-matchers.h ├── gtest-message.h ├── gtest-param-test.h ├── gtest-printers.h ├── gtest-spi.h ├── gtest-test-part.h ├── gtest-typed-test.h ├── gtest.h ├── gtest_pred_impl.h ├── gtest_prod.h ├── internal │ ├── custom │ │ ├── README.md │ │ ├── gtest-port.h │ │ ├── gtest-printers.h │ │ └── gtest.h │ ├── gtest-death-test-internal.h │ ├── gtest-filepath.h │ ├── gtest-internal.h │ ├── gtest-param-util.h │ ├── gtest-port-arch.h │ ├── gtest-port.h │ ├── gtest-string.h │ └── gtest-type-util.h └── src │ ├── gtest-all.cc │ ├── 
gtest-death-test.cc │ ├── gtest-filepath.cc │ ├── gtest-internal-inl.h │ ├── gtest-matchers.cc │ ├── gtest-port.cc │ ├── gtest-printers.cc │ ├── gtest-test-part.cc │ ├── gtest-typed-test.cc │ ├── gtest.cc │ └── gtest_main.cc ├── ittnotify ├── LICENSE.BSD ├── README.md ├── disable_warnings.h ├── ittnotify.h ├── ittnotify_config.h ├── ittnotify_static.c ├── ittnotify_static.h ├── ittnotify_types.h ├── ittptmark64.S ├── ittptmark64.asm ├── jitprofiling.c ├── jitprofiling.h └── legacy │ └── ittnotify.h ├── level_zero ├── layers │ ├── zel_tracing_api.h │ ├── zel_tracing_ddi.h │ └── zel_tracing_register_cb.h ├── loader │ └── ze_loader.h ├── ze_api.h ├── ze_ddi.h ├── ze_ddi_common.h ├── ze_intel_gpu.h ├── ze_stypes.h ├── zes_api.h ├── zes_ddi.h ├── zet_api.h └── zet_ddi.h ├── mdapi └── metrics_discovery_api.h ├── ngen ├── COPYRIGHT ├── ngen.hpp ├── ngen_asm.hpp ├── ngen_auto_swsb.hpp ├── ngen_compiler_fix.hpp ├── ngen_config_internal.hpp ├── ngen_core.hpp ├── ngen_debuginfo.hpp ├── ngen_decoder.hpp ├── ngen_elf.hpp ├── ngen_emulation.hpp ├── ngen_gen12.hpp ├── ngen_gen8.hpp ├── ngen_interface.hpp ├── ngen_level_zero.hpp ├── ngen_opencl.hpp ├── ngen_pseudo.hpp ├── ngen_register_allocator.cpp ├── ngen_register_allocator.hpp ├── ngen_register_decl.hpp ├── ngen_registers.hpp ├── ngen_sycl.hpp ├── ngen_utils.hpp └── npack │ ├── elf_structs.hpp │ ├── hash.hpp │ ├── neo_packager.hpp │ └── neo_structs.hpp ├── spdlog ├── README.md ├── common-inl.h ├── common.h ├── details │ ├── backtracer-inl.h │ ├── backtracer.h │ ├── circular_q.h │ ├── console_globals.h │ ├── file_helper-inl.h │ ├── file_helper.h │ ├── fmt_helper.h │ ├── log_msg-inl.h │ ├── log_msg.h │ ├── log_msg_buffer-inl.h │ ├── log_msg_buffer.h │ ├── null_mutex.h │ ├── os-inl.h │ ├── os.h │ ├── periodic_worker-inl.h │ ├── periodic_worker.h │ ├── registry-inl.h │ ├── registry.h │ ├── synchronous_factory.h │ └── windows_include.h ├── fmt │ ├── bundled │ │ ├── base.h │ │ ├── core.h │ │ ├── format-inl.h │ │ └── format.h │ └── 
fmt.h ├── formatter.h ├── logger-inl.h ├── logger.h ├── mdc.h ├── pattern_formatter-inl.h ├── pattern_formatter.h ├── sinks │ ├── ansicolor_sink-inl.h │ ├── ansicolor_sink.h │ ├── base_sink-inl.h │ ├── base_sink.h │ ├── basic_file_sink-inl.h │ ├── basic_file_sink.h │ ├── null_sink.h │ ├── ostream_sink.h │ ├── rotating_file_sink-inl.h │ ├── rotating_file_sink.h │ ├── sink-inl.h │ ├── sink.h │ ├── wincolor_sink-inl.h │ └── wincolor_sink.h ├── spdlog-inl.h ├── spdlog.h ├── tweakme.h └── version.h ├── xbyak ├── COPYRIGHT ├── xbyak.h ├── xbyak_bin2hex.h ├── xbyak_mnemonic.h └── xbyak_util.h └── xbyak_aarch64 ├── CMakeLists.txt ├── src ├── err_impl.h ├── util_impl.cpp ├── util_impl.h ├── util_impl_linux.h ├── util_impl_mac.h ├── util_impl_windows.h ├── xbyak_aarch64_impl.cpp ├── xbyak_aarch64_impl.h └── xbyak_aarch64_mnemonic.h └── xbyak_aarch64 ├── xbyak_aarch64.h ├── xbyak_aarch64_adr.h ├── xbyak_aarch64_code_array.h ├── xbyak_aarch64_err.h ├── xbyak_aarch64_gen.h ├── xbyak_aarch64_inner.h ├── xbyak_aarch64_label.h ├── xbyak_aarch64_meta_mnemonic.h ├── xbyak_aarch64_mnemonic_def.h ├── xbyak_aarch64_perf.h ├── xbyak_aarch64_reg.h ├── xbyak_aarch64_util.h └── xbyak_aarch64_version.h /.github/ISSUE_TEMPLATE/documentation.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Request a documentation change 3 | about: Use this template to report documentation issue or request documentation changes 4 | title: '' 5 | labels: 'documentation' 6 | assignees: '' 7 | --- 8 | 9 | # Summary 10 | Include a short summary of the issue or request. Sections below provide 11 | guidance on what factors are considered important for a documentation 12 | issue. 13 | 14 | # URLs 15 | Include pointers to documents that are impacted. 16 | 17 | # Additional details 18 | Provide detailed description of the expected changes in documentation 19 | and suggestions you have. 
20 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Ask a question 3 | about: Use this template for everything that is not a bug or a feature request 4 | title: '' 5 | labels: 'question' 6 | assignees: '' 7 | --- 8 | -------------------------------------------------------------------------------- /.github/automation/aarch64/ci.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "acl": "v52.1.0", 4 | "gcc": "13", 5 | "clang": "17", 6 | "onednn-base": "v3.7" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /.github/codeql-config.yml: -------------------------------------------------------------------------------- 1 | paths: 2 | - ./ 3 | - .github/ 4 | -------------------------------------------------------------------------------- /cmake/template.vcxproj.user: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="utf-8"?> 2 | <Project ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003"> 3 | <PropertyGroup> 4 | <LocalDebuggerEnvironment>PATH=@CTESTCONFIG_PATH@;$(PATH)</LocalDebuggerEnvironment> 5 | <DebuggerFlavor>WindowsLocalDebugger</DebuggerFlavor> 6 | </PropertyGroup> 7 | </Project> 8 | -------------------------------------------------------------------------------- /doc/advanced/design/mem_fmt_blk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_blk.png -------------------------------------------------------------------------------- /doc/advanced/design/mem_fmt_img1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_img1.png -------------------------------------------------------------------------------- /doc/advanced/design/mem_fmt_img2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_img2.png -------------------------------------------------------------------------------- /doc/advanced/design/mem_fmt_padded_blk.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/mem_fmt_padded_blk.png -------------------------------------------------------------------------------- /doc/advanced/design/strides.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/advanced/design/strides.png -------------------------------------------------------------------------------- /doc/build/system_requirements.md: -------------------------------------------------------------------------------- 1 | System Requirements {#dev_guide_system_requirements} 2 | ==================================================== 3 | 4 | oneDNN supports a broad list of hardware platforms, operating systems, and compilers. 5 | For details, see [oneDNN System Requirements](https://github.com/uxlfoundation/oneDNN?tab=readme-ov-file#system-requirements). 
6 | -------------------------------------------------------------------------------- /doc/environment.yml: -------------------------------------------------------------------------------- 1 | name: onednn-doc 2 | channels: 3 | - conda-forge 4 | - nodefaults 5 | dependencies: 6 | - python=3.9.5 7 | - pip=21.1.2 8 | - doxyrest=2.1.2 9 | - doxygen=1.8.14 10 | - graphviz=2.40.1 11 | - sphinx=4.0.2 12 | - sphinx-book-theme=0.0.41 13 | - sphinx-copybutton=0.5.2 14 | -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/binary_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/binary_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/compressed_sdpa_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/conv_bwd_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/conv_bwd_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/conv_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/conv_pattern.png 
-------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/convtranspose_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/convtranspose_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/epilogue_subgraph_conv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_conv.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/epilogue_subgraph_general_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_general_1.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/epilogue_subgraph_general_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_general_2.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/epilogue_subgraph_matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/epilogue_subgraph_matmul.png 
-------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/f2f_conversion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2f_conversion.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/f2q_conversion_general.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_general.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/f2q_conversion_quantized_conv.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_quantized_conv.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/f2q_conversion_quantized_matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_quantized_matmul.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/f2q_conversion_softmax.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/f2q_conversion_softmax.png -------------------------------------------------------------------------------- 
/doc/graph/fusion_patterns/images/fp-gated-mlp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/fp-gated-mlp.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/gated-mlp-swish.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/gated-mlp-swish.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/gqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/gqa.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/interpolate_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/interpolate_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/matmul_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/matmul_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/norm_pattern.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/norm_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/pool_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/pool_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/q2f_conversion_quantized_conv_matmul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/q2f_conversion_quantized_conv_matmul.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/q2f_conversion_quantized_convtranspose.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/q2f_conversion_quantized_convtranspose.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/quantized_conv_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/quantized_conv_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/quantized_convtranspose_pattern.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/quantized_convtranspose_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/quantized_matmul_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/quantized_matmul_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/reduction_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/reduction_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/sdpa-mask-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-1.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/sdpa-mask-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-2.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/sdpa-mask-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-3.png 
-------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/sdpa-mask-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-mask-4.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/sdpa-reorder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa-reorder.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/sdpa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/sdpa.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/softmax_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/softmax_pattern.png -------------------------------------------------------------------------------- /doc/graph/fusion_patterns/images/unary_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/fusion_patterns/images/unary_pattern.png -------------------------------------------------------------------------------- /doc/graph/programming_model/images/bf16_programming.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/programming_model/images/bf16_programming.jpg -------------------------------------------------------------------------------- /doc/graph/programming_model/images/img_graph_programming_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/programming_model/images/img_graph_programming_model.png -------------------------------------------------------------------------------- /doc/graph/programming_model/images/int8_programming.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/programming_model/images/int8_programming.jpg -------------------------------------------------------------------------------- /doc/graph/rst/graph_programming_model.rst: -------------------------------------------------------------------------------- 1 | Programming Model 2 | ################# 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | dev_guide_graph_basic_concepts 8 | dev_guide_graph_low_precision 9 | -------------------------------------------------------------------------------- /doc/graph/rst/images/other_pattern.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/graph/rst/images/other_pattern.png -------------------------------------------------------------------------------- /doc/performance_considerations/benchdnn.md: -------------------------------------------------------------------------------- 1 | Benchmarking Performance {#dev_guide_benchdnn} 2 | ============================================== 3 | 4 | oneDNN has a built-in benchmarking program called benchdnn. 5 | 6 | For a complete description of the available options and working examples, see 7 | the [benchdnn readme](https://github.com/uxlfoundation/oneDNN/blob/main/tests/benchdnn/README.md#benchdnn). 
8 | -------------------------------------------------------------------------------- /doc/performance_considerations/vtune.md: -------------------------------------------------------------------------------- 1 | Profiling with VTune(TM) Profiler {#dev_guide_vtune} 2 | ======================================================== 3 | 4 | See @ref dev_guide_profilers 5 | -------------------------------------------------------------------------------- /doc/primitives/images/unrolled_stack_rnn.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/primitives/images/unrolled_stack_rnn.jpg -------------------------------------------------------------------------------- /doc/programming_model/images/img_depthwise_fusion.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_depthwise_fusion.jpg -------------------------------------------------------------------------------- /doc/programming_model/images/img_dnnl_object_snapshot.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_dnnl_object_snapshot.jpg -------------------------------------------------------------------------------- /doc/programming_model/images/img_dnnl_programming_flow.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_dnnl_programming_flow.jpg -------------------------------------------------------------------------------- /doc/programming_model/images/img_overview_flow.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_overview_flow.jpg -------------------------------------------------------------------------------- /doc/programming_model/images/img_programming_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/programming_model/images/img_programming_model.png -------------------------------------------------------------------------------- /doc/rst/advanced_topics.rst: -------------------------------------------------------------------------------- 1 | Advanced Topics 2 | ##################### 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | dev_guide_transition_to_dnnl 8 | dev_guide_understanding_memory_formats 9 | dev_guide_int8_computations 10 | dev_guide_primitive_cache 11 | dev_guide_persistent_cache 12 | dev_guide_threadpool 13 | dev_guide_sparsity 14 | dev_guide_experimental 15 | -------------------------------------------------------------------------------- /doc/rst/build_and_link.rst: -------------------------------------------------------------------------------- 1 | Build and Link oneDNN 2 | ##################### 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | dev_guide_system_requirements 8 | dev_guide_build 9 | dev_guide_build_options 10 | dev_guide_link -------------------------------------------------------------------------------- /doc/rst/graph_extension.rst: -------------------------------------------------------------------------------- 1 | Graph Extension 2 | ############### 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | graph_programming_model 8 | graph_supported_operations 9 | graph_fusion_patterns 10 | dev_guide_graph_dump 11 | dev_guide_constant_tensor_cache 12 | -------------------------------------------------------------------------------- /doc/rst/interop_with_dpcpp_and_opencl.rst: -------------------------------------------------------------------------------- 1 | Interoperability with DPC++ and OpenCL 2 | ######################################## 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | dev_guide_opencl_interoperability.rst 8 | dev_guide_dpcpp_interoperability.rst 9 | -------------------------------------------------------------------------------- /doc/rst/performance_profiling_and_inspection.rst: -------------------------------------------------------------------------------- 1 | Performance Profiling and Inspection 2 | ######################################## 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | dev_guide_verbose 8 | dev_guide_performance_settings 9 | dev_guide_benchdnn 10 | dev_guide_profilers 11 | dev_guide_inspecting_jit 12 | page_performance_profiling_cpp 13 | dev_guide_cpu_dispatcher_control 14 | dev_guide_cpu_isa_hints 15 | dev_guide_verbose_table 16 | -------------------------------------------------------------------------------- /doc/rst/programming_model.rst: -------------------------------------------------------------------------------- 1 | oneDNN Concepts 2 | ############### 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | page_memory_format_propagation_cpp 8 | dev_guide_inference_and_training_aspects 9 | dev_guide_attributes 10 | dev_guide_data_types 11 | page_cross_engine_reorder_cpp 12 | dev_guide_c_and_cpp_apis 13 | interop_with_dpcpp_and_opencl -------------------------------------------------------------------------------- /doc/rst/supported_primitives.rst: -------------------------------------------------------------------------------- 1 | Supported Primitives 2 | ##################### 3 | 4 | .. 
toctree:: 5 | :maxdepth: 1 6 | 7 | dev_guide_convolution 8 | dev_guide_inner_product 9 | dev_guide_matmul 10 | dev_guide_rnn 11 | dev_guide_batch_normalization 12 | dev_guide_binary 13 | dev_guide_concat 14 | dev_guide_eltwise 15 | dev_guide_group_normalization 16 | dev_guide_layer_normalization 17 | dev_guide_lrn 18 | dev_guide_pooling 19 | dev_guide_prelu 20 | dev_guide_resampling 21 | dev_guide_shuffle 22 | dev_guide_softmax 23 | dev_guide_sum 24 | dev_guide_reorder 25 | dev_guide_reduction 26 | -------------------------------------------------------------------------------- /doc/rst/ukernels.rst: -------------------------------------------------------------------------------- 1 | Ukernels 2 | ##################### 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | dev_guide_ukernel_basic_concepts.rst 8 | dev_guide_ukernel_brgemm.rst 9 | dev_guide_ukernel_transform.rst 10 | page_cpu_brgemm_example_cpp.rst 11 | -------------------------------------------------------------------------------- /doc/sphinx/_static/favicons.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/sphinx/_static/favicons.png -------------------------------------------------------------------------------- /doc/sphinx/_static/oneAPI-rgb-rev-100.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/sphinx/_static/oneAPI-rgb-rev-100.png -------------------------------------------------------------------------------- /doc/usage_models/images/img_bf16_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_bf16_diagram.png 
-------------------------------------------------------------------------------- /doc/usage_models/images/img_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_diagram.png -------------------------------------------------------------------------------- /doc/usage_models/images/img_inference_scope.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_inference_scope.jpg -------------------------------------------------------------------------------- /doc/usage_models/images/img_multiscalar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_multiscalar.png -------------------------------------------------------------------------------- /doc/usage_models/images/img_singlescalar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_singlescalar.png -------------------------------------------------------------------------------- /doc/usage_models/images/img_training_inference_scope.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/uxlfoundation/oneDNN/da36b74f751a4caf34c739bfc56754cdaa31555e/doc/usage_models/images/img_training_inference_scope.jpg -------------------------------------------------------------------------------- /doc/usage_models/training.md: -------------------------------------------------------------------------------- 1 | Training {#dev_guide_training} 2 | 
============================== 3 | 4 | NEW_CONTENT_GOES_HERE 5 | 6 | ## fp32 Training 7 | 8 | NEW_CONTENT_GOES_HERE 9 | 10 | ## bfp16 Training 11 | 12 | NEW_CONTENT_GOES_HERE 13 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 80 3 | include = 'scripts\/.*\.pyi?$' 4 | -------------------------------------------------------------------------------- /src/cpu/jit_utils/linux_perf/README.md: -------------------------------------------------------------------------------- 1 | This is an implementation of the jitdump format used by Linux perf. See the 2 | [spec](https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/plain/tools/perf/Documentation/jitdump-specification.txt) for details. 3 | -------------------------------------------------------------------------------- /src/gpu/intel/jit/gemm/.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: '-*,misc-definitions-in-headers' 2 | CheckOptions: 3 | - { key: HeaderFileExtensions, value: "x" } 4 | -------------------------------------------------------------------------------- /src/gpu/intel/microkernels/.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: '-*,misc-definitions-in-headers' 2 | CheckOptions: 3 | - { key: HeaderFileExtensions, value: "x" } 4 | -------------------------------------------------------------------------------- /tests/benchdnn/doc/knobs_dir.md: -------------------------------------------------------------------------------- 1 | # Direction 2 | 3 | **Benchdnn** renames the library propagation kind abstraction into "direction". 
4 | The following direction values are supported: 5 | 6 | | Prop kind | Description 7 | | :--- | :--- 8 | | FWD_B | dnnl_forward_training w/ bias 9 | | FWD_D | dnnl_forward_training w/o bias 10 | | FWD_I | dnnl_forward_inference 11 | | BWD_D | dnnl_backward_data 12 | | BWD_WB | dnnl_backward_weights w/ bias 13 | | BWD_W | dnnl_backward_weights w/o bias 14 | | BWD_DW | dnnl_backward 15 | 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/harness_binary_regression: -------------------------------------------------------------------------------- 1 | # repeated sum with varying scale 2 | --reset --attr-post-ops=sum+relu+sum:2 8x8x3x5:8x8x3x5_n"multisum" 3 | 4 | # Curious edge case in GPU JIT-reorder-based binary 5 | --reset --alg=ADD --stag=ABcd32a16b:ABcd32a16b --dtag=acdb --sdt=f16:f16 --ddt=f16 64x168x42x42:64x168x42x42 6 | 7 | # Mixed src1/post-op src broadcast 8 | --reset --attr-post-ops=add:f32:2 1x17:1x1 9 | 10 | # per_w broadcasting strategy 11 | --reset --attr-post-ops=mul:f32:4+add:f32:4 --alg=add 1x20x768:1x20x1 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/option_set_src0_bcast: -------------------------------------------------------------------------------- 1 | --stag=abx:abx 1x5x1x1:3x5x6x9 2 | 4x1x4x4:4x4x1x4 3 | 1x1x1x1:16x12x2x2 4 | 1x12:12x1 5 | 6 | --stag=aBx8b:aBx16b 2x16x1x1:2x16x5x7 7 | 8 | --stag=aBx16b:axb 1x16x5x7:2x16x5x7 9 | 2x16x1x1:2x16x1x1 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/shapes_ci: -------------------------------------------------------------------------------- 1 | 3x5x6x9:3x5x6x9 2 | 5x3x2x9:1x3x2x9 3 | 32x17x2x3:32x17 4 | 32x17x2x3:1x17 5 | 15x12x3x5:15 6 | 15x12x3x5:1 7 | 5x3x1x9:1x3x2x9 8 | 12x12:1x12 9 | 12x1:1x12 10 | 2x3x48:1x3x48 11 | -------------------------------------------------------------------------------- 
/tests/benchdnn/inputs/binary/shapes_perf_1st_conv: -------------------------------------------------------------------------------- 1 | 64x3x224x224:1x3x1x1 2 | 256x3x224x224:1x3x1x1 3 | 512x3x224x224:1x3x1x1 4 | 1024x3x224x224:1x3x1x1 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/shapes_perf_scaleshift: -------------------------------------------------------------------------------- 1 | 1024x2048x10x10:1x2048x1x1 2 | 1024x1024x19x19:1x1024x1x1 3 | 1024x512x38x38:1x512x1x1 4 | 1024x256x75x75:1x256x1x1 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/test_binary_all: -------------------------------------------------------------------------------- 1 | # all 2 | --reset 3 | 4 | --batch=harness_binary_f32 5 | --batch=harness_binary_bf16 6 | --batch=harness_binary_f16 7 | --batch=harness_binary_i8 8 | --batch=harness_binary_different_dt 9 | --batch=harness_binary_regression 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/test_binary_bfloat16: -------------------------------------------------------------------------------- 1 | # bf16 (for legacy infra support) 2 | --reset 3 | 4 | --batch=harness_binary_bf16 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/test_binary_different_dt_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=false # Different src and dst data types do not support in-place mode.
4 | --ddt=s8,u8,f32,s32 --sdt=s8:u8,u8:s8,s8:f32,f32:u8,f32:f32,f32:s32,s32:f32 5 | --alg=ADD,MUL,MAX,MIN,DIV,SUB,GE,GT,LE,LT,EQ,NE,SELECT 6 | --stag=abx:any,axb:any 7 | --batch=shapes_ci 8 | 9 | --alg=ADD 10 | --attr-post-ops=,add:f32:per_oc+sum:2+linear:2:1:3 11 | --attr-scales=,src:common:0.5+src1:common:0.25 12 | --batch=shapes_ci 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/binary/test_binary_float16: -------------------------------------------------------------------------------- 1 | # f16 2 | --reset 3 | 4 | --batch=harness_binary_f16 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/perf_bnorm_gpu: -------------------------------------------------------------------------------- 1 | --reset --batch=option_set_fwks_key_gpu 2 | --reset --batch=option_set_fwks_ext_gpu 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/set_nd: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d 2 | --batch=shapes_2d 3 | --batch=shapes_3d 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/shapes_1d: -------------------------------------------------------------------------------- 1 | # random 1d problems 2 | 3 | mb1ic16iw1n"bnorm1d:1" 4 | mb1ic32iw30n"bnorm1d:2" 5 | mb32ic128iw10n"bnorm1d:3" 6 | mb32ic27iw7n"bnorm1d:4" 7 | mb9ic128iw1n"bnorm1d:5" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/shapes_2d: -------------------------------------------------------------------------------- 1 | # random 2d problems 2 | 3 | mb1ic1ih1_n"bnorm_2d:1" 4 | mb1ic3ih1_n"bnorm_2d:2" 5 | mb1ic256ih28_n"bnorm_2d:3" 6 | mb1ic257ih28_n"bnorm_2d:4" 7 | mb1ic1024ih28iw17_n"bnorm_2d:5" 8 | mb2ic128ih2_n"bnorm_2d:catch_cancel" 9 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/shapes_3d: -------------------------------------------------------------------------------- 1 | # random 3d problems 2 | 3 | mb1ic16id40_n"bnorm_3d:1" 4 | mb32ic128id12ih10iw8_n"bnorm_3d:2" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/shapes_ci: -------------------------------------------------------------------------------- 1 | # random problems 2 | 3 | ic23_n"bnorm_ci_0d:1" 4 | ic32iw32_n"bnorm_ci_1d:1" 5 | mb5ic33iw27_n"bnorm_ci_1d:2" 6 | ic256ih28_n"bnorm_ci_2d:1" # Used in smoke validation, don't change the name 7 | mb4ic200ih17iw16_n"bnorm_ci_2d:2" # Used in smoke validation, don't change the name 8 | ic16id12_n"bnorm_ci_3d:1" 9 | mb1ic19id4ih6iw6_n"bnorm_ci_3d:2" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/shapes_large: -------------------------------------------------------------------------------- 1 | # Large minibatch 2 | mb3072ic64ih112 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/shapes_regressions: -------------------------------------------------------------------------------- 1 | # Blocking bugs 2 | mb2ic251ih28 # 2-core system w/ 1.5MB LLC slices 3 | mb2ic64ih224 # 2-core system w/ 2.5MB LLC slices 4 | mb2ic64ih56 # 4-core system w/ 1.5MB LLC slices 5 | mb8ic322ih16 # 4-core system w/ 2.5MB LLC slices 6 | mb88ic600ih16 # 36-core system w/ 1.5MB LLC slices 7 | mb88ic980ih16 # 44-core system w/ 2.5MB LLC slices 8 | mb88ic800ih16 # 48-core system w/ 1.5MB LLC slices 9 | mb88ic900ih16 # 56-core system w/ 1.5MB LLC slices 10 | 11 | # Spatial threading bugs for bfloat16 12 | mb12ic4ih8 13 | mb1ic24ih14iw14 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/test_bnorm_regressions: 
-------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true 4 | --tag=axb,abx,aBx8b,aBx16b 5 | 6 | # training 7 | --dir=FWD_D,BWD_DW 8 | --dt=f32,bf16 9 | --flags=,G,CH,R,GCHR 10 | --batch=shapes_regressions 11 | 12 | # inference 13 | --dir=FWD_I 14 | 15 | --dt=f32,bf16 16 | --flags=,GCH 17 | --attr-post-ops=,relu 18 | --batch=shapes_regressions 19 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/test_bnorm_regressions_large: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --skip-impl=ref 4 | 5 | --tag=axb,abx,aBx16b --dir=FWD_D,BWD_DW --flags=CHR --batch=shapes_large 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/bnorm/test_bnorm_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*bnorm_ci_2d.* # Use 2d problems only from shapes_ci 4 | --inplace=false 5 | --tag=axb 6 | 7 | # training 8 | --dir=FWD_D,BWD_DW 9 | --dt=f32,bf16,f16 10 | --flags=,G,CHR 11 | --batch=shapes_ci 12 | ## no scale or shift support for backward_data 13 | --dir=BWD_D 14 | --flags=,G,R 15 | --batch=shapes_ci 16 | 17 | # inference 18 | --dir=FWD_I 19 | 20 | --dt=f32,bf16,f16 21 | --flags=,G,CH 22 | --attr-post-ops=,relu 23 | --batch=shapes_ci 24 | --attr-post-ops= 25 | 26 | --dt=f16 27 | --flags=,G,CH 28 | --batch=shapes_ci 29 | 30 | --dt=s8 31 | --flags=G,GCH 32 | --batch=shapes_ci 33 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/harness_brgemm_f32: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dt=f32 4 | --bia_dt=undef,f32 5 | --beta=0,1 6 | --attr-post-ops=,sum:2,relu 7 | --ld=,:160:96 8 | --batch=option_set_f32 9 | 10 | # Separate cases for non-default alpha 11 | --reset 12 | 
--dt=f32 13 | --alpha=2 14 | --batch=shapes_2d_no_tail_f32 15 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/harness_brgemm_fpmath: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --attr-fpmath=bf16 4 | 5 | # f32 6 | --dt=f32 7 | --bia_dt=undef,f32 8 | --beta=0,1 9 | --batch=option_set_f32 10 | 11 | # Separate cases for non-default alpha 12 | --reset 13 | --alpha=2 14 | --batch=shapes_2d_no_tail_f32 15 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/option_set_bf16: -------------------------------------------------------------------------------- 1 | # Incorporates all meaningful shapes with bs included. 2 | # bs > 1 is applicable when K is divisible by K-block size. 3 | 4 | --bs=1,7,16 5 | 6 | --batch=shapes_2d_no_tail_bf16 7 | 8 | --batch=shapes_2d_tail_n_bf16 9 | 10 | --bs=1 11 | 12 | --batch=shapes_2d_tail_k_bf16 13 | --batch=shapes_2d_big_k_bf16 14 | 15 | --batch=shapes_2d_tail_k_tail_n_bf16 16 | --batch=shapes_2d_big_k_tail_n_bf16 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/option_set_f32: -------------------------------------------------------------------------------- 1 | # Incorporates all meaningful shapes with bs included. 2 | # bs > 1 is applicable when K is divisible by K-block size. 
3 | 4 | --bs=1,7,16 5 | 6 | --batch=shapes_2d_no_tail_f32 7 | 8 | --batch=shapes_2d_tail_n_f32 9 | 10 | --bs=1 11 | 12 | --batch=shapes_2d_tail_k_f32 13 | --batch=shapes_2d_big_k_f32 14 | 15 | --batch=shapes_2d_tail_k_tail_n_f32 16 | --batch=shapes_2d_big_k_tail_n_f32 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/option_set_int8: -------------------------------------------------------------------------------- 1 | # Incorporates all meaningful shapes with bs included. 2 | # bs > 1 is applicable when K is divisible by K-block size. 3 | 4 | --bs=1,7,16 5 | 6 | --batch=shapes_2d_no_tail_int8 7 | 8 | --batch=shapes_2d_tail_n_int8 9 | 10 | --bs=1 11 | 12 | --batch=shapes_2d_tail_k_int8 13 | --batch=shapes_2d_big_k_int8 14 | 15 | --batch=shapes_2d_tail_k_tail_n_int8 16 | --batch=shapes_2d_big_k_tail_n_int8 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/shapes_2d_big_k_bf16: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1x256:256x16_n"bf16:big_k:0" 4 | 1x256:256x32_n"bf16:big_k:1" 5 | 1x256:256x48_n"bf16:big_k:2" 6 | 1x256:256x64_n"bf16:big_k:3" 7 | 8 | 9 | 13x256:256x16_n"bf16:big_k:4" 10 | 13x256:256x32_n"bf16:big_k:5" 11 | 13x256:256x48_n"bf16:big_k:6" 12 | 13x256:256x64_n"bf16:big_k:7" 13 | 14 | 15 | 16x256:256x16_n"bf16:big_k:8" 16 | 16x256:256x32_n"bf16:big_k:9" 17 | 16x256:256x48_n"bf16:big_k:10" 18 | 16x256:256x64_n"bf16:big_k:11" 19 | 20 | 21 | 64x256:256x16_n"bf16:big_k:12" 22 | 64x256:256x32_n"bf16:big_k:13" 23 | 64x256:256x48_n"bf16:big_k:14" 24 | 64x256:256x64_n"bf16:big_k:15" 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/shapes_2d_big_k_f32: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1x136:136x16_n"f32:big_k:0" 4 | 1x136:136x32_n"f32:big_k:1" 5 | 
1x136:136x48_n"f32:big_k:2" 6 | 1x136:136x64_n"f32:big_k:3" 7 | 8 | 9 | 13x136:136x16_n"f32:big_k:4" 10 | 13x136:136x32_n"f32:big_k:5" 11 | 13x136:136x48_n"f32:big_k:6" 12 | 13x136:136x64_n"f32:big_k:7" 13 | 14 | 15 | 16x136:136x16_n"f32:big_k:8" 16 | 16x136:136x32_n"f32:big_k:9" 17 | 16x136:136x48_n"f32:big_k:10" 18 | 16x136:136x64_n"f32:big_k:11" 19 | 20 | 21 | 64x136:136x16_n"f32:big_k:12" 22 | 64x136:136x32_n"f32:big_k:13" 23 | 64x136:136x48_n"f32:big_k:14" 24 | 64x136:136x64_n"f32:big_k:15" 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/shapes_2d_big_k_int8: -------------------------------------------------------------------------------- 1 | 2 | 3 | 1x512:512x16_n"int8:big_k:0" 4 | 1x512:512x32_n"int8:big_k:1" 5 | 1x512:512x48_n"int8:big_k:2" 6 | 1x512:512x64_n"int8:big_k:3" 7 | 8 | 9 | 13x512:512x16_n"int8:big_k:4" 10 | 13x512:512x32_n"int8:big_k:5" 11 | 13x512:512x48_n"int8:big_k:6" 12 | 13x512:512x64_n"int8:big_k:7" 13 | 14 | 15 | 16x512:512x16_n"int8:big_k:8" 16 | 16x512:512x32_n"int8:big_k:9" 17 | 16x512:512x48_n"int8:big_k:10" 18 | 16x512:512x64_n"int8:big_k:11" 19 | 20 | 21 | 64x512:512x16_n"int8:big_k:12" 22 | 64x512:512x32_n"int8:big_k:13" 23 | 64x512:512x48_n"int8:big_k:14" 24 | 64x512:512x64_n"int8:big_k:15" 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/test_brgemm_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=test_brgemm_ci 4 | 5 | --batch=test_brgemm_f32 6 | 7 | --batch=test_brgemm_bf16 8 | 9 | --batch=test_brgemm_f16 10 | 11 | --batch=test_brgemm_f8 12 | 13 | --batch=test_brgemm_int8 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/test_brgemm_f32: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_brgemm_f32 4 | 
5 | --batch=harness_brgemm_fpmath 6 | 7 | --reset 8 | --brgemm-attr=generate_skip_accumulation:1 9 | --beta=0,1 10 | --bia_dt=f32 11 | --dt=f32 12 | --batch=option_set_f32 13 | 14 | # ukernel wtag support 15 | --reset 16 | --wtag=ba 17 | --dt=f32 18 | --batch=option_set_f32 19 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/test_brgemm_f8: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_brgemm_f8 4 | 5 | # ukernel wtag support 6 | --reset 7 | --wtag=ba 8 | --dt=f8_e5m2,f8_e4m3 9 | --batch=option_set_int8 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/test_brgemm_regression: -------------------------------------------------------------------------------- 1 | # Incorrect mask comparison in lazy hw config, need two problems to run back-to-back 2 | --reset --dt=bf16 128x448:448x32 42x608:608x32 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/brgemm/test_brgemm_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*64:64.* # Use shapes from ci with K=64 only 4 | 5 | --bs=16 6 | --dt=f32 7 | --bia_dt=undef,f32 8 | --attr-post-ops=,relu 9 | --batch=shapes_2d_no_tail_f32 10 | 11 | --dt=f16,bf16 12 | --batch=shapes_2d_no_tail_bf16 13 | 14 | --dt=u8:s8:f32,s8:s8:s32 15 | --attr-scales=,src:common:0.5+wei:per_oc 16 | --attr-zero-points=,src:common:128+dst:common:-1 17 | --batch=shapes_2d_no_tail_int8 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/concat/test_concat_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dtag=undef,any,abx,axb 4 | --sdt=f32,bf16,f16,s32,s8,u8 5 | --ddt=f32,bf16,f16,s32,s8,u8 6 | --stag=abx:abx,axb:axb 7 | 
--axis=1 8 | 2x16x3x4:2x16x3x4 9 | 3x5x5x17:3x4x5x17 10 | --axis=0 11 | 32x4x5x17:16x4x5x17 12 | 1x16x3x4:2x16x3x4 13 | 14 | --stag=abx:abx:abx,axb:axb:axb 15 | --axis=1 16 | --attr-scales=,msrc0:common:1.5,msrc0:common:1.5+msrc1:common:2.5 17 | 6x48x3x4x5:6x32x3x4x5:6x16x3x4x5 18 | 6x48x3x4x5:6x31x3x4x5:6x16x3x4x5 19 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/concat/test_concat_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dtag=undef 4 | --sdt=f32,bf16,f16,s8,u8 5 | --ddt=f32,bf16,f16,s8,u8 6 | --stag=axb:axb 7 | --axis=1 8 | 2x16x3x4:2x16x3x4 9 | 3x5x5x17:3x4x5x17 10 | 11 | --stag=axb:axb:axb 12 | --axis=1 13 | --attr-scales=,msrc0:common:1.5+msrc1:common:2.5 14 | 6x48x3x4x5:6x32x3x4x5:6x16x3x4x5 15 | 6x48x3x4x5:6x31x3x4x5:6x16x3x4x5 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/harness_conv_arbitrary_dst: -------------------------------------------------------------------------------- 1 | --reset 2 | --mb=2,16 3 | --dir=FWD_B 4 | 5 | # mixed halfs 6 | --dt=bf16,bf16:bf16:f32,f16:f16:f32 7 | --batch=set_gpu 8 | 9 | #int8dst 10 | --dt=f16:f16:s8,f16:f16:u8,bf16:bf16:s8,bf16:bf16:u8,f32:f32:s8,f32:f32:u8 11 | --batch=set_gpu 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/harness_conv_auto: -------------------------------------------------------------------------------- 1 | # auto algo 2 | --reset --dt=f32 --alg=auto 3 | --dir=FWD_B --batch=shapes_auto 4 | --dir=BWD_D --batch=shapes_auto 5 | --dir=BWD_WB --batch=shapes_auto 6 | --dt=u8:s8:s8 7 | --dir=FWD_B --batch=shapes_auto 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/harness_conv_deepbench: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 
| --dir=FWD_B,BWD_D,BWD_W 4 | --batch=shapes_deepbench_inference_device 5 | --batch=shapes_deepbench_inference_server 6 | --batch=shapes_deepbench_training 7 | 8 | --dir=FWD_B 9 | --attr-post-ops=relu 10 | --batch=shapes_deepbench_inference_device 11 | --batch=shapes_deepbench_inference_server 12 | --batch=shapes_deepbench_training 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/harness_conv_dilated_int8: -------------------------------------------------------------------------------- 1 | # Dilated Int8 2 | --reset 3 | --mb=2 4 | --dir=FWD_D 5 | --dt=u8:s8:u8,s8:s8:u8 6 | --batch=shapes_dilated_rfcn 7 | --match=.*fc6.* --batch=shapes_ssd_300_voc0712 8 | 9 | --reset 10 | --dir=FWD_D --mb=2 11 | --skip-impl=ref,x64:gemm 12 | --dt=u8:s8:s32 13 | --batch=set_dilated-conv_1st 14 | --batch=set_dilated-conv 15 | --batch=set_dilated-conv_3d 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/harness_conv_f32: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset --dt=f32 3 | --mb=2 # for fwd and bwd_d reduce mb 4 | --dir=FWD_B --batch=set_conv_all 5 | --dir=BWD_D --batch=set_conv_all 6 | --dir=BWD_WB --batch=set_conv_all 7 | --mb=0 # for bwd_w use the actual mb for 1 topology 8 | --dir=BWD_WB --batch=shapes_resnet_50 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/harness_conv_saturation_int8: -------------------------------------------------------------------------------- 1 | # Test that saturation is handled properly (lightweight test) 2 | --reset 3 | --mb=2 4 | --dir=FWD_B 5 | --dt=u8:s8:u8,u8:s8:s8,u8:s8:s32 6 | --attr-scales=src:common:0.25+wei:common:0.5+dst:common:4294967295 7 | ic16oc16ih4oh4kh1ph0 # jit 1x1 8 | ic16oc16ih4oh4kh3ph0 # jit 9 | ic16oc16ih4oh7kh3ph3 # gemm impl 10 | 11 | --reset 12 | --dt=u8:s8:u8 13 | 
--attr-scales=src:common:0.25+wei:common:0.5+dst:common:4294967295 14 | --attr-post-ops=round 15 | ic16oc16_ih5oh5kh3ph1 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/harness_conv_tags: -------------------------------------------------------------------------------- 1 | # Test different combinations of memory format tags for input tensors 2 | --reset 3 | --mb=2 4 | 5 | # Source or destination tensor may be in plain layout 6 | --stag=any,abx,axb 7 | --dtag=any,abx,axb 8 | 9 | # Training: bf16 and f32 10 | --dt=bf16,f32 11 | --dir=FWD_B,BWD_D,BWD_WB 12 | --batch=shapes_basic 13 | # Inference: int8 14 | --dt=u8:s8:u8 15 | --dir=FWD_B 16 | --batch=shapes_basic 17 | 18 | # Winograd 19 | --alg=wino 20 | --match=.*k[dhw]3.* 21 | --dt=f32 # Training: f32 (no bf16 for Winograd) 22 | --dir=FWD_B,BWD_D,BWD_WB 23 | --batch=shapes_basic 24 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/perf_conv_bdw_1sock: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dt=f32 4 | --dir=FWD_D,BWD_D,BWD_W 5 | --alg=direct 6 | 7 | --mb=0,20,40,80 8 | --batch=set_perf_cpu_large_mb 9 | 10 | --mb=1,4,8 11 | --batch=set_perf_cpu_all_mb 12 | 13 | --reset 14 | ## int8 15 | --dt=u8:s8:u8 16 | --dir=FWD_D 17 | --alg=direct 18 | 19 | --mb=0,20,40,80 20 | --batch=set_perf_cpu_large_mb 21 | 22 | --mb=1,4,8 23 | --batch=set_perf_cpu_all_mb 24 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/perf_conv_clx_1sock: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Direct 4 | ## f32 5 | --dt=f32 6 | --dir=FWD_D,BWD_D,BWD_W 7 | --alg=direct 8 | 9 | --mb=0,26,52,104 10 | --batch=set_perf_cpu_large_mb 11 | 12 | --mb=1,4,8 13 | --batch=set_perf_cpu_all_mb 14 | 15 | --reset 16 | ## int8 17 | --dt=u8:s8:u8 18 | --dir=FWD_D 
19 | --alg=direct 20 | 21 | --mb=0,26,52,104 22 | --batch=set_perf_cpu_large_mb 23 | 24 | --mb=1,4,8 25 | --batch=set_perf_cpu_all_mb 26 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/perf_conv_skx_1sock: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Direct 4 | ## f32 5 | --dt=f32 6 | --dir=FWD_D,BWD_D,BWD_W 7 | --alg=direct 8 | 9 | --mb=0,16,32,64 10 | --batch=set_perf_cpu_large_mb 11 | 12 | --mb=1,4,8 13 | --batch=set_perf_cpu_all_mb 14 | 15 | --reset 16 | ## int8 17 | --dt=u8:s8:u8 18 | --dir=FWD_D 19 | --alg=direct 20 | 21 | --mb=0,16,32,64 22 | --batch=set_perf_cpu_large_mb 23 | 24 | --mb=1,4,8 25 | --batch=set_perf_cpu_all_mb 26 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/perf_conv_xe: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Forward 4 | 5 | # f32 6 | --dir=FWD_B 7 | --dt=f32 8 | --mb=16,64 9 | --batch=set_perf_gpu_large_mb 10 | 11 | --dir=FWD_B 12 | --dt=f32 13 | --mb=1 14 | --batch=set_perf_gpu_all_mb 15 | 16 | # f16 17 | --dt=f16 18 | --dir=FWD_B 19 | 20 | --mb=16,64 21 | --batch=set_perf_gpu_large_mb 22 | 23 | --mb=1 24 | --batch=set_perf_gpu_all_mb 25 | 26 | # Backward 27 | 28 | # f32 29 | --dt=f32 30 | --dir=BWD_D,BWD_W 31 | --mb=64 32 | --batch=set_perf_gpu_large_mb 33 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/perf_conv_xe_hp: -------------------------------------------------------------------------------- 1 | --batch=perf_conv_xe_lp 2 | 3 | --reset 4 | 5 | # Forward, bf16 6 | 7 | --dir=FWD_B 8 | --dt=bf16 9 | --mb=16,64 10 | --batch=set_perf_gpu_large_mb 11 | 12 | --dir=FWD_B 13 | --dt=bf16 14 | --mb=1 15 | --batch=set_perf_gpu_all_mb 16 | 17 | # Backward, bf16 18 | 19 | --dt=bf16 20 | --dir=BWD_D,BWD_W 21 | --mb=64 22 | 
--batch=set_perf_gpu_large_mb 23 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/perf_conv_xe_lp: -------------------------------------------------------------------------------- 1 | --batch=perf_conv_xe 2 | 3 | --reset 4 | 5 | # Forward, int8 6 | --dt=u8:s8:s8 7 | --dir=FWD_B 8 | 9 | --mb=16,64 10 | --batch=set_perf_gpu_large_mb 11 | 12 | --mb=1 13 | --batch=set_perf_gpu_all_mb 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_conv_3d: -------------------------------------------------------------------------------- 1 | # 3D Convolutions 2 | # ngroups = 1; ic = oc = 16 3 | # 1x1 Filter 4 | --batch=shapes_3d_1x1_unit-stride_no-padding 5 | --batch=shapes_3d_1x1_unit-stride_padding 6 | --batch=shapes_3d_1x1_strided_no-padding 7 | --batch=shapes_3d_1x1_strided_padding 8 | # N-sized Filter 9 | # ngroups = 1; ic = 1; oc = 16 10 | --batch=shapes_3d_1st_strided_padding 11 | # ngroups = 1; ic = oc = 16 12 | --batch=shapes_3d_unit-stride_no-padding 13 | --batch=shapes_3d_unit-stride_padding 14 | --batch=shapes_3d_strided_no-padding 15 | --batch=shapes_3d_strided_padding 16 | # Depth + Height 17 | --batch=shapes_3d_2d_strided_padding 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_conv_all: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d 2 | --batch=shapes_1d_wavenet 3 | --batch=shapes_3d_unet 4 | --batch=shapes_alexnet 5 | --batch=shapes_vgg_19 6 | --batch=shapes_resnet_50 7 | --batch=shapes_googlenet_v1 8 | --batch=shapes_googlenet_v2 9 | --batch=shapes_googlenet_v3 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_dilated-conv: -------------------------------------------------------------------------------- 1 | # Dilated Convolution 2 | # 1D 3 | 
--batch=shapes_dilated_1d_strided_no-padding 4 | --batch=shapes_dilated_1d_unit-stride_no-padding 5 | --batch=shapes_dilated_1d_unit-stride_padding 6 | --batch=shapes_dilated_1d_strided_padding 7 | # 2D 8 | --batch=shapes_dilated_2d_strided_no-padding 9 | --batch=shapes_dilated_2d_strided_padding 10 | --batch=shapes_dilated_2d_unit-stride_no-padding 11 | --batch=shapes_dilated_2d_unit-stride_padding 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_dilated-conv_1st: -------------------------------------------------------------------------------- 1 | # Dilated Convolutions 2 | # ic = 1 for 1st convolution code-path 3 | --batch=shapes_dilated_1d_1st_strided_padding 4 | --batch=shapes_dilated_2d_1st_strided_padding 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_dilated-conv_3d: -------------------------------------------------------------------------------- 1 | # 3D Dilated Convolution 2 | # N-sized Filter 3 | --batch=shapes_dilated_3d_strided_no-padding 4 | --batch=shapes_dilated_3d_strided_padding 5 | --batch=shapes_dilated_3d_unit-stride_no-padding 6 | --batch=shapes_dilated_3d_unit-stride_padding 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_fastrcnn: -------------------------------------------------------------------------------- 1 | # FastRCNN 2 | --batch=shapes_fastrcnn_p1 3 | --batch=shapes_fastrcnn_p2 4 | #--batch=shapes_fastrcnn_p3 # no unique conv. 
shapes 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_gpu: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d_wavenet 2 | --batch=shapes_alexnet 3 | --batch=shapes_resnet_50 4 | --batch=shapes_3d_gpu 5 | --batch=shapes_mobilenet_dw 6 | --batch=shapes_tails_gpu 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_maskrcnn: -------------------------------------------------------------------------------- 1 | # MaskRCNN 2 | --batch=shapes_maskrcnn_p1 3 | --batch=shapes_maskrcnn_p2 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_perf_cpu_all_mb: -------------------------------------------------------------------------------- 1 | --batch=set_perf_cpu_large_mb 2 | --batch=set_perf_cpu_small_mb 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_perf_cpu_inference_only: -------------------------------------------------------------------------------- 1 | --batch=shapes_ssd_resnet34_inference 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_perf_cpu_large_mb: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d_wavenet 2 | --batch=shapes_alexnet 3 | --batch=shapes_densnet 4 | --batch=shapes_efficientdet 5 | --batch=shapes_googlenet_v1 6 | --batch=shapes_googlenet_v2 7 | --batch=shapes_googlenet_v3 8 | --batch=shapes_mobilenet 9 | --batch=shapes_mobilenet_dw 10 | --batch=shapes_resnet_50 11 | --batch=shapes_resnet_50_sparse 12 | --batch=shapes_ssd_300_voc0712 13 | --batch=shapes_ssd_mobilenet 14 | --batch=shapes_ssd_resnet34_training 15 | --batch=shapes_unet 16 | --batch=shapes_vgg_19 17 | --batch=shapes_yolov2 18 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_perf_cpu_small_mb: -------------------------------------------------------------------------------- 1 | --batch=set_maskrcnn 2 | --batch=set_fastrcnn 3 | --batch=shapes_3d_unet 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_perf_gpu_all_mb: -------------------------------------------------------------------------------- 1 | --batch=set_perf_gpu_large_mb 2 | --batch=set_perf_gpu_small_mb 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_perf_gpu_large_mb: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d_wavenet 2 | --batch=shapes_a3c 3 | --batch=shapes_alexnet 4 | --batch=shapes_densnet 5 | --batch=shapes_ffn 6 | --batch=shapes_googlenet_v3 7 | --batch=shapes_mobilenet 8 | --batch=shapes_mobilenet_dw 9 | --batch=shapes_pointnet 10 | --batch=shapes_resnet_50 11 | --batch=shapes_resnet_50_sparse 12 | --batch=shapes_unet 13 | --batch=shapes_vgg_19 14 | --batch=shapes_yolov2 15 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_perf_gpu_small_mb: -------------------------------------------------------------------------------- 1 | --batch=set_maskrcnn 2 | --batch=set_fastrcnn 3 | --batch=shapes_3d_unet 4 | --batch=shapes_cosmictagger 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/set_topologies_inference_only: -------------------------------------------------------------------------------- 1 | --batch=shapes_ssd_resnet34_inference 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_3d_gpu: -------------------------------------------------------------------------------- 1 | #test 
cases for 3D convolution 2 | 3 | ic16oc16_ih13kh3ph1_iw50kw3pw1_id10kd3pd1_n"3d_conv_pad:1" 4 | ic64oc64_ih10kh3ph1_iw20kw3pw1_id15kd3pd1_n"3d_conv_pad:2" 5 | ic256oc256_ih7kh3ph1_iw9kw3pw1_id11kd3pd1_n"3d_conv_pad:3" 6 | ic256oc256_ih7kh1ph1_iw9kw1pw1_id11kd1pd1_n"3d_conv_pad:4" 7 | ic16oc16_ih13kh3ph0_iw50kw3pw0_id10kd3pd0_n"3d_conv:1" 8 | ic16oc16_ih13kh1ph0_iw50kw1pw0_id10kd1pd0_n"3d_conv:2" 9 | ic256oc256_ih7kh3ph0_iw9kw3pw0_id11kd3pd0_n"3d_conv:3" 10 | ic256oc256_ih7kh1ph0_iw9kw1pw0_id11kd1pd0_n"3d_conv:4" 11 | 12 | #mb1_ic16oc16_id1od2kd1sd1dd1pd0_ih2oh3kh2sh1dh1ph2_iw5ow3kw3sw2dw0pw0 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_a3c: -------------------------------------------------------------------------------- 1 | # A3C 2 | 3 | mb1_g1ic4oc16_ih84oh20kh8sh4dh0ph0_iw84ow20kw8sw4dw0pw0_n"a3c:conv1" 4 | mb1_g1ic16oc32_ih20oh9kh4sh2dh0ph0_iw20ow9kw4sw2dw0pw0_n"a3c:conv2" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_alexnet: -------------------------------------------------------------------------------- 1 | # alexnet 2 | 3 | g1mb256ic3ih227iw227oc96oh55ow55kh11kw11sh4sw4ph0pw0n"alexnet:conv1" 4 | g2mb256ic96ih27oc256oh27kh5ph2n"alexnet:conv2" 5 | mb256ic256ih13oc384oh13kh3ph1n"alexnet:conv3" 6 | g2mb256ic384ih13oc384oh13kh3ph1n"alexnet:conv4" 7 | g2mb256ic384ih13oc256oh13kh3ph1n"alexnet:conv5" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_auto: -------------------------------------------------------------------------------- 1 | mb2_ic3oc64_ih300kh3oh300ph1n"ssd_300_voc0712:conv1_1" 2 | mb9_ic3oc64_ih300kh3oh300ph1n"ssd_300_voc0712:conv1_1" 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_dilated_rfcn: 
-------------------------------------------------------------------------------- 1 | # Dilated Convolution from RFCN 2 | 3 | mb1ic512ih38iw57oc512oh38ow57kh3kw3ph3pw3dh2dw2n"dilated_rfcn" 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_ffn: -------------------------------------------------------------------------------- 1 | # FFN 2 | 3 | mb4ic2id33oc32od33kd3pd1n"ffn:0*2" 4 | mb4ic32id33oc1od33kd1pd0n"ffn:1*3" 5 | mb4ic32id33oc32od33kd3pd1n"ffn:2*69" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_fused_large_src: -------------------------------------------------------------------------------- 1 | # large-mb, small-ic shape to exceed combined L2 cache and target jit impls 2 | mb448_ic16oc64_ih56oh56kh1sh1dh0ph0n"fused_large_src" 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_fused_mobilenet_stride_2: -------------------------------------------------------------------------------- 1 | ic32oc64_ih112oh112kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv1" 2 | ic128oc128_ih56oh56kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv2" 3 | ic256oc256_ih28oh28kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv3" 4 | ic512oc512_ih14oh14kh1sh1dh0ph0n"MobileNet_v1_fused_stride_2:conv4" 5 | 6 | ic16oc96_ih112oh112kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv1" 7 | ic24oc144_ih56oh56kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv2" 8 | ic64oc384_ih28oh28kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv3" 9 | ic96oc576_ih14oh14kh1sh1dh0ph0n"MobileNet_v2_fused_stride_2:conv4" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_large_conv: -------------------------------------------------------------------------------- 1 | # Large iw 2 | mb1ic32oc1iw134217732kw3 3 | mb1ic1oc1iw4294967311kw3 4 | 5 | # Large ic 6 | 
mb1ic4294967311oc1iw1kw1 7 | mb1ic134217732oc1iw27kw3 8 | 9 | # Large oc 10 | mb1ic1oc1342177321iw27kw3 11 | mb1ic1oc4294967311iw1kw1 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_large_padding: -------------------------------------------------------------------------------- 1 | ic16oc16_iw1kw3ow100pw100 2 | ic1oc64_ih55oh18kh30sh3ph29dh2 3 | ic1oc64ih7oh7kh3ph1sh1dh0 4 | ic1oc64iw100ow98kw5pw50sw2dw0 5 | ic2oc64ih30oh37kh14ph10sh1dh2 6 | ic3oc64id28od36kd3pd5sd1dd0 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_pointnet: -------------------------------------------------------------------------------- 1 | # PointNet 2 | 3 | mb10ic1088ih1iw15000oc512oh1ow15000kh1kw1ph0pw0n"pointnet:0*300" 4 | mb10ic128ih1iw15000oc1024oh1ow15000kh1kw1ph0pw0n"pointnet:1*900" 5 | mb10ic128ih1iw15000oc5oh1ow15000kh1kw1ph0pw0n"pointnet:2*300" 6 | mb10ic256ih1iw15000oc128oh1ow15000kh1kw1ph0pw0n"pointnet:3*300" 7 | mb10ic4ih1iw15000oc64oh1ow15000kh1kw1ph0pw0n"pointnet:4*500" 8 | mb10ic512ih1iw15000oc256oh1ow15000kh1kw1ph0pw0n"pointnet:5*300" 9 | mb10ic64ih1iw15000oc128oh1ow15000kh1kw1ph0pw0n"pointnet:6*900" 10 | mb10ic64ih1iw15000oc64oh1ow15000kh1kw1ph0pw0n"pointnet:7*900" 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_regression_gemm: -------------------------------------------------------------------------------- 1 | # ResNext50 2 | mb2_g32ic128oc128_ih56oh56kh3sh1dh0ph1_iw56ow56kw3sw1dw0pw1 3 | 4 | # Faster RCNN 5 | mb1_g64ic256oc256_ih240oh240kh3sh1dh0ph1_iw352ow352kw3sw1dw0pw1 6 | 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_segnet: -------------------------------------------------------------------------------- 1 | # Minibatch 4 is found in the original prototxt 2 | # Not clear if that's what
should be used for perf measurements 3 | 4 | mb4ih240iw320ic64oc64oh240ow320kh7kw7ph3pw3sh1sw1n"conv2" 5 | mb4ih120iw160ic64oc64oh120ow160kh7kw7ph3pw3sh1sw1n"conv3" 6 | mb4ih60iw80ic64oc64oh60ow80kh7kw7ph3pw3sh1sw1n"conv4" 7 | mb4ih480iw640ic64oc64oh480ow640kh7kw7ph3pw3sh1sw1n"conv_decode1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/shapes_vgg_11: -------------------------------------------------------------------------------- 1 | # vgg_11 2 | 3 | mb64ic3ih100oc32oh100kh3ph1n"vgg_11:conv1_1" 4 | mb64ic32ih100oc64oh100kh3ph1n"vgg_11:conv1_2" 5 | mb64ic64ih50oc64oh50kh3ph1n"vgg_11:conv2_1" 6 | mb64ic64ih50oc128oh50kh3ph1n"vgg_11:conv2_2" 7 | mb64ic128ih25oc96oh25kh3ph1n"vgg_11:conv3_1" 8 | mb64ic96ih25oc192oh25kh3ph1n"vgg_11:conv3_2" 9 | mb64ic192ih13oc128oh13kh3ph1n"vgg_11:conv4_1" 10 | mb64ic128ih13oc256oh13kh3ph1n"vgg_11:conv4_2" 11 | mb64ic256ih7oc160oh7kh3ph1n"vgg_11:conv5_1" 12 | mb64ic160ih7oc320oh7kh3ph1n"vgg_11:conv5_2" 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_3d: -------------------------------------------------------------------------------- 1 | # f32 3-D Convolutions 2 | --reset --dt=f32 3 | --mb=2 4 | --skip-impl=ref,x64:gemm # ! test jit version only 5 | --dir=FWD_B,BWD_D,BWD_WB 6 | --batch=shapes_3d 7 | --batch=set_conv_3d 8 | 9 | # i8 3-D Convolution 10 | --reset 11 | --skip-impl=ref,x64:gemm # ! test jit version only 12 | --mb=2 13 | --dt=u8:s8:s8,s8:s8:s32,u8:s8:s8 14 | --batch=shapes_3d 15 | --batch=set_conv_3d 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_3d_f32_plain: -------------------------------------------------------------------------------- 1 | # f32 3-D Convolutions 2 | --reset --dt=f32 3 | --stag=axb --dtag=axb 4 | --mb=2 5 | --skip-impl=ref,x64:gemm # ! 
test jit version only 6 | --dir=FWD_B,BWD_D,BWD_WB 7 | --batch=shapes_3d 8 | --batch=set_conv_3d 9 | 10 | --stag=abx --dtag=abx 11 | --batch=shapes_3d 12 | --batch=set_conv_3d 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_all_topologies: -------------------------------------------------------------------------------- 1 | # Test All Topologies with F32 Configuration 2 | --reset --dt=f32 3 | --skip-impl=ref 4 | --mb=2 5 | --dir=FWD_B,BWD_D,BWD_WB 6 | --batch=set_all_topologies 7 | 8 | --dir=FWD_B 9 | --batch=set_topologies_inference_only 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_all_topologies_f32_plain: -------------------------------------------------------------------------------- 1 | # Test All Topologies with F32 Configuration 2 | --reset --dt=f32 3 | --stag=axb --dtag=axb 4 | --skip-impl=ref 5 | --mb=2 6 | --dir=FWD_B,BWD_D,BWD_WB 7 | --batch=set_all_topologies 8 | 9 | --dir=FWD_B 10 | --batch=set_topologies_inference_only 11 | 12 | --stag=abx --dtag=abx 13 | --dir=FWD_B,BWD_D,BWD_WB 14 | --batch=set_all_topologies 15 | 16 | --dir=FWD_B 17 | --batch=set_topologies_inference_only 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_attrs: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset 3 | --mb=2 4 | --dir=FWD_B 5 | 6 | --dt=f32 7 | --skip-impl=ref,x64:gemm # ! 
test jit version only 8 | --batch=option_set_combined_postops 9 | --skip-impl= 10 | 11 | --dt=f32,f16,bf16,u8:s8:u8 12 | --skip-impl=ref 13 | --batch=option_set_all_eltwise_postops 14 | --skip-impl= 15 | 16 | # f32_wino 17 | --reset 18 | --mb=2 19 | --dir=FWD_B 20 | 21 | --dt=f32 22 | --alg=wino 23 | --attr-post-ops=sum+relu 24 | --batch=shapes_resnet_50 25 | 26 | # Depthwise fusion 27 | --batch=harness_conv_fused_depthwise 28 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_attrs_f32_plain: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset --dt=f32 3 | --mb=2 4 | --stag=axb --dtag=axb 5 | --dir=FWD_B 6 | 7 | --skip-impl=ref,x64:gemm # ! test jit version only 8 | --batch=option_set_combined_postops 9 | 10 | --stag=abx --dtag=abx 11 | --batch=option_set_all_eltwise_postops 12 | 13 | --stag=axb --dtag=axb 14 | --skip-impl=ref 15 | --batch=option_set_all_eltwise_postops 16 | 17 | --stag=abx --dtag=abx 18 | --batch=option_set_all_eltwise_postops 19 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_bfloat16_ymm: -------------------------------------------------------------------------------- 1 | # Currently, only gemm routine implements bf16_ymm hint 2 | # Therefore test bfloat16 GeMM specific functionality 3 | 4 | # global benchdnn knob, will not be reset again 5 | --cpu-isa-hints=prefer_ymm 6 | --reset 7 | 8 | --batch=test_conv_gemm_bfloat16 9 | --batch=test_conv_gemm_bfloat16_nxc 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_depthwise: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset 3 | --mb=2 4 | --dir=FWD_D,BWD_D,BWD_WB,FWD_I --batch=shapes_mobilenet_dw 5 | --dir=FWD_D,BWD_D,BWD_WB,FWD_I --batch=shapes_regression_dw 6 | 7 | # post-ops 8 | 
--dir=FWD_D 9 | --attr-post-ops=relu,sum,sum+relu+add:f32:per_tensor,add:f32:per_oc 10 | --batch=shapes_mobilenet_dw 11 | 12 | --reset --dt=f32 13 | --mb=2 14 | --skip-impl=ref,x64:gemm 15 | --dir=FWD_B,BWD_D,BWD_WB,FWD_I 16 | --batch=set_conv_dw 17 | --batch=shapes_dw_minibatch_2d-spatial 18 | --batch=shapes_dw_minibatch_channel_2d-spatial 19 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_dilated: -------------------------------------------------------------------------------- 1 | # dilated f32 2 | --reset 3 | --mb=2 4 | --dt=f32 5 | --dir=FWD_B,BWD_D,BWD_WB 6 | --batch=shapes_dilated --batch=shapes_dilated_rfcn 7 | --match=.*fc6.* --batch=shapes_ssd_300_voc0712 8 | 9 | --reset --dt=f32 10 | --mb=2 11 | --skip-impl=ref,x64:gemm # ! test jit version only 12 | --dir=FWD_B,BWD_D,BWD_WB 13 | --batch=set_dilated-conv_1st 14 | --batch=set_dilated-conv 15 | --batch=set_dilated-conv_3d 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_dt: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # f32 4 | --batch=harness_conv_f32 5 | 6 | # tails 7 | --reset 8 | --dir=FWD_B,BWD_D,BWD_WB --batch=shapes_tails 9 | 10 | # bf16 11 | --batch=test_conv_bfloat16 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_dt_plain: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # f32 4 | --batch=harness_conv_f32_plain 5 | 6 | # tails 7 | --reset 8 | --skip-impl=ref 9 | --stag=axb --dtag=axb 10 | --dir=FWD_B,BWD_D,BWD_WB --batch=shapes_tails 11 | --stag=abx --dtag=abx 12 | --batch=shapes_tails 13 | 14 | # bf16 15 | --batch=test_conv_bfloat16_nxc 16 | 17 | # f16 18 | --batch=test_conv_float16_nxc 19 | 20 | # fp8 21 | --batch=test_conv_fp8_nxc 22 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_fp4: -------------------------------------------------------------------------------- 1 | --reset --mb=1 --dt=f4_e2m1:f4_e2m1:bf16,f4_e3m0:f4_e3m0:bf16 --dir=fwd_d --batch=shapes_4bit 2 | --reset --mb=1 --dt=f16:f4_e2m1:f4_e2m1,f32:f4_e3m0:f4_e3m0 --dir=bwd_d --batch=shapes_4bit 3 | --reset --mb=1 --dt=f4_e2m1:f16:f4_e2m1,f4_e3m0:f32:f4_e3m0 --dir=bwd_w --batch=shapes_4bit 4 | --reset --mb=1 --dt=f4_e2m1,f4_e3m0 --dir=fwd_d --batch=shapes_4bit 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_function: -------------------------------------------------------------------------------- 1 | # Test functional convolution features 2 | 3 | --reset 4 | 5 | # auto algo 6 | --batch=harness_conv_auto 7 | 8 | # memory-tags 9 | --batch=harness_conv_tags 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_gemm_bfloat16: -------------------------------------------------------------------------------- 1 | # Test Bfloat16 GeMM specific functionality 2 | --reset 3 | --mb=2 4 | --skip-impl=ref 5 | --dir=FWD_B 6 | --dt=bf16:bf16:f32 --batch=shapes_gemm 7 | 8 | --dir=FWD_D 9 | --dt=bf16 --batch=shapes_gemm 10 | 11 | --dir=BWD_D 12 | --dt=f32:bf16:bf16 --batch=shapes_gemm 13 | 14 | --dir=BWD_WB 15 | --dt=bf16:f32:bf16 --batch=shapes_gemm 16 | 17 | --dir=FWD_B,BWD_D,BWD_W 18 | --dt=bf16 19 | --stag=abx --dtag=abx 20 | --batch=shapes_gemm 21 | --batch=shapes_3d_2d_strided_padding 22 | --batch=shapes_dilated_3d_strided_padding 23 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_gemm_bfloat16_nxc: -------------------------------------------------------------------------------- 1 | # bfloat16 2 | --reset 3 | --mb=2 4 | --stag=axb --dtag=axb 5 | --skip-impl=ref 6 
| 7 | --dir=FWD_B 8 | --dt=bf16:bf16:f32,bf16 9 | --batch=shapes_gemm 10 | 11 | --dir=BWD_D 12 | --dt=f32:bf16:bf16,bf16 13 | --batch=shapes_gemm 14 | 15 | --dir=BWD_WB 16 | --mb=0 17 | --dt=bf16:f32:bf16,bf16 18 | --batch=shapes_gemm 19 | 20 | # PostOps + Attributes 21 | --dir=FWD_B 22 | --dt=bf16:bf16:f32,bf16 23 | --mb=2 24 | --attr-post-ops=sum,relu,sum+tanh:1:1:2.5 --batch=shapes_gemm 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_gemm_dt: -------------------------------------------------------------------------------- 1 | # Test GeMM specific functionality 2 | 3 | # f32 4 | --reset --dt=f32 5 | --mb=2 # for fwd and bwd_d reduce mb 6 | --dir=FWD_B,BWD_D,BWD_WB --batch=shapes_gemm 7 | 8 | --stag=abx --dtag=abx 9 | --batch=shapes_3d_2d_strided_padding 10 | --batch=shapes_dilated_3d_strided_padding 11 | 12 | # bf16 13 | --batch=test_conv_gemm_bfloat16 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_gpu_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --stag=any 4 | --dtag=any 5 | --batch=option_gpu_ci 6 | 7 | --stag=axb 8 | --dtag=axb 9 | --batch=option_gpu_ci 10 | 11 | --stag=any 12 | --dtag=axb 13 | --dir=FWD_B 14 | --dt=f64,f32,bf16,f16,s8,f8_e4m3,f8_e5m2 15 | --batch=shapes_ci_gpu 16 | 17 | --dir=BWD_D,BWD_W 18 | --dt=f64,f32,bf16 19 | --batch=shapes_ci_gpu 20 | 21 | --stag=axb 22 | --dtag=any 23 | --dir=FWD_B 24 | --dt=f64,f32,bf16,f16,s8,f8_e4m3,f8_e5m2 25 | --batch=shapes_ci_gpu 26 | 27 | --dir=BWD_D,BWD_W 28 | --dt=f64,f32,bf16 29 | --batch=shapes_ci_gpu 30 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_int8: -------------------------------------------------------------------------------- 1 | # Test Int8 Convolutions 2 | --reset 3 | 4 | --batch=harness_conv_saturation_int8 5 | 
6 | --batch=harness_conv_int8 7 | 8 | --batch=harness_conv_dilated_int8 9 | 10 | --batch=harness_conv_attrs_int8 11 | 12 | --batch=harness_conv_attrs_int8_asymmetric 13 | 14 | --batch=harness_conv_depthwise_int8 15 | 16 | --batch=harness_conv_zero_points 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_regression: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset --dt=f32 3 | --dir=FWD_B,BWD_D,BWD_WB 4 | --batch=shapes_regression_small_spatial 5 | --batch=shapes_regression_padding 6 | --batch=shapes_regression_gemm 7 | --batch=shapes_regression_1x1 8 | --stag=axb --dtag=axb --batch=shapes_regression_1x1 9 | 10 | --reset --dt=f32 11 | --dir=FWD_B 12 | --attr-post-ops=relu 13 | --batch=shapes_regression_small_spatial 14 | --batch=shapes_regression_padding 15 | 16 | # 17 | --batch=harness_conv_regression_general 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_wino_f32: -------------------------------------------------------------------------------- 1 | # f32 wino 2 | --reset 3 | --dt=f32 4 | --alg=wino 5 | --match=.*kh3[^0-9].* # only 3x3 convolutions so far 6 | --mb=2 # for fwd and bwd_d reduce mb 7 | --dir=FWD_I,FWD_B,BWD_D,BWD_WB 8 | --batch=set_conv_all 9 | --batch=shapes_regression_padding 10 | 11 | --mb=0 12 | --batch=shapes_tails 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/conv/test_conv_wino_gpu: -------------------------------------------------------------------------------- 1 | # f16 wino 2 | --reset --dt=f16,f32 --stag=any,nhwc --alg=wino 3 | --match=.*[^k][^d][0-9]kh3[^0-9].* # only 3x3 convolutions so far 4 | --mb=2,32 # for fwd and bwd_d reduce mb 5 | --batch=set_conv_all 6 | --batch=shapes_regression_padding 7 | --batch=shapes_tails 8 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/harness_deconv_regression_general_f32: -------------------------------------------------------------------------------- 1 | # f32 regression : long accumulation chains 2 | --reset --dt=f32 --dir=bwd_w mb28_ic16oc16_id10od10kd3 3 | 4 | # test brgdeconv strided shape that caused segfault 5 | --reset 6 | --skip-impl=ref,x64:gemm 7 | --dt=f32 8 | --dir=fwd_b 9 | mb56_ic2oc3_ih6oh18kh3sh3dh0ph1_iw5ow15kw3sw3dw0pw1_n"2d_strided_with_bias" 10 | 11 | # test brgdeconv strided shape that kd/kh block != kd/kh 12 | --reset 13 | --skip-impl=ref,x64:gemm 14 | --dt=f32 15 | --dir=FWD_B 16 | mb1ic16ih16oc128oh32kh64sh2ph1 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/set_all: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d 2 | --batch=shapes_3d 3 | --batch=shapes_2d 4 | --batch=shapes_dilated 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/shapes_1d: -------------------------------------------------------------------------------- 1 | # 1D 2 | ic8iw5oc8ow2kw3pw3dw2n"deconv1d:1" 3 | g1mb256oc3ow227ic96iw55kw11sw4pw0n"alexnet:deconv1" 4 | g2mb256oc96ow27ic256iw27kw5pw2n"alexnet:deconv2" 5 | mb256oc256ow13ic384iw13kw3pw1n"alexnet:deconv3" 6 | g1mb96ic64iw112oc3ow224kw7sw2pw3n"googlenet_v1:conv1/7x7_s2" 7 | mb1_g1oc3ic64_ow1030iw512kw7sw2dw0pw0_n"masknet_p1:deconv1" 8 | g1mb50ic256iw28oc512ow56kw1sw2pw0n"resnet_50:res3a_branch1" 9 | mb9_ic1oc1_ih1oh1kh1sh1dh0ph0_iw55ow55kw3sw1dw0pw1n"pytorch_unittest" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/test_deconv_all: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset 3 | --skip-impl=ref 4 | --mb=2 5 | 6 | --dir=FWD_B 7 | 
--attr-post-ops=,sum,linear:2:1,sum:1.5+add:f32:per_oc+relu 8 | --batch=set_all 9 | 10 | --dir=BWD_D,BWD_W,BWD_WB 11 | --attr-post-ops= 12 | --batch=set_all 13 | 14 | # int8 15 | --batch=test_deconv_int8 16 | 17 | # bf16 18 | --batch=test_deconv_bfloat16 19 | 20 | # f16 21 | --batch=test_deconv_float16_nxc 22 | 23 | # Regression 24 | --batch=harness_deconv_regression_general_f32 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/test_deconv_all_f32_nxc: -------------------------------------------------------------------------------- 1 | # f32 nxc 2 | --reset 3 | --skip-impl=ref 4 | --mb=2 5 | --stag=axb --dtag=axb 6 | 7 | --dir=FWD_B 8 | --attr-post-ops=,sum+prelu:per_oc,linear:2:1,sum:1.5+add:f32:per_oc+relu 9 | --batch=set_all 10 | 11 | --dir=BWD_D,BWD_W,BWD_WB 12 | --attr-post-ops= 13 | --batch=set_all 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/test_deconv_bfloat16: -------------------------------------------------------------------------------- 1 | # bf16 2 | --reset 3 | --skip-impl=ref 4 | --mb=2 5 | 6 | --dt=bf16 7 | --dir=FWD_B 8 | --attr-post-ops=,sum,linear:2:1,sum:1.5+add:f32:per_oc+relu 9 | --batch=set_all 10 | 11 | --dir=BWD_D,BWD_W,BWD_WB 12 | --attr-post-ops= 13 | --batch=set_all 14 | 15 | --dt=bf16:bf16:f32 --dir=FWD_B --batch=set_all 16 | --dt=f32:bf16:bf16 --dir=BWD_D --batch=set_all 17 | --dt=bf16:f32:bf16 --dir=BWD_WB --batch=set_all 18 | 19 | # Test Deconv w/bias through GeMM 20 | --reset 21 | --skip-impl=ref 22 | --mb=2 23 | 24 | --dt=bf16 25 | --dir=FWD_B,BWD_WB g16_ic32ih4iw8_oc64oh3ow8_kh3kw3sh1sw1ph0pw0n"gemm_shape" 26 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/test_deconv_bfloat16_ymm: -------------------------------------------------------------------------------- 1 | # bf16 2 | 3 | # global benchdnn knob, will not be reset again 4 | 
--cpu-isa-hints=prefer_ymm 5 | --batch=test_deconv_bfloat16 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/deconv/test_deconv_int8: -------------------------------------------------------------------------------- 1 | # int8 2 | --reset 3 | --skip-impl=ref 4 | --mb=2 5 | --dir=FWD_I,FWD_B 6 | --batch=set_all 7 | --batch=shapes_1x1 8 | 9 | --batch=harness_deconv_regression_general_int8 10 | --batch=harness_deconv_attrs_int8 11 | --batch=harness_deconv_attrs_int8_asymmetric 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/harness_eltwise_large_buffer: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Use inplace to reduce memory consumption, most eltwise implementations do not 4 | # use different implementations for inplace. Use smallest supported data type to 5 | # minimize memory usage as well. 6 | 7 | --inplace=true 8 | 9 | --dt=u8 10 | --dir=FWD_D 11 | --attr-post-ops=,add:f32:per_tensor 12 | --batch=shapes_large_buffer 13 | 14 | --reset 15 | --dt=bf16 16 | --dir=BWD_D 17 | --batch=shapes_large_buffer 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/harness_eltwise_regression: -------------------------------------------------------------------------------- 1 | # tag `a` regression check 2 | --reset 3 | --skip-impl= 4 | 5 | --dir=FWD_D 6 | --dt=f32 7 | --tag=a 8 | --attr-post-ops=mul:f32 9 | 16 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/harness_eltwise_saturation: -------------------------------------------------------------------------------- 1 | # int linear saturation check 2 | --reset 3 | 4 | --dt=u8 5 | --tag=abx,axb 6 | --alpha=1,4294967295 7 | --beta=1,4294967295 8 | --alg=linear 9 | --batch=shapes_eltwise 10 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/option_set_all_algs_int8: -------------------------------------------------------------------------------- 1 | # Integer algorithm coverage based on alpha and beta validity 2 | --alpha=0,0.25,-0.25 --beta=0 3 | --alg=relu 4 | --batch=shapes_eltwise 5 | 6 | --alpha=0,0.25,-0.25 --beta=0,0.25,-0.25 7 | --alg=linear 8 | --batch=shapes_eltwise 9 | 10 | --alpha=0,0.25,-0.25 --beta=0,0.25,-0.25 11 | --alg=clip 12 | --batch=shapes_eltwise 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/option_set_all_algs_int8_ci: -------------------------------------------------------------------------------- 1 | # Integer algorithm coverage based on alpha and beta validity 2 | --alpha=-2 --beta=0 3 | --alg=relu 4 | --batch=shapes_ci 5 | 6 | --alpha=1 --beta=2 7 | --alg=linear 8 | --batch=shapes_ci 9 | 10 | --alpha=-2 --beta=3 11 | --alg=clip 12 | --batch=shapes_ci 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/shapes_ci: -------------------------------------------------------------------------------- 1 | 16x16x2x1_n"eltwise_ci_2d:0" # Used in smoke validation, don't change the name 2 | 3x15x2x5x3_n"eltwise_ci_3d:0" 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/shapes_eltwise: -------------------------------------------------------------------------------- 1 | 5x16x3 2 | 16x64x1x1 3 | 3x7x3x2 4 | 2x32x3x2 5 | 32x5x2x3 6 | 2x16x5x2x3 7 | 3x17x2x5x3 8 | 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/shapes_large_buffer: -------------------------------------------------------------------------------- 1 | # Test dimensions with 2 ^ 32 + 1 size to catch integer overflow 2 | 4294967297n"large_dim1" 3 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/test_eltwise_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | --dt=bf16 5 | --tag=abx,axb,aBx8b,aBx16b,ABx16a16b 6 | 7 | --dir=FWD_D,BWD_D 8 | --batch=option_set_all_algs 9 | 10 | --dir=FWD_D 11 | --attr-post-ops=add:f32+mul:f32:per_oc 12 | --batch=option_set_all_algs 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/test_eltwise_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | 5 | --dt=f32,bf16,f16,f8_e5m2,f8_e4m3 6 | --tag=abx,axb 7 | --dir=FWD_D 8 | --attr-post-ops=,mul:s8:per_oc 9 | --batch=option_set_all_algs_ci 10 | --dir=BWD_D 11 | --attr-post-ops= 12 | --batch=option_set_all_algs_ci 13 | 14 | --dir=FWD_I 15 | --dt=s32,s8,u8 16 | --attr-post-ops=,mul:f32 17 | --batch=option_set_all_algs_int8_ci 18 | 19 | 20 | --dt=f64 21 | --dir=BWD_D,FWD_I 22 | --attr-post-ops= 23 | --batch=option_set_all_algs_ci 24 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/test_eltwise_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | --dt=f16 5 | --tag=abx,axb 6 | 7 | --dir=FWD_D,BWD_D 8 | --batch=option_set_all_algs 9 | 10 | --dir=FWD_D 11 | --attr-post-ops=add:f32+mul:f32:per_oc 12 | --batch=option_set_all_algs 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/test_eltwise_float8: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | --dt=f8_e5m2,f8_e4m3 5 | --tag=abx,axb 6 | 7 | --dir=FWD_D,BWD_D 8 | --batch=option_set_all_algs 9 | 10 | --dir=FWD_D 11 | 
--attr-post-ops=add:f32+mul:f32:per_oc 12 | --batch=option_set_all_algs 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/eltwise/test_eltwise_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*eltwise_ci_2d.* # Use 2d problems only from shapes_ci 4 | --inplace=false 5 | 6 | --dt=f32,bf16,f16 7 | --tag=axb 8 | --dir=FWD_D,BWD_D 9 | --batch=option_set_all_algs_ci 10 | 11 | --dir=FWD_I 12 | --dt=s8,u8 13 | --batch=option_set_all_algs_int8_ci 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/gnorm/shapes_all: -------------------------------------------------------------------------------- 1 | # Instance normalization 2 | # Unet-3D (id is divided by 10 for the first 2 cases) 3 | g32mb2ic32id16ih224iw224 4 | g256mb6ic256id2ih28iw28 5 | g320mb2ic320id10ih14iw14 6 | g128mb2ic128id6ih7iw7 7 | 8 | # Group normalization 9 | g1mb2ic2iw40 10 | g2mb2ic8ih30iw40 11 | g5mb2ic10id9ih10iw10 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/gnorm/shapes_ci: -------------------------------------------------------------------------------- 1 | # Instance normalization 2 | g2mb2ic2iw4 3 | g7mb2ic7ih3iw4 4 | g5mb2ic5id9ih1iw10 5 | 6 | # Group normalization 7 | g1mb2ic2iw4 8 | g2mb2ic8ih3iw4 9 | g5mb2ic10id9ih1iw10 10 | g5mb2ic40id2ih1iw5 11 | g5mb2ic80id2ih1iw5 12 | g1mb1ic128ih2iw2 13 | g2mb2ic30ih1iw1 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/complex_fusion/harness_mlp_all: -------------------------------------------------------------------------------- 1 | --batch=complex_fusion/harness_mlp_ci 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/complex_fusion/harness_mlp_ci: 
-------------------------------------------------------------------------------- 1 | --reset --dt=bf16,f16 --case=complex_fusion/mlp/gated-mlp-f32.json 2 | 3 | # WA1: use smaller problem to pass correctness check for f32 on pvc. 4 | # WA2: use subtract binary to avoid precision issue for f32 on xe-lpg. 5 | --reset --in-shapes=0:1x128+1:128x256+4:128x256+13:256x128 --op-kind=12:Subtract --case=complex_fusion/mlp/gated-mlp-f32.json 6 | 7 | # f16-int4 case 8 | --reset --case=complex_fusion/mlp/gated-mlp-int4.json 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/pattern/harness_bf16_ci: -------------------------------------------------------------------------------- 1 | --reset --dt=bf16 --case=pattern/f32/conv_post_ops_fusion.json 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/pattern/harness_f16_ci: -------------------------------------------------------------------------------- 1 | --reset --dt=f16 --case=pattern/f32/conv_post_ops_fusion.json 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/pattern/harness_f32_ci: -------------------------------------------------------------------------------- 1 | --reset --in-shapes=0:2x64x112x112+1:32x64x2x2 --case=pattern/f32/conv_post_ops_fusion.json 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/pattern/harness_f8_all: -------------------------------------------------------------------------------- 1 | --reset --case=pattern/f8/f8_conv_add_add_fusion.json 2 | --reset --case=pattern/f8/f8_conv_fwd.json 3 | --reset --case=pattern/f8/f8_conv_post_ops_fusion.json 4 | --reset --case=pattern/f8/f8_conv_post_ops_int8_add_fusion.json 5 | --reset --case=pattern/f8/f8_conv_bias_relu_fusion.json 6 | --reset --case=pattern/f8/f8_matmul.json 7 | --reset 
--case=pattern/f8/f8_bf16_matmul_add_fusion.json 8 | --reset --case=pattern/f8/f8_bf16_matmul_sum_add_mul_relu.json 9 | --reset --case=pattern/f8/f8_matmul_sum_add_mul_relu.json 10 | --reset --case=pattern/f8/f8_f32_matmul_mul_add_fusion.json 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/pattern/harness_f8_ci: -------------------------------------------------------------------------------- 1 | --reset --case=pattern/f8/f8_conv_fwd.json 2 | --reset --case=pattern/f8/f8_matmul.json 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/pattern/harness_int8_ci: -------------------------------------------------------------------------------- 1 | --reset --mb=2 --case=pattern/int8/int8_conv_post_ops_fusion.json 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_all: -------------------------------------------------------------------------------- 1 | --batch=test_graph_f32 2 | --batch=test_graph_bf16 3 | --batch=test_graph_f16 4 | --batch=test_graph_int8 5 | --batch=test_graph_f8 6 | --batch=test_graph_fusions 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_bf16: -------------------------------------------------------------------------------- 1 | --batch=op/harness_bf16_all 2 | --batch=pattern/harness_bf16_all 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_ci: -------------------------------------------------------------------------------- 1 | --batch=op/harness_f32_ci 2 | --batch=op/harness_f16_ci 3 | --batch=op/harness_bf16_ci 4 | --batch=pattern/harness_f32_ci 5 | --batch=pattern/harness_f16_ci 6 | --batch=pattern/harness_bf16_ci 7 | --batch=pattern/harness_int8_ci 8 | --batch=pattern/harness_f8_ci 9 | 
--batch=complex_fusion/harness_mha_ci 10 | --batch=complex_fusion/harness_mlp_ci 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_f16: -------------------------------------------------------------------------------- 1 | --batch=op/harness_f16_all 2 | --batch=pattern/harness_f16_all 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_f32: -------------------------------------------------------------------------------- 1 | --batch=op/harness_f32_all 2 | --batch=pattern/harness_f32_all 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_f8: -------------------------------------------------------------------------------- 1 | --batch=pattern/harness_f8_all -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_fusions: -------------------------------------------------------------------------------- 1 | --batch=complex_fusion/harness_mha_all 2 | --batch=complex_fusion/harness_mlp_all 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_fusions_gpu: -------------------------------------------------------------------------------- 1 | --batch=complex_fusion/harness_mha_all 2 | --batch=complex_fusion/harness_mha_ci 3 | --batch=complex_fusion/harness_mlp_all 4 | --batch=complex_fusion/harness_mlp_ci 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_int8: -------------------------------------------------------------------------------- 1 | --batch=pattern/harness_int8_all 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_op_gpu: 
-------------------------------------------------------------------------------- 1 | --batch=op/harness_f32_ci 2 | --batch=op/harness_f16_ci 3 | --batch=op/harness_bf16_ci 4 | 5 | --batch=op/harness_f32_all 6 | --batch=op/harness_f16_all 7 | --batch=op/harness_bf16_all 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/graph/test_graph_pattern_gpu: -------------------------------------------------------------------------------- 1 | --batch=pattern/harness_f32_ci 2 | --batch=pattern/harness_f16_ci 3 | --batch=pattern/harness_bf16_ci 4 | --batch=pattern/harness_int8_ci 5 | --batch=pattern/harness_f8_ci 6 | 7 | --batch=pattern/harness_f32_all 8 | --batch=pattern/harness_f16_all 9 | --batch=pattern/harness_bf16_all 10 | --batch=pattern/harness_int8_all 11 | --batch=pattern/harness_f8_all 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_lb_bfloat16: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 2016 3 | # Output token size = 32 4 | # Batch Size = 12 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 48 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=bf16 11 | 12 | mb96768ic4096oc4096_n"gptj:gemm0*112" 13 | mb96768ic4096oc16384_n"gptj:gemm3*28" 14 | mb96768ic16384oc4096_n"gptj:gemm4*28" 15 | mb96768ic4096oc50400_n"gptj:gemm5*1" 16 | mb48ic4096oc4096_n"gptj:gemm6*3472" 17 | mb48ic4096oc16384_n"gptj:gemm9*868" 18 | mb48ic16384oc4096_n"gptj:gemm10*868" 19 | mb48ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_lb_f32: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 2016 3 | # Output token size = 32 4 | # Batch Size = 6 5 | # Num Beams = 4 6 | # M = num_beams * 
batch_size = 24 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f32 11 | 12 | mb48384ic4096oc4096_n"gptj:gemm0*112" 13 | mb48384ic4096oc16384_n"gptj:gemm3*28" 14 | mb48384ic16384oc4096_n"gptj:gemm4*28" 15 | mb48384ic4096oc50400_n"gptj:gemm5*1" 16 | mb24ic4096oc4096_n"gptj:gemm6*3472" 17 | mb24ic4096oc16384_n"gptj:gemm9*868" 18 | mb24ic16384oc4096_n"gptj:gemm10*868" 19 | mb24ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_lb_float16: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 2016 3 | # Output token size = 32 4 | # Batch Size = 12 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 48 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f16 11 | 12 | mb96768ic4096oc4096_n"gptj:gemm0*112" 13 | mb96768ic4096oc16384_n"gptj:gemm3*28" 14 | mb96768ic16384oc4096_n"gptj:gemm4*28" 15 | mb96768ic4096oc50400_n"gptj:gemm5*1" 16 | mb48ic4096oc4096_n"gptj:gemm6*3472" 17 | mb48ic4096oc16384_n"gptj:gemm9*868" 18 | mb48ic16384oc4096_n"gptj:gemm10*868" 19 | mb48ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_sb_bfloat16: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 2016 3 | # Output token size = 32 4 | # Batch Size = 1 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 4 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=bf16 11 | 12 | mb8064ic4096oc4096_n"gptj:gemm0*112" 13 | mb8064ic4096oc16384_n"gptj:gemm3*28" 14 | mb8064ic16384oc4096_n"gptj:gemm4*28" 15 | mb8064ic4096oc50400_n"gptj:gemm5*1" 16 | mb4ic4096oc4096_n"gptj:gemm6*3472" 17 | mb4ic4096oc16384_n"gptj:gemm9*868" 18 | mb4ic16384oc4096_n"gptj:gemm10*868" 19 | mb4ic4096oc50400_n"gptj:gemm11*31" 20 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_sb_f32: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 2016 3 | # Output token size = 32 4 | # Batch Size = 1 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 4 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f32 11 | 12 | mb8064ic4096oc4096_n"gptj:gemm0*112" 13 | mb8064ic4096oc16384_n"gptj:gemm3*28" 14 | mb8064ic16384oc4096_n"gptj:gemm4*28" 15 | mb8064ic4096oc50400_n"gptj:gemm5*1" 16 | mb4ic4096oc4096_n"gptj:gemm6*3472" 17 | mb4ic4096oc16384_n"gptj:gemm9*868" 18 | mb4ic16384oc4096_n"gptj:gemm10*868" 19 | mb4ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_2016-32_inf_sb_float16: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 2016 3 | # Output token size = 32 4 | # Batch Size = 1 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 4 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f16 11 | 12 | mb8064ic4096oc4096_n"gptj:gemm0*112" 13 | mb8064ic4096oc16384_n"gptj:gemm3*28" 14 | mb8064ic16384oc4096_n"gptj:gemm4*28" 15 | mb8064ic4096oc50400_n"gptj:gemm5*1" 16 | mb4ic4096oc4096_n"gptj:gemm6*3472" 17 | mb4ic4096oc16384_n"gptj:gemm9*868" 18 | mb4ic16384oc4096_n"gptj:gemm10*868" 19 | mb4ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_lb_bfloat16: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 32 3 | # Output token size = 32 4 | # Batch Size = 428 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 1712 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=bf16 11 | 12 | mb54784ic4096oc4096_n"gptj:gemm0*112" 13 | 
mb54784ic4096oc16384_n"gptj:gemm3*28" 14 | mb54784ic16384oc4096_n"gptj:gemm4*28" 15 | mb54784ic4096oc50400_n"gptj:gemm5*1" 16 | mb1712ic4096oc4096_n"gptj:gemm6*3472" 17 | mb1712ic4096oc16384_n"gptj:gemm9*868" 18 | mb1712ic16384oc4096_n"gptj:gemm10*868" 19 | mb1712ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_lb_f32: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 32 3 | # Output token size = 32 4 | # Batch Size = 214 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 856 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f32 11 | 12 | mb27392ic4096oc4096_n"gptj:gemm0*112" 13 | mb27392ic4096oc16384_n"gptj:gemm3*28" 14 | mb27392ic16384oc4096_n"gptj:gemm4*28" 15 | mb27392ic4096oc50400_n"gptj:gemm5*1" 16 | mb856ic4096oc4096_n"gptj:gemm6*3472" 17 | mb856ic4096oc16384_n"gptj:gemm9*868" 18 | mb856ic16384oc4096_n"gptj:gemm10*868" 19 | mb856ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_lb_float16: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 32 3 | # Output token size = 32 4 | # Batch Size = 428 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 1712 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f16 11 | 12 | mb54784ic4096oc4096_n"gptj:gemm0*112" 13 | mb54784ic4096oc16384_n"gptj:gemm3*28" 14 | mb54784ic16384oc4096_n"gptj:gemm4*28" 15 | mb54784ic4096oc50400_n"gptj:gemm5*1" 16 | mb1712ic4096oc4096_n"gptj:gemm6*3472" 17 | mb1712ic4096oc16384_n"gptj:gemm9*868" 18 | mb1712ic16384oc4096_n"gptj:gemm10*868" 19 | mb1712ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_sb_bfloat16: 
-------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 32 3 | # Output token size = 32 4 | # Batch Size = 1 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 4 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=bf16 11 | 12 | mb128ic4096oc4096_n"gptj:gemm0*112" 13 | mb128ic4096oc16384_n"gptj:gemm3*28" 14 | mb128ic16384oc4096_n"gptj:gemm4*28" 15 | mb128ic4096oc50400_n"gptj:gemm5*1" 16 | mb4ic4096oc4096_n"gptj:gemm6*3472" 17 | mb4ic4096oc16384_n"gptj:gemm9*868" 18 | mb4ic16384oc4096_n"gptj:gemm10*868" 19 | mb4ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_sb_f32: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 32 3 | # Output token size = 32 4 | # Batch Size = 1 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 4 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f32 11 | 12 | mb128ic4096oc4096_n"gptj:gemm0*112" 13 | mb128ic4096oc16384_n"gptj:gemm3*28" 14 | mb128ic16384oc4096_n"gptj:gemm4*28" 15 | mb128ic4096oc50400_n"gptj:gemm5*1" 16 | mb4ic4096oc4096_n"gptj:gemm6*3472" 17 | mb4ic4096oc16384_n"gptj:gemm9*868" 18 | mb4ic16384oc4096_n"gptj:gemm10*868" 19 | mb4ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_gpt-j_32-32_inf_sb_float16: -------------------------------------------------------------------------------- 1 | # GPT-J 2 | # Input token size = 32 3 | # Output token size = 32 4 | # Batch Size = 1 5 | # Num Beams = 4 6 | # M = num_beams * batch_size = 4 7 | 8 | --reset 9 | --dir=FWD_I 10 | --dt=f16 11 | 12 | mb128ic4096oc4096_n"gptj:gemm0*112" 13 | mb128ic4096oc16384_n"gptj:gemm3*28" 14 | mb128ic16384oc4096_n"gptj:gemm4*28" 15 | mb128ic4096oc50400_n"gptj:gemm5*1" 16 | mb4ic4096oc4096_n"gptj:gemm6*3472" 17 | 
mb4ic4096oc16384_n"gptj:gemm9*868" 18 | mb4ic16384oc4096_n"gptj:gemm10*868" 19 | mb4ic4096oc50400_n"gptj:gemm11*31" 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_regression: -------------------------------------------------------------------------------- 1 | # repeated sum with varying scale 2 | --reset --attr-post-ops=sum+relu+sum:2 ic64oc64_n"multisum" 3 | 4 | # large oc to trigger use_buffer_b with tail 5 | --reset --dir=BWD_W mb32ic16oc2049_n"large_oc_for_use_buffer_b_with_tail" 6 | 7 | # f16->s8 is not supported by reference, gemm based ip handles this case, 8 | # but it requires src and weights to have compatible tags. 9 | # This test case makes sure ip chooses weights tag correctly for plain src tag 10 | # when shapes contain ones. 11 | --reset --dt=f16:f16:s8 --stag=abcd mb128ic768ih1oc768_n"f16_s8_plain_src_tag" 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_sanitizers: -------------------------------------------------------------------------------- 1 | # shapes to help sanitizers catch bugs 2 | 3 | --reset 4 | --dt=f32,bf16,s8:s8:s8 5 | --dir=FWD_B 6 | 7 | # mb * ic will overflow for int data type in below shape 8 | mb2613184_ic1536oc16_n"huge_mb_ic" 9 | 10 | --dt=f32,bf16 11 | --dir=BWD_D,BWD_WB 12 | mb2613184_ic1536oc16_n"huge_mb_ic" 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_saturation: -------------------------------------------------------------------------------- 1 | --reset 2 | --dir=FWD_D 3 | --dt=u8:s8:s32,u8:s8:s8,u8:s8:u8 4 | --attr-scales=wei:common:4294967295 5 | ic16oc16ih1 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/harness_ip_tag_gpu: -------------------------------------------------------------------------------- 1 | --reset 2 | 
--attr-scratchpad=user 3 | --dt=f32 4 | --dir=FWD_B,FWD_I 5 | --dtag=abx 6 | --mb=16,17 7 | 8 | --wtag=ABx16a16b 9 | ic2048ih2id2iw1oc1000 10 | 11 | --stag=aBx16b 12 | --wtag=any 13 | ic2048id2ih2iw2oc1000 14 | 15 | --stag=ABx16a16b,ABcd8a8b,aBx16b,aBx8b 16 | --attr-post-ops=sum:0.5+relu:0.5 17 | ic2048ih2iw2oc1000 18 | 19 | --stag=aBx8b,aBx4b 20 | ic2048iw2oc1000 21 | 22 | --stag=aBx16b 23 | --attr-post-ops=linear:0.5:1.5:2 24 | ic2048id2ih2iw2oc1000 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/option_set_fwks_llm_gpu: -------------------------------------------------------------------------------- 1 | --reset --allow-enum-tags-only=0 --dir=FWD_I --dt=f16:f16:f32 --bia-dt=undef --stag=abcd --wtag=any --dtag=ab mb1024ic896ih1iw1oc151936_n"a12f80214bc2625a81ecaa173e76b977*2" 2 | --reset --allow-enum-tags-only=0 --dir=FWD_I --dt=f16:f16:f32 --bia-dt=undef --stag=abcd --wtag=any --dtag=ab mb32ic896ih1iw1oc151936_n"a72ef24774e776b6f286de263d897013*2" 3 | --reset --allow-enum-tags-only=0 --dir=FWD_I --dt=f16:f16:f32 --bia-dt=undef --stag=abcd --wtag=any --dtag=ab mb8ic896ih1iw1oc151936_n"fed31de8b7be65676854064135d375c7*508" 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/perf_ip_cpu: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Forward 4 | --dir=FWD_B 5 | --dt=f32,u8:s8:s32,s8:s8:s8 6 | --mb=1,64,256,1024,2048 7 | --batch=set_topologies 8 | 9 | # Backward 10 | --dt=f32 11 | --dir=BWD_D,BWD_WB 12 | --mb=64,256,1024,2048 13 | --batch=set_topologies 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/perf_ip_inference_sb: -------------------------------------------------------------------------------- 1 | # inference_sb -- inference with small batch size 2 | # f32 3 | --reset 4 | --dir=FWD_B 5 | --mb=1 6 | --batch=shapes_gnmt 7 
| --batch=shapes_wd 8 | --batch=shapes_resnet_50 9 | --batch=shapes_resnet_50_sparse 10 | --batch=shapes_googlenet_v1 11 | --batch=shapes_googlenet_v3 12 | --batch=shapes_vgg16 13 | --batch=shapes_ncf 14 | --batch=shapes_alexnet 15 | --batch=shapes_bert 16 | --batch=shapes_bert_large 17 | --mb=0 --batch=shapes_rnn_t 18 | --mb=128 --batch=shapes_dlrm 19 | --mb=0 --batch=shapes_transformer_lt 20 | --mb=0 --batch=shapes_dien_sb 21 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/perf_ip_knx: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Forward 4 | --dir=FWD_B 5 | --dt=f32 6 | --mb=1,64,256,1024,2048 7 | --batch=set_topologies 8 | 9 | # Backward 10 | --dt=f32 11 | --dir=BWD_D,BWD_WB 12 | --mb=64,256,1024,2048 13 | --batch=set_topologies 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/perf_ip_xe: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Forward 4 | 5 | # f32 6 | --dir=FWD_B 7 | --dt=f32 8 | 9 | --mb=1,32,64,128 10 | --batch=set_gpu 11 | 12 | --mb=0 13 | --batch=shapes_1d 14 | 15 | # f16 16 | --dir=FWD_B 17 | --dt=f16 18 | 19 | --mb=1,32,64,128 20 | --batch=set_gpu 21 | 22 | --mb=0 23 | --batch=shapes_1d 24 | 25 | # Backward 26 | 27 | # f32 28 | --dir=BWD_D,BWD_WB 29 | --dt=f32 30 | 31 | --mb=32,64,128 32 | --batch=set_gpu 33 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/perf_ip_xe_hp: -------------------------------------------------------------------------------- 1 | --batch=perf_ip_xe_lp 2 | 3 | --reset 4 | 5 | # Forward, bf16 6 | 7 | --dir=FWD_B 8 | --dt=bf16 9 | 10 | --mb=1,32,64,128 11 | --batch=set_gpu 12 | 13 | --mb=0 14 | --batch=shapes_1d 15 | 16 | # Backward, bf16 17 | 18 | --dir=BWD_D,BWD_WB 19 | --dt=bf16 20 | 21 | --mb=32,64,128 22 | --batch=set_gpu 23 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/perf_ip_xe_lp: -------------------------------------------------------------------------------- 1 | --batch=perf_ip_xe 2 | 3 | --reset 4 | 5 | # Forward, int8 6 | --dir=FWD_B 7 | --dt=u8:s8:s32 8 | 9 | --mb=1,32,64,128 10 | --batch=set_gpu 11 | 12 | --mb=0 13 | --batch=shapes_1d 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/set_all: -------------------------------------------------------------------------------- 1 | --batch=shapes_resnet_50 2 | --batch=shapes_googlenet_v1 3 | --batch=shapes_googlenet_v3 4 | --batch=shapes_resnet_50_sparse 5 | --batch=shapes_vgg16 6 | --batch=shapes_3d 7 | --batch=shapes_wd 8 | --batch=shapes_maskrcnn 9 | --batch=shapes_rnn_t 10 | --batch=shapes_alexnet 11 | --batch=shapes_bert 12 | --batch=shapes_bert_large 13 | --batch=shapes_dlrm 14 | --batch=shapes_gnmt 15 | --batch=shapes_ncf 16 | --batch=shapes_transformer_lt 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/set_gpu: -------------------------------------------------------------------------------- 1 | --batch=shapes_resnet_50 2 | --batch=shapes_googlenet_v1 3 | --batch=shapes_googlenet_v3 4 | --batch=shapes_resnet_50_sparse 5 | --batch=shapes_vgg16 6 | --batch=shapes_3d 7 | --batch=shapes_wd 8 | --batch=shapes_maskrcnn 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/set_topologies: -------------------------------------------------------------------------------- 1 | --batch=shapes_alexnet 2 | --batch=shapes_bert 3 | --batch=shapes_bert_large 4 | --batch=shapes_dlrm 5 | --batch=shapes_gnmt 6 | --batch=shapes_googlenet_v1 7 | --batch=shapes_googlenet_v3 8 | --batch=shapes_maskrcnn 9 | --batch=shapes_ncf 10 | --batch=shapes_resnet_50 11 | --batch=shapes_resnet_50_sparse 12 | 
--batch=shapes_transformer_lt 13 | --batch=shapes_vgg16 14 | --batch=shapes_wd 15 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_0d_gpu: -------------------------------------------------------------------------------- 1 | mb64ic1oc2 2 | mb18ic7oc1 3 | mb16ic1oc5 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_1d: -------------------------------------------------------------------------------- 1 | # 1D 2 | mb128ic128iw4oc1024n"googlenet_v1_1d:ip1" 3 | mb64ic512iw7oc4096n"VGG16_1d:ip1" 4 | mb32ic64iw3oc1000n"1d:ip1" 5 | mb32ic512iw5oc1000n"1d:ip2" 6 | mb256ic128iw5oc128n"1d:ip3" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_3d: -------------------------------------------------------------------------------- 1 | ic64id2ih3iw3oc1000n"3d:ip1" 2 | ic512id5ih5iw5oc1000n"3d:ip2" 3 | ic128id5ih5iw5oc128n"3d:ip3" 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_alexnet: -------------------------------------------------------------------------------- 1 | ic9216oc4096n"Alexnet:ip1" 2 | ic4096oc4096n"Alexnet:ip2" 3 | ic4096oc1000n"Alexnet:ip3" 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_bert: -------------------------------------------------------------------------------- 1 | ic768oc2304n"BERT:1" 2 | ic768oc768n"BERT:2" 3 | ic768oc3072n"BERT:3" 4 | ic3072oc768n"BERT:4" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_ci: -------------------------------------------------------------------------------- 1 | --batch=shapes_dlrm 2 | --batch=shapes_ncf 3 | --batch=shapes_resnet_50 4 | --batch=shapes_rnn_t 5 | --batch=shapes_bert 6 | --batch=shapes_bert_large 7 | 
--batch=shapes_gnmt 8 | --batch=shapes_wd 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_dlrm: -------------------------------------------------------------------------------- 1 | ic13oc512n"DLRM:0" 2 | ic512oc256n"DLRM:1*2" 3 | ic256oc128n"DLRM:2" 4 | ic479oc1024n"DLRM:3" 5 | ic1024oc1024n"DLRM:4" 6 | ic1024oc512n"DLRM:5" 7 | # ic512oc256n"DLRM:6" 8 | ic256oc1n"DLRM:7" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_gnmt: -------------------------------------------------------------------------------- 1 | ic512oc512n"GNMT:0" 2 | ic1024oc1024n"GNMT:1" 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_googlenet_v1: -------------------------------------------------------------------------------- 1 | ic128ih4iw4oc1024n"googlenet_v1:ip1" 2 | ic1024ih1iw1oc1000n"googlenet_v1:ip2" 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_googlenet_v3: -------------------------------------------------------------------------------- 1 | ic2048ih1oc1000n"inceptionv3:ip1" 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_maskrcnn: -------------------------------------------------------------------------------- 1 | mb1000ic256ih7iw7oc1024n"masknet:ip1" 2 | mb1000ic1024oc1024n"masknet:ip2" 3 | mb1000ic1024oc324n"masknet:ip3" 4 | mb1000ic1024oc81n"masknet:ip4" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_ncf: -------------------------------------------------------------------------------- 1 | ic256oc256n"NCF:0" 2 | ic256oc128n"NCF:1" 3 | ic128oc64n"NCF:2" 4 | ic128oc1n"NCF:3" 5 | -------------------------------------------------------------------------------- 
/tests/benchdnn/inputs/ip/shapes_regression: -------------------------------------------------------------------------------- 1 | mb1_ic16oc26_n"small_oc_block" 2 | mb128ic2200oc2200 3 | mb128ic1500oc1500 4 | mb1120ic1024oc2046 5 | mb1000ic1000oc1111 6 | mb1ic16001oc101 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_resnet_50: -------------------------------------------------------------------------------- 1 | ic2048oc1000n"resnet:ip1" 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_resnet_50_sparse: -------------------------------------------------------------------------------- 1 | ic2048oc1000n"resnet_sparse:ip1" 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_rnn_t: -------------------------------------------------------------------------------- 1 | # mb = 16 * num_cores for throughput inference / training 2 | # and mb = 16 for real time inference 3 | mb16ic240oc4096n"RNN-T:Encoder_cell1_Input*2" 4 | mb16ic1024oc4096n"RNN-T:Encoder_cell1_Hidden*11" 5 | mb16ic2048oc4096n"RNN-T:Encoder_cell3_Input*1" 6 | mb16ic320oc1280n"RNN-T:Prediction_Input*12" 7 | mb16ic1344oc512n"RNN-T:JointNet_Linear1*3" 8 | mb16ic512oc29n"RNN-T:JointNet_Linear2*3" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_vgg16: -------------------------------------------------------------------------------- 1 | ic512ih7iw7oc4096n"VGG16:ip1" 2 | ic4096oc4096n"VGG16:ip2" 3 | ic4096oc81n"VGG16:ip3" 4 | ic4096oc324n"VGG16:ip4" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/shapes_wd: -------------------------------------------------------------------------------- 1 | # Used in smoke validation, don't change the name 2 | ic845oc1024n"WnD:0" 3 | ic1024oc512n"WnD:1" 4 
| ic512oc256n"WnD:2" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/test_ip_all: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset 3 | --dir=FWD_B,BWD_D,BWD_WB 4 | --batch=set_all --batch=shapes_0d 5 | 6 | --dir=FWD_B 7 | --attr-post-ops=sum:0.5+relu:0.5+add:f32:per_oc,prelu:per_oc,sum:2:0:s32 8 | --mb=2 --batch=set_all 9 | --mb=0 --batch=shapes_0d 10 | 11 | --batch=harness_ip_regression 12 | --batch=harness_ip_tag 13 | 14 | # int8 15 | --batch=test_ip_int8 16 | 17 | # fp8 18 | --batch=test_ip_fp8 19 | 20 | # bf16 21 | --batch=test_ip_bfloat16 22 | 23 | # bf32 24 | --batch=test_ip_bf32_bfloat16 25 | 26 | # f16 27 | --batch=test_ip_float16 28 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/ip/test_ip_bfloat16_ymm: -------------------------------------------------------------------------------- 1 | # global benchdnn knob, will not be reset again 2 | --cpu-isa-hints=prefer_ymm 3 | --batch=test_ip_bfloat16 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lnorm/shapes_ci: -------------------------------------------------------------------------------- 1 | 15x3_n"lnorm_ci_0d:0" # Used in smoke validation, don't change the name 2 | 30x300_n"lnorm_ci_0d:1" # Used in smoke validation, don't change the name 3 | 256x768_n"lnorm_ci_0d:2" # Used in smoke validation, don't change the name 4 | 257x768_n"lnorm_ci_0d:3" # Used in smoke validation, don't change the name 5 | 128x1x1024_n"lnorm_ci_1d:0" 6 | 6x2x128x1024_n"lnorm_ci_2d:0" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lnorm/test_lnorm_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # f32 4 | --inplace=true,false 5 | --dt=f32 6 | 7 | --dir=FWD_D,FWD_I 8 | 
--flags=,CH,G,GCH,M,GCHM 9 | --batch=option_set_all 10 | 11 | --dir=BWD_D 12 | --flags=,G,GM 13 | --batch=option_set_all 14 | 15 | --dir=BWD_DW 16 | --flags=CH,GCH,M,GCHM 17 | --batch=option_set_all 18 | 19 | # bf16 20 | --batch=test_lnorm_bfloat16 21 | 22 | # int8 23 | --batch=test_lnorm_int8 24 | 25 | # F16 26 | --batch=test_lnorm_float16 27 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lnorm/test_lnorm_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | --dt=bf16,bf16:f32,f32:bf16 5 | 6 | --dir=FWD_D,FWD_I 7 | --flags=,CH,G,GCH,M,GCHM 8 | --batch=option_set_all 9 | 10 | --dir=BWD_D 11 | --flags=,G,GM 12 | --batch=option_set_all 13 | 14 | --dir=BWD_DW 15 | --flags=CH,GCH,M,GCHM 16 | --batch=option_set_all 17 | 18 | # Different data type combinations 19 | 20 | --dt=f32:bf16,bf16 21 | --dir=FWD_D,BWD_DW 22 | --ss_dt=bf16 23 | --flags=C,H,CH 24 | --batch=option_set_all 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lnorm/test_lnorm_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | --dt=f16,f16:f32,f32:f16 5 | 6 | --dir=FWD_D,FWD_I 7 | --flags=,CH,G,GCH,M,GCHM 8 | --batch=option_set_all 9 | 10 | --dir=BWD_D 11 | --flags=,G,GM 12 | --batch=option_set_all 13 | 14 | --dir=BWD_DW 15 | --flags=CH,GCH,M,GCHM 16 | --batch=option_set_all 17 | 18 | # Different data type combinations 19 | 20 | --dt=f32:f16,f16 21 | --dir=FWD_D,BWD_DW 22 | --ss_dt=f16 23 | --flags=C,H,CH 24 | --batch=option_set_all 25 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lnorm/test_lnorm_int8: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=false 4 | --dt=f32:s8,f32:u8,bf16:s8,bf16:u8, \ 5 | 
s8:f32,u8:f32,s8:bf16,u8:bf16, \ 6 | s8:s8,u8:s8,s8:u8,u8:u8 7 | 8 | --dir=FWD_I 9 | --attr-scales=,src:common:128,dst:common:0.125,src:common:64+dst:common:0.5 10 | --flags=,CH,G,GCH,M,GCHM 11 | --batch=option_set_all 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lnorm/test_lnorm_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*lnorm_ci_0d.* # Use 0d problems only from shapes_ci 4 | --inplace=false 5 | --tag=axb 6 | --stat_tag=any 7 | 8 | --dir=FWD_D,BWD_DW 9 | --dt=f32,bf16,f16 10 | --flags=,G,CH,M,GCHM 11 | --batch=shapes_ci 12 | 13 | --dir=BWD_D 14 | --flags=,G,GM 15 | --batch=shapes_ci 16 | 17 | # Different data type combinations 18 | --dt=f32:s8,u8:f32 19 | --dir=FWD_I 20 | --attr-scales=,src:common:64+dst:common:0.5 21 | --flags=,CH 22 | --batch=shapes_ci 23 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/set_all: -------------------------------------------------------------------------------- 1 | --batch=shapes_2d 2 | --batch=shapes_3d 3 | --batch=shapes_topologies 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/shapes_0d: -------------------------------------------------------------------------------- 1 | # random problems 2 | mb3ic128_n"lrn_ci_0d:0" 3 | mb10ic63_n"lrn_ci_0d:1" 4 | mb17ic15_n"lrn_ci_0d:2" 5 | ic1_n"lrn_ci_0d:3" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/shapes_2d: -------------------------------------------------------------------------------- 1 | # random problems 2 | ic15_ih16_n"lrn_ci_2d:channel_tail" 3 | ic16_ih5_ls7_n"lrn_ci_2d:non_default_local_size" 4 | ic16_ih5_beta1.0_n"lrn_ci_2d:non_default_beta" 5 | ic16_ih5_k14.2_n"lrn_ci_2d:non_default_k" 6 | ic16_ih10iw7_n"lrn_ci_2d:non_square_shape" 7 | 
ic12_ih15iw1_n"lrn_ci_2d:unit_width" 8 | ic31_ih1iw16_n"lrn_ci_2d:unit_heigth" 9 | ic64_ih1iw1_n"lrn_ci_2d:unit_heigth_and_width" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/shapes_3d: -------------------------------------------------------------------------------- 1 | ic15_id2n"channels_only" 2 | ic15_id2ih16iw16n"channel_tail" 3 | ic16_id3ih5iw5_ls7n"non_default_local_size" 4 | ic16_id2ih5iw5_beta1.0n"non_default_beta" 5 | ic16_id4ih5iw5_k14.2n"non_default_k" 6 | ic16_id2ih10iw7n"non_square_shape" 7 | ic16_id3ih10iw7n"non_square_shape" 8 | ic12_id2ih15iw1n"unit_width" 9 | ic31_id1ih1iw16_ls4n"unit_heigth" 10 | ic31_id2ih1iw16_ls4n"unit_heigth" 11 | ic31_id3ih1iw16n"unit_heigth" 12 | ic64_id1ih1iw1n"unit_heigth_and_width" 13 | ic64_id2ih1iw1n"unit_heigth_and_width" 14 | ic64_id3ih1iw1n"unit_heigth_and_width" 15 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/shapes_ci: -------------------------------------------------------------------------------- 1 | --batch=shapes_0d 2 | --batch=shapes_2d 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/shapes_topologies: -------------------------------------------------------------------------------- 1 | # model problems 2 | mb256ic96_ih55n"alexnet:norm1" 3 | mb256ic256_ih27n"alexnet:norm2" 4 | mb96ic64_ih57n"googlenet:pool1/norm1" 5 | mb96ic192_ih57n"googlenet:conv2/norm2" 6 | mb50ic96ih112_alpha0.0005_k2n"fastrcnn:norm1" 7 | mb50ic96ih112_ls3_alpha0.00005n"fastrcnn_zf:norm1" 8 | 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/test_lrn_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --mb=2 4 | --dt=f32 5 | --alg=ACROSS,WITHIN 6 | --dir=FWD_D,FWD_I,BWD_D 7 | 8 | --tag=abx 9 | --batch=shapes_0d 10 | 11 | 
--tag=abx,axb,aBx8b,aBx16b 12 | --batch=set_all 13 | 14 | # bf16 15 | --batch=test_lrn_bfloat16 16 | 17 | # f16 18 | --batch=test_lrn_float16 -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/test_lrn_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --mb=2 4 | --dt=bf16 5 | --alg=ACROSS,WITHIN 6 | --dir=FWD_D,FWD_I,BWD_D 7 | 8 | --tag=abx 9 | --batch=shapes_0d 10 | 11 | --tag=abx,axb,aBx8b,aBx16b 12 | --batch=set_all 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/test_lrn_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --mb=2 4 | --dt=f32,bf16,f16 5 | --alg=ACROSS,WITHIN 6 | --dir=FWD_D,FWD_I,BWD_D 7 | --tag=abx,axb 8 | --batch=shapes_ci 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/test_lrn_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --mb=2 4 | --dt=f16 5 | --alg=ACROSS,WITHIN 6 | --dir=FWD_D,FWD_I,BWD_D 7 | 8 | --tag=abx 9 | --batch=shapes_0d 10 | 11 | --tag=abx,axb 12 | --batch=set_all 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/test_lrn_gpu: -------------------------------------------------------------------------------- 1 | # f32, bf16 2 | --reset 3 | 4 | --dt=f32,bf16 5 | --alg=ACROSS,WITHIN 6 | --dir=FWD_D,FWD_I,BWD_D 7 | --tag=abx 8 | --batch=set_all 9 | 10 | # f16 11 | --reset 12 | 13 | --dt=f16 14 | --alg=ACROSS,WITHIN 15 | --dir=FWD_I 16 | --tag=abx 17 | --batch=set_all 18 | 19 | # Test CI in Nightly 20 | --reset 21 | --batch=test_lrn_ci 22 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/lrn/test_lrn_smoke: 
-------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*lrn_ci_2d.* # Use 2d problems only from shapes_ci 4 | --mb=2 5 | --dt=f32,bf16,f16 6 | --alg=ACROSS,WITHIN 7 | --dir=FWD_D,FWD_I,BWD_D 8 | --tag=axb 9 | --batch=shapes_ci 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/harness_matmul_dropout: -------------------------------------------------------------------------------- 1 | --reset 2 | --dt=f32,bf16 3 | --attr-fpmath=,bf16 4 | --check-ref-impl=false 5 | --attr-dropout=0.5:12345678 6 | 7 | --stag=ab --dtag=ab 8 | --batch=shapes_2d 9 | 10 | --stag=abc --dtag=abc 11 | --batch=shapes_3d 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/harness_matmul_regression_float16: -------------------------------------------------------------------------------- 1 | 2 | # test shapes with b_buffer 3 | --reset 4 | --dt=f16:f16:f16 --stag=ab --wtag=ba --dtag=ab --bia-dt=f16 327x256:256x256 5 | 6 | # Test that cases where M == 1 are handled correctly. 
7 | --reset 8 | --stag=ba,ab --wtag=ab --dtag=ab --dt=f16 1x2:2x256 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/harness_matmul_runtime_f32: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset 3 | --skip-impl=ref 4 | 5 | --dt=f32 6 | --stag=ab,ba --wtag=ab,ba --dtag=ab 7 | --bia-dt=undef,f32 --bia_mask=2 8 | 9 | --runtime_dims_masks=0 10 | --attr-scales=src:common:0.25+wei:common:0.5+dst:common:4 11 | --attr-post-ops=,sum,relu 12 | --batch=shapes_2d 13 | 14 | --runtime_dims_masks=3:3 15 | --attr-scales=src:common:0.25+wei:common:0.5+dst:common:4 16 | --attr-post-ops=,sum+add:s8,mul:f32:per_oc,mul:f32:per_tensor 17 | --batch=shapes_2d 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/perf_matmul_inference_batched: -------------------------------------------------------------------------------- 1 | --batch=shapes_bert 2 | --batch=shapes_bert_large 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_2d: -------------------------------------------------------------------------------- 1 | # random shapes for correctness testing 2 | 1x1:1x1 3 | 10x30:30x1 4 | 1x30:30x20 5 | 10x1:1x20 6 | 1x300:300x1 7 | 1x1:1x200 8 | 100x1:1x1 9 | 10x30:30x20 10 | 2x30:30x47 11 | 10x30:30x16 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_2d_ci: -------------------------------------------------------------------------------- 1 | # 2d shapes for correctness testing in CI 2 | --batch=shapes_converted_ip_inf_lb_dlrm 3 | --batch=shapes_converted_ip_inf_lb_ncf 4 | --batch=shapes_converted_ip_inf_lb_rnn_t 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_3d: 
-------------------------------------------------------------------------------- 1 | # random batched shapes for correctness testing 2 | 2x10x30:2x30x1 3 | 3x30x1:3x1x20 4 | 1x30x30:1x30x20 5 | 3x10x30:3x30x16 6 | 7 | # batch broadcast shapes 8 | 7x32x16:1x16x8 9 | 1x128x8:2x8x16 10 | 2x16x73:1x73x8 11 | 1x26x17:5x17x65 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_4bit: -------------------------------------------------------------------------------- 1 | 24x32:32x64 2 | 25x32:32x16 3 | 96x96:96x64 4 | 14x96:96x32 5 | 1x30:30x20 6 | 10x30:30x20 7 | 2048x1024:1024x512_n"DLRM:5*1" 8 | 2048x256:256x128_n"NCF:1*1" 9 | 2048x128:128x64_n"NCF:2*1" 10 | 896x240:240x4096_n"RNN-T:Encoder_cell1_Input*2" 11 | 896x1024:1024x4096_n"RNN-T:Encoder_cell1_Hidden*11" 12 | 896x320:320x1280_n"RNN-T:Prediction_Input*12" 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_4d: -------------------------------------------------------------------------------- 1 | 18x16x54x64:18x16x64x54 2 | 11x16x45x45:11x16x45x64 3 | 21x16x41x41:21x16x41x64 4 | 16x16x49x49:16x16x49x64 5 | 14x16x54x54:14x16x54x64 6 | 5x16x38x38:5x16x38x64 7 | 24x16x32x32:24x16x32x64 8 | 13x16x45x64:13x16x64x45 9 | 17x16x41x64:17x16x64x41 10 | 21x16x49x64:21x16x64x49 11 | 12 | # Broadcast shapes 13 | 2x16x384x384:2x1x384x64 14 | 1x1x35x64:13x16x64x35 15 | 1x16x38x64:5x1x64x38 16 | 14x16x54x64:1x1x64x54n"B_full_bcast" 17 | 14x6x1x253:1x1x253x1n"dot_prod_w_B_full_bcast" 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_bert: -------------------------------------------------------------------------------- 1 | # multihead self-attention layer 2 | # mb = 1, num_heads = 12, hidden_size = 768, t_x = t_y = 128 3 | 12x128x64:12x64x128_n"encoder:QK_matmul:12" 4 | 12x128x128:12x128x64_n"encoder:WV_matmul:12" 5 | 6 | # 
mb = 128, num_heads = 12, hidden_size = 768, t_x = t_y = 128, 7 | 1536x128x64:1536x64x128_n"encoder:QK_matmul:12" 8 | 1536x128x128:1536x128x64_n"encoder:WV_matmul:12" 9 | 10 | # mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 128, 11 | #2048x128x64:2048x64x128_n"encoder:QK_matmul:24" 12 | #2048x128x128:2048x128x64_n"encoder:WV_matmul:24" 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_bert_large: -------------------------------------------------------------------------------- 1 | # multihead self-attention layer 2 | # mb = 1, num_heads = 12, hidden_size = 768, t_x = t_y = 384 3 | 12x384x64:12x64x384_n"encoder:QK_matmul:12" 4 | 12x384x384:12x384x64_n"encoder:WV_matmul:12" 5 | 6 | # mb = 128, num_heads = 12, hidden_size = 768, t_x = t_y = 384 7 | 1536x384x64:1536x64x384_n"encoder:QK_matmul:12" 8 | 1536x384x384:1536x384x64_n"encoder:WV_matmul:12" 9 | 10 | # mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 384 11 | #2048x384x64:2048x64x384_n"encoder:QK_matmul:24" 12 | #2048x384x384:2048x384x64_n"encoder:WV_matmul:24" 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_alexnet: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=1024 3 | 1024x9216:9216x4096n"Alexnet:ip1*1" 4 | 1024x4096:4096x4096n"Alexnet:ip2*1" 5 | 1024x4096:4096x1000n"Alexnet:ip3*1" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_dlrm: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=2048 3 | 2048x13:13x512n"DLRM:0*1" 4 | 2048x512:512x256n"DLRM:1*2" 5 | 2048x256:256x128n"DLRM:2*1" 6 | 
2048x479:479x1024n"DLRM:3*1" 7 | 2048x1024:1024x1024n"DLRM:4*1" 8 | 2048x1024:1024x512n"DLRM:5*1" 9 | 2048x256:256x1n"DLRM:7*1" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_gmnt: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=64 3 | 64x512:512x512n"GNMT:0*1" 4 | 64x1024:1024x1024n"GNMT:1*1" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_googlenet: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=128 3 | 128x2048:2048x1024n"googlenet_v1:ip1*1" 4 | 128x1024:1024x1000n"googlenet_v1:ip2*1" 5 | 6 | # ip mb=224 7 | 224x2048:2048x1000n"inceptionv3:ip1*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_maskrcnn: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=1000 3 | 1000x12544:12544x1024n"masknet:ip1*1" 4 | 1000x1024:1024x1024n"masknet:ip2*1" 5 | 1000x1024:1024x324n"masknet:ip3*1" 6 | 1000x1024:1024x81n"masknet:ip4*1" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_ncf: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=2048 3 | # Used in smoke validation, don't change the name 4 | 2048x256:256x256n"NCF:0*1" 5 | 2048x256:256x128n"NCF:1*1" 6 | 2048x128:128x64n"NCF:2*1" 7 | 2048x128:128x1n"NCF:3*1" 8 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_resnet: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=112 3 | 112x2048:2048x1000n"resnet:ip1*1" 4 | 5 | # ip mb=64 6 | 64x2048:2048x1000n"resnet_sparse:ip1*1" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_rnn_t: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=896 (16 * num_cores) 3 | 4 | 896x240:240x4096n"RNN-T:Encoder_cell1_Input*2" 5 | 896x1024:1024x4096n"RNN-T:Encoder_cell1_Hidden*11" 6 | 896x2048:2048x4096n"RNN-T:Encoder_cell3_Input*1" 7 | 896x320:320x1280n"RNN-T:Prediction_Input*12" 8 | 896x1344:1344x512n"RNN-T:JointNet_Linear1*3" 9 | 896x512:512x29n"RNN-T:JointNet_Linear2*3" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_vgg16: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=64 3 | 64x25088:25088x4096n"VGG16:ip1*1" 4 | 64x4096:4096x4096n"VGG16:ip2*1" 5 | 64x4096:4096x81n"VGG16:ip3*1" 6 | 64x4096:4096x324n"VGG16:ip4*1" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_inf_lb_wd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product forward 2 | # ip mb=512 3 | 512x845:845x1024n"WnD-512:0*1" 4 | 512x1024:1024x512n"WnD-512:1*1" 5 | 512x512:512x256n"WnD-512:2*1" 6 | 7 | # ip mb=1024 8 | 
#1024x845:845x1024n"WnD-1024:0*1" 9 | #1024x1024:1024x512n"WnD-1024:1*1" 10 | #1024x512:512x256n"WnD-1024:2*1" 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_alexnet_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data mb=1024 3 | 4 | 1024x4096:4096x9216n"Alexnet_train:BWD_D,ip1*1" 5 | 1024x4096:4096x4096n"Alexnet_train:BWD_D,ip2*1" 6 | 1024x1000:1000x4096n"Alexnet_train:BWD_D,ip3*1" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_alexnet_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights mb=1024 3 | 4 | 9216x1024:1024x4096n"Alexnet_train:BWD_W,ip1*1" 5 | 4096x1024:1024x4096n"Alexnet_train:BWD_W,ip2*1" 6 | 4096x1024:1024x1000n"Alexnet_train:BWD_W,ip3*1" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_alexnet_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward mb=1024 3 | 4 | 1024x9216:9216x4096n"Alexnet_train:FWD,ip1*1" 5 | 1024x4096:4096x4096n"Alexnet_train:FWD,ip2*1" 6 | 1024x4096:4096x1000n"Alexnet_train:FWD,ip3*1" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_dlrm_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data mb=2048 3 | 4 | 
2048x512:512x13n"DLRM_train:BWD_D,0*1" 5 | 2048x256:256x512n"DLRM_train:BWD_D,1*2" 6 | 2048x128:128x256n"DLRM_train:BWD_D,2*1" 7 | 2048x1024:1024x479n"DLRM_train:BWD_D,3*1" 8 | 2048x1024:1024x1024n"DLRM_train:BWD_D,4*1" 9 | 2048x512:512x1024n"DLRM_train:BWD_D,5*1" 10 | 2048x1:1x256n"DLRM_train:BWD_D,7*1" 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_dlrm_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights mb=2048 3 | 4 | 13x2048:2048x512n"DLRM_train:BWD_W,0*1" 5 | 512x2048:2048x256n"DLRM_train:BWD_W,1*2" 6 | 256x2048:2048x128n"DLRM_train:BWD_W,2*1" 7 | 479x2048:2048x1024n"DLRM_train:BWD_W,3*1" 8 | 1024x2048:2048x1024n"DLRM_train:BWD_W,4*1" 9 | 1024x2048:2048x512n"DLRM_train:BWD_W,5*1" 10 | 256x2048:2048x1n"DLRM_train:BWD_W,7*1" 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_dlrm_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward mb=2048 3 | 4 | 2048x13:13x512n"DLRM_train:FWD,0*1" 5 | 2048x512:512x256n"DLRM_train:FWD,1*2" 6 | 2048x256:256x128n"DLRM_train:FWD,2*1" 7 | 2048x479:479x1024n"DLRM_train:FWD,3*1" 8 | 2048x1024:1024x1024n"DLRM_train:FWD,4*1" 9 | 2048x1024:1024x512n"DLRM_train:FWD,5*1" 10 | 2048x256:256x1n"DLRM_train:FWD,7*1" 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_gmnt_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data mb=128 3 | 4 | 
128x512:512x512n"GNMT_train:BWD_D,0*1" 5 | 128x1024:1024x1024n"GNMT_train:BWD_D,1*1" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_gmnt_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights mb=128 3 | 4 | 512x128:128x512n"GNMT_train:BWD_W,0*1" 5 | 1024x128:128x1024n"GNMT_train:BWD_W,1*1" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_gmnt_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward mb=128 3 | 4 | 128x512:512x512n"GNMT_train:FWD,0*1" 5 | 128x1024:1024x1024n"GNMT_train:FWD,1*1" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_googlenet_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data 3 | 4 | # googlenet_v1 mb=128 5 | 128x1024:1024x2048n"googlenet_v1_train:BWD_D,ip1*1" 6 | 128x1000:1000x1024n"googlenet_v1_train:BWD_D,ip2*1" 7 | 8 | # inceptionv3 mb=224 9 | 224x1000:1000x2048n"inceptionv3_train:BWD_D,ip1*1" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_googlenet_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights 3 | 4 | # googlenet_v1 mb=128 5 | 2048x128:128x1024n"googlenet_v1_train:BWD_W,ip1*1" 6 | 
1024x128:128x1000n"googlenet_v1_train:BWD_W,ip2*1" 7 | 8 | # inceptionv3 mb=224 9 | 2048x224:224x1000n"inceptionv3_train:BWD_W,ip1*1" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_googlenet_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward 3 | 4 | # googlenet_v1 mb=128 5 | 128x2048:2048x1024n"googlenet_v1_train:FWD,ip1*1" 6 | 128x1024:1024x1000n"googlenet_v1_train:FWD,ip2*1" 7 | 8 | # inceptionv3 mb=224 9 | 224x2048:2048x1000n"inceptionv3_train:FWD,ip1*1" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_maskrcnn_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data mb=512 3 | 4 | 512x1024:1024x12544n"masknet_train:BWD_D,ip1*1" 5 | 512x1024:1024x1024n"masknet_train:BWD_D,ip2*1" 6 | 512x324:324x1024n"masknet_train:BWD_D,ip3*1" 7 | 512x81:81x1024n"masknet_train:BWD_D,ip4*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_maskrcnn_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights mb=512 3 | 4 | 12544x512:512x1024n"masknet_train:BWD_W,ip1*1" 5 | 1024x512:512x1024n"masknet_train:BWD_W,ip2*1" 6 | 1024x512:512x324n"masknet_train:BWD_W,ip3*1" 7 | 1024x512:512x81n"masknet_train:BWD_W,ip4*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_maskrcnn_fwd: 
-------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward mb=512 3 | 4 | 512x12544:12544x1024n"masknet_train:FWD,ip1*1" 5 | 512x1024:1024x1024n"masknet_train:FWD,ip2*1" 6 | 512x1024:1024x324n"masknet_train:FWD,ip3*1" 7 | 512x1024:1024x81n"masknet_train:FWD,ip4*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_ncf_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data mb=2048 3 | 4 | 2048x256:256x256n"NCF_train:BWD_D,0*1" 5 | 2048x128:128x256n"NCF_train:BWD_D,1*1" 6 | 2048x64:64x128n"NCF_train:BWD_D,2*1" 7 | 2048x1:1x128n"NCF_train:BWD_D,3*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_ncf_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights mb=2048 3 | 4 | 256x2048:2048x256n"NCF_train:BWD_W,0*1" 5 | 256x2048:2048x128n"NCF_train:BWD_W,1*1" 6 | 128x2048:2048x64n"NCF_train:BWD_W,2*1" 7 | 128x2048:2048x1n"NCF_train:BWD_W,3*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_ncf_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward mb=2048 3 | 4 | 2048x256:256x256n"NCF_train:FWD,0*1" 5 | 2048x256:256x128n"NCF_train:FWD,1*1" 6 | 2048x128:128x64n"NCF_train:FWD,2*1" 7 | 2048x128:128x1n"NCF_train:FWD,3*1" 8 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_resnet_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data 3 | 4 | # resnet mb=112 5 | 112x1000:1000x2048n"resnet_train:BWD_D,ip1*1" 6 | 7 | # resnet_sparse mb=64 8 | 64x1000:1000x2048n"resnet_sparse_train:BWD_D,ip1*1" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_resnet_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights 3 | 4 | # resnet mb=112 5 | 2048x112:112x1000n"resnet_train:BWD_W,ip1*1" 6 | 7 | # resnet_sparse mb=64 8 | 2048x64:64x1000n"resnet_sparse_train:BWD_W,ip1*1" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_resnet_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward 3 | 4 | # resnet mb=112 5 | 112x2048:2048x1000n"resnet_train:FWD,ip1*1" 6 | 7 | # resnet_sparse mb=64 8 | 64x2048:2048x1000n"resnet_sparse_train:FWD,ip1*1" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_rnn_t_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data mb=896 (16 * num_cores) 3 | 4 | 896x4096:4096x240n"RNN-T:BWD_D,Encoder_cell1_Input*2" 5 | 896x4096:4096x1024n"RNN-T:BWD_D,Encoder_cell1_Hidden*11" 6 | 
896x4096:4096x2048n"RNN-T:BWD_D,Encoder_cell3_Input*1" 7 | 896x1280:1280x320n"RNN-T:BWD_D,Prediction_Input*12" 8 | 896x512:512x1344n"RNN-T:BWD_D,JointNet_Linear1*3" 9 | 896x29:29x512n"RNN-T:BWD_D,JointNet_Linear2*3" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_rnn_t_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights mb=896 (16 * num_cores) 3 | 4 | 240x896:896x4096n"RNN-T:BWD_W,Encoder_cell1_Input*2" 5 | 1024x896:896x4096n"RNN-T:BWD_W,Encoder_cell1_Hidden*11" 6 | 2048x896:896x4096n"RNN-T:BWD_W,Encoder_cell3_Input*1" 7 | 320x896:896x1280n"RNN-T:BWD_W,Prediction_Input*12" 8 | 1344x896:896x512n"RNN-T:BWD_W,JointNet_Linear1*3" 9 | 512x896:896x29n"RNN-T:BWD_W,JointNet_Linear2*3" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_rnn_t_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward mb=896 (16 * num_cores) 3 | 4 | 896x240:240x4096n"RNN-T:FWD,Encoder_cell1_Input*2" 5 | 896x1024:1024x4096n"RNN-T:FWD,Encoder_cell1_Hidden*11" 6 | 896x2048:2048x4096n"RNN-T:FWD,Encoder_cell3_Input*1" 7 | 896x320:320x1280n"RNN-T:FWD,Prediction_Input*12" 8 | 896x1344:1344x512n"RNN-T:FWD,JointNet_Linear1*3" 9 | 896x512:512x29n"RNN-T:FWD,JointNet_Linear2*3" 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_vgg16_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data mb=64 3 | 4 | 
64x4096:4096x25088n"VGG16_train:BWD_D,ip1*1" 5 | 64x4096:4096x4096n"VGG16_train:BWD_D,ip2*1" 6 | 64x81:81x4096n"VGG16_train:BWD_D,ip3*1" 7 | 64x324:324x4096n"VGG16_train:BWD_D,ip4*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_vgg16_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights mb=64 3 | 4 | 25088x64:64x4096n"VGG16_train:BWD_W,ip1*1" 5 | 4096x64:64x4096n"VGG16_train:BWD_W,ip2*1" 6 | 4096x64:64x81n"VGG16_train:BWD_W,ip3*1" 7 | 4096x64:64x324n"VGG16_train:BWD_W,ip4*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_vgg16_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward mb=64 3 | 4 | 64x25088:25088x4096n"VGG16_train:FWD,ip1*1" 5 | 64x4096:4096x4096n"VGG16_train:FWD,ip2*1" 6 | 64x4096:4096x81n"VGG16_train:FWD,ip3*1" 7 | 64x4096:4096x324n"VGG16_train:FWD,ip4*1" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_wd_bwd_d: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt data 3 | 4 | # mb=40 5 | 40x1024:1024x845n"WnD-40_train:BWD_D,0*1" 6 | 40x512:512x1024n"WnD-40_train:BWD_D,1*1" 7 | 40x256:256x512n"WnD-40_train:BWD_D,2*1" 8 | 9 | # mb=256 10 | #256x1024:1024x845n"WnD-256_train:BWD_D,0*1" 11 | #256x512:512x1024n"WnD-256_train:BWD_D,1*1" 12 | #256x256:256x512n"WnD-256_train:BWD_D,2*1" 13 | -------------------------------------------------------------------------------- 
/tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_wd_bwd_w: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training backward wrt weights 3 | 4 | # mb=40 5 | 845x40:40x1024n"WnD-40_train:BWD_W,0*1" 6 | 1024x40:40x512n"WnD-40_train:BWD_W,1*1" 7 | 512x40:40x256n"WnD-40_train:BWD_W,2*1" 8 | 9 | # mb=256 10 | #845x256:256x1024n"WnD-256_train:BWD_W,0*1" 11 | #1024x256:256x512n"WnD-256_train:BWD_W,1*1" 12 | #512x256:256x256n"WnD-256_train:BWD_W,2*1" 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_converted_ip_tr_wd_fwd: -------------------------------------------------------------------------------- 1 | # These problems are ported from corresponding inner product shapes for 2 | # training forward 3 | 4 | # mb=40 5 | 40x845:845x1024n"WnD-40_train:FWD,0*1" 6 | 40x1024:1024x512n"WnD-40_train:FWD,1*1" 7 | 40x512:512x256n"WnD-40_train:FWD,2*1" 8 | 9 | # mb=256 10 | #256x845:845x1024n"WnD-256_train:FWD,0*1" 11 | #256x1024:1024x512n"WnD-256_train:FWD,1*1" 12 | #256x512:512x256n"WnD-256_train:FWD,2*1" 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_multidim: -------------------------------------------------------------------------------- 1 | # Broadcast shapes > 4D 2 | 22x3x1x8x14:22x1x4x14x8 3 | 13x1x1x8x16:1x13x8x16x8 4 | 2x3x1x1x6x2x3:1x1x4x5x6x3x4 5 | 2x1x7x3x1x6x9:2x6x7x3x4x9x6 6 | 3x5x2x9x4x7x3:1x5x2x1x4x3x9 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_sparse: -------------------------------------------------------------------------------- 1 | 1x1000005:1000005x6 2 | 1x1000010:1000010x16 3 | 1x1000025:1000025x17 4 | 1x1000045:1000045x32 5 | 1x1000075:1000075x33 6 | 1x1000100:1000100x64 7 | 1x1000120:1000120x65 8 | 1x1000300:1000300x72 
9 | 1x1000500:1000500x96 10 | 1x1000600:1000600x100 11 | 1x1000700:1000700x128 12 | 1x1000800:1000800x131 13 | 14 | 4x1000005:1000005x6 15 | 4x1000010:1000010x16 16 | 4x1000025:1000025x17 17 | 4x1000045:1000045x32 18 | 4x1000075:1000075x33 19 | 4x1000100:1000100x64 20 | 4x1000120:1000120x65 21 | 4x1000300:1000300x72 22 | 4x1000500:1000500x96 23 | 4x1000600:1000600x100 24 | 4x1000700:1000700x128 25 | 4x1000800:1000800x131 26 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_sparse_packed: -------------------------------------------------------------------------------- 1 | 1x64:64x128 2 | 2x64:64x128 3 | 2x128:128x128 4 | 2x300:300x128 5 | 2x300:300x129 6 | 256x256:256x256 7 | 2x1024:1024x128 8 | 2x1030:1030x128 9 | 2x1030:1030x200 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/shapes_transformer: -------------------------------------------------------------------------------- 1 | # multihead self-attention layer 2 | # mb = 1, num_heads = 16, hidden_size = 1024, t_x = t_y = 40 3 | 16x40x64:16x64x40 4 | 16x40x40:16x40x64 5 | # mb = 128, num_heads = 16, hidden_size = 1024, t_x = t_y = 40 6 | 2048x40x64:2048x64x40 7 | 2048x40x40:2048x40x64 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/test_matmul_bf32_bf16: -------------------------------------------------------------------------------- 1 | # bf32 2 | --reset 3 | --skip-impl=ref,x64:gemm 4 | --dt=f32 --attr-fpmath=bf16 5 | --stag=ab,ba --wtag=ab,ba --dtag=ab 6 | 7 | # test any + blocked 8 | --stag=any --wtag=any,BA16a64b,BA16a48b,BA16a32b,BA16a16b --dtag=any 9 | --batch=shapes_2d 10 | 11 | # 3d 12 | --reset 13 | --skip-impl=ref,x64:gemm 14 | --dt=f32 --attr-fpmath=bf16 15 | --stag=abc,acb --wtag=abc,acb --dtag=abc 16 | --batch=shapes_3d 17 | 2x20x30:2x30x4 18 | 2x20x30:1x30x4 19 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/test_matmul_bfloat16_ymm: -------------------------------------------------------------------------------- 1 | # bf16 2 | 3 | # global benchdnn knob, will not be reset again 4 | --cpu-isa-hints=prefer_ymm 5 | --batch=test_matmul_bfloat16 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/test_matmul_llm_gpu: -------------------------------------------------------------------------------- 1 | --reset 2 | --batch=option_set_fwks_llm_gpu 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/test_matmul_sparse: -------------------------------------------------------------------------------- 1 | --reset 2 | --dt=f16:f16:f16,f32:f32:f32 3 | --dtag=ab 4 | --encoding=csr+0.9::,:csr+0.9: 5 | --batch=shapes_sparse 6 | 7 | --reset 8 | --dt=f16:f16:f16,f32:f32:f32 9 | --wtag=ab,ba 10 | --dtag=ab 11 | --encoding=coo+0.9::,:coo+0.9: 12 | --batch=shapes_sparse 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/test_matmul_sparse_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | --dt=f16:f16:f16,f32:f32:f32 3 | --dtag=ab 4 | --encoding=csr+0.99::,:csr+0.99: 5 | --batch=shapes_sparse 6 | 7 | --reset 8 | --dt=f16:f16:f16,f32:f32:f32 9 | --dtag=ab 10 | --encoding=coo+0.99::,:coo+0.99: 11 | --batch=shapes_sparse 12 | 13 | --dt=u8:s8:s32,s8:s8:s32,u8:s8:f32,s8:s8:f32 14 | --encoding=:packed+0.99:,:packed+0.5:,:packed+0.0:,:packed+1.0: 15 | --batch=shapes_sparse_packed 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/matmul/test_matmul_sparse_gpu: -------------------------------------------------------------------------------- 1 | --reset 2 | --batch=test_matmul_sparse 3 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/harness_pool_regression: -------------------------------------------------------------------------------- 1 | # This shape dispatches to jit:ir and has 2 | # (1) no extra zero-padding, so no zero-out statement, and 3 | # (2) a number of channels that requires a 3-register (mod 4) accumulation 4 | # buffer (ic=208 has the same issue) 5 | --reset --dir=FWD_I --alg=pooling_max --dt=f16:f16 --tag=aBcd16b mb1ic80_ih160oh160kh3sh1dh0ph1_iw160ow160kw3sw1dw0pw1 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/harness_pool_smoke_ref: -------------------------------------------------------------------------------- 1 | --reset 2 | --skip-impl=ir # Intentionally test ocl impl coverage 3 | --check-ref-impl= 4 | --match=.*pool_ci_2d.* # Use 2d problems only from shapes_basic 5 | --mb=2 6 | --tag=axb 7 | --alg=max,avg_np,avg_p 8 | 9 | # Training 10 | --dt=f32,bf16,f16 11 | --dir=FWD_D,BWD_D 12 | --batch=shapes_basic 13 | 14 | # Inference 15 | --dir=FWD_I 16 | --tag=axb 17 | --dt=f16,s8,u8 18 | --attr-post-ops=,add:f32:per_oc 19 | --batch=shapes_basic 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/harness_pooling_different_dt: -------------------------------------------------------------------------------- 1 | --reset 2 | --mb=2 3 | --dt=s8:u8,u8:s8,u8:f32,f32:u8,s8:f32,f32:s8,u8:f16,f16:u8,s8:f16,f16:s8 4 | --dir=FWD_I 5 | --tag=axb 6 | --alg=max,avg_np,avg_p 7 | --attr-post-ops=,add:f32:per_oc 8 | --batch=shapes_1d 9 | --batch=shapes_2d_small -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/perf_pool_gpu: -------------------------------------------------------------------------------- 1 | # mb=1, inference only 2 | --reset 3 | --mb=1 4 | 5 | --dt=f32,s8 6 | --dir=FWD_I 7 | 
--alg=max,avg_np,avg_p 8 | --tag=axb,aBx16b 9 | 10 | --batch=set_topologies_gpu 11 | 12 | # mb>1, inference only 13 | --reset 14 | --mb=16,32,64 15 | 16 | --dt=f32,s8 17 | --dir=FWD_I 18 | --alg=max,avg_np,avg_p 19 | --tag=axb,ABx16a16b 20 | 21 | --batch=set_topologies_gpu 22 | 23 | # mb>1, training only 24 | --reset 25 | --mb=16,32,64 26 | 27 | --dt=f32 28 | --dir=FWD_D,BWD_D 29 | --alg=max,avg_np,avg_p 30 | --tag=axb,ABx16a16b 31 | 32 | --batch=set_topologies_gpu 33 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/set_all: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d 2 | --batch=shapes_2d 3 | --batch=shapes_3d 4 | 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/set_all_small: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d 2 | --batch=shapes_2d_small 3 | --batch=shapes_3d_small 4 | 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/set_topologies: -------------------------------------------------------------------------------- 1 | --batch=shapes_3d_unet 2 | --batch=shapes_alexnet 3 | --batch=shapes_googlenet_v1 4 | --batch=shapes_googlenet_v3 5 | --batch=shapes_i3d_resnet50_v1 6 | --batch=shapes_resnet_50 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/set_topologies_gpu: -------------------------------------------------------------------------------- 1 | --batch=shapes_alexnet 2 | --batch=shapes_googlenet_v1 3 | --batch=shapes_googlenet_v3 4 | --batch=shapes_resnet_50 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/shapes_3d_unet: -------------------------------------------------------------------------------- 1 | # 3d unet 2 | 3 | 
mb1ic64_id64od32_kd2sd2n"3d_unet:max_pool1" 4 | mb1ic128_id28od14_kd2sd2n"3d_unet:max_pool2" 5 | mb1ic256_id12od6_kd2sd2n"3d_unet:max_pool3" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/shapes_alexnet: -------------------------------------------------------------------------------- 1 | # alexnet 2 | 3 | mb256ic96_ih55oh27_kh3sh2n"alexnet:max_pool1" 4 | mb256ic256_ih27oh13_kh3sh2n"alexnet:max_pool2" 5 | mb256ic256_ih13oh6_kh3sh2n"alexnet:max_pool5" 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/shapes_global_pooling: -------------------------------------------------------------------------------- 1 | mb2ic5_iw11ow1_kw11sw11pw0_ih7oh1_kh7sh7ph0 2 | mb2ic5_iw11ow1_kw11sw11pw0_ih1oh1_kh1sh1ph0 3 | mb2ic5_iw11ow1_kw11sw11pw0_ih1oh1_kh1sh1ph0_iw15ow1_kw15sw15pw0 4 | mb4ic32_iw32ow1_kw32sw32pw0_ih32oh1_kh32sh32ph0 5 | mb4ic16_iw8ow1_kw8sw8pw0_ih16oh1_kh16sh16ph0 6 | mb16ic16_iw8ow1_kw8sw8pw0_ih4oh1_kh4sh4ph0 7 | mb32ic16_iw8ow1_kw8sw8pw0_ih4oh1_kh4sh4ph0 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/shapes_i3d_resnet50_v1: -------------------------------------------------------------------------------- 1 | # i3d resnet50 v1 2 | 3 | mb1ic64_id16od8kd1sd2pd0_ih112oh56kh3sh2ph1_iw112ow56kw3sw2pw1_n"i3d_resnet50_v1:max_pool1" 4 | mb1ic256_id8od4kd2sd2pd0_ih56oh56kh1sh1ph0_iw56ow56kw1sw1pw0_n"i3d_resnet50_v1:max_pool2" 5 | mb1ic2048_id4od1kd4sd1pd0_ih7oh1kh7sh1ph0_iw7ow1kw7sw1pw0_n"i3d_resnet50_v1:max_pool3" 6 | 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/shapes_large_pool: -------------------------------------------------------------------------------- 1 | # Large iw 2 | mb1ic32iw134217732kw7sw5 3 | mb1ic1iw4294967311kw7sw5 4 | 5 | # Large mb 6 | mb4294967311ic1iw1pw1kw3 7 | 8 | # Large ic 9 | 
mb1ic4294967311iw1pw1kw3 10 | mb1ic4294967311iw1kw1 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/shapes_resnet_50: -------------------------------------------------------------------------------- 1 | # resnet_50 2 | 3 | mb50ic64_ih112oh56_kh3sh2n"resnet_50:max_pool1" 4 | # mb50ic256_ih56oh28_khXshXn"resnet_50:res2c_max_pool" 5 | # mb50ic512_ih28oh14_khXshXn"resnet_50:res3d_max_pool" 6 | # mb50ic1024_ih14oh7_khXshXn"resnet_50:res4f_max_pool" 7 | mb50ic2048_ih7oh1_kh7sh1n"resnet_50:ave_pool5" 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/test_pool_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | --mb=2 3 | 4 | --alg=max,avg_np,avg_p 5 | # Training 6 | --dt=f32,bf16,f16 7 | --dir=FWD_D,BWD_D 8 | --tag=abx,axb 9 | --batch=shapes_basic 10 | 11 | # Inference 12 | --dir=FWD_I 13 | --tag=axb 14 | 15 | ## All inference configs 16 | --dt=f32,bf16,f16,s32,s8,u8, \ 17 | s8:u8,u8:s8,s8:f32,f32:s8,u8:f32,f32:u8,s8:f16,f16:s8,u8:f16,f16:u8 18 | --batch=shapes_basic 19 | 20 | ## Attributes 21 | --dt=f32,bf16,f16,s32,s8,u8 22 | --attr-post-ops=add:f32:per_oc,linear:0.5:-1 23 | --batch=shapes_basic 24 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/test_pool_float16: -------------------------------------------------------------------------------- 1 | # f16 2 | --reset 3 | --mb=2 4 | 5 | --dt=f16 6 | --alg=max,avg_np,avg_p 7 | # Training 8 | --tag=abx,axb 9 | 10 | --dir=FWD_D,BWD_D 11 | --batch=set_all 12 | --batch=set_topologies 13 | 14 | --dir=FWD_D 15 | --attr-post-ops=add:f32:per_oc,linear:0.5:-1 16 | --batch=set_all_small 17 | 18 | # Inference 19 | --dir=FWD_I 20 | --tag=axb 21 | --batch=set_all 22 | 23 | --attr-post-ops=add:f16,linear:0.5:-1 24 | --batch=set_all_small 25 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/test_pool_fp8: -------------------------------------------------------------------------------- 1 | # fp8 2 | --reset 3 | --mb=2 4 | 5 | --dt=f8_e5m2,f8_e4m3 6 | --alg=max,avg_np,avg_p 7 | # Training 8 | --tag=abx,axb,aBx16b 9 | 10 | --dir=FWD_D 11 | --batch=set_all 12 | --batch=set_topologies 13 | 14 | --dir=FWD_D 15 | --attr-post-ops=add:f32:per_oc,linear:0.5:-1 16 | --batch=set_all_small 17 | 18 | # Inference 19 | --dir=FWD_I 20 | --tag=axb 21 | --batch=set_all 22 | 23 | --attr-post-ops=add:f16,linear:0.5:-1 24 | --batch=set_all_small 25 | 26 | --attr-post-ops=add:f8_e5m2,linear:0.5:-1 27 | --batch=set_all_small 28 | 29 | --attr-post-ops=add:f8_e4m3,linear:0.5:-1 30 | --batch=set_all_small 31 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/test_pool_large_gpu: -------------------------------------------------------------------------------- 1 | # Implicitly test FWD_D via BWD_D 2 | --dir=FWD_I,BWD_D 3 | --dt=bf16:bf16 4 | --alg=max,avg_p 5 | --tag=axb 6 | 7 | --impl=jit 8 | --batch=shapes_large_pool 9 | 10 | # Test both gen_pooling and xe_global pooling in the same pass 11 | --impl=xe 12 | --batch=shapes_large_pool 13 | 14 | --impl=ref 15 | --batch=shapes_large_pool 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/pool/test_pool_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*pool_ci_2d.* # Use 2d problems only from shapes_basic 4 | --mb=2 5 | --tag=axb 6 | --alg=max,avg_np,avg_p 7 | 8 | # Training 9 | --dt=f32,bf16,f16 10 | --dir=FWD_D,BWD_D 11 | --batch=shapes_basic 12 | 13 | # Inference 14 | --dir=FWD_I 15 | --tag=axb 16 | --dt=f16,s8,u8 17 | --attr-post-ops=,add:f32:per_oc 18 | --batch=shapes_basic 19 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/prelu/option_set_all: -------------------------------------------------------------------------------- 1 | # tag:any 2 | --stag=abx:any, \ 3 | axb:any, \ 4 | aBx8b:any, \ 5 | aBx16b:any 6 | --batch=shapes_all 7 | 8 | # tag:tag 9 | --stag=abx:abx, \ 10 | axb:axb, \ 11 | aBx8b:aBx8b, \ 12 | aBx16b:aBx16b 13 | --batch=shapes_all 14 | 15 | # tag1:tag2 16 | --stag=abx:axb, \ 17 | axb:aBx8b, \ 18 | aBx16b:abx 19 | --batch=shapes_all 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/prelu/test_prelu_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dir=FWD_D,BWD_DW 4 | --sdt=f32,s32,s8,u8 5 | --batch=option_set_all 6 | 7 | --batch=test_prelu_bfloat16 8 | 9 | --batch=test_prelu_float16 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/prelu/test_prelu_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dir=FWD_D,BWD_DW 4 | --sdt=bf16:f32,bf16 5 | --batch=option_set_all 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/prelu/test_prelu_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --stag=abx:any,axb:any,abx:abx,axb:axb 4 | 5 | --dir=FWD_D,BWD_DW 6 | --sdt=f32,bf16:f32,bf16,f16,f16:f32 7 | --batch=shapes_ci 8 | 9 | --dir=FWD_I 10 | --sdt=s8:s8,u8:u8,s8:bf16,u8:bf16,s8:f32,u8:f32 11 | --batch=shapes_ci 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/prelu/test_prelu_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dir=FWD_D,BWD_DW 4 | --sdt=f16:f32,f16 5 | --batch=option_set_all 6 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/prelu/test_prelu_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*prelu_ci_2d.* # Use 2d problems only from shapes_ci 4 | --stag=axb:any 5 | 6 | --dir=FWD_D,BWD_DW 7 | --sdt=f32,bf16,f16 8 | --batch=shapes_ci 9 | 10 | --dir=FWD_I 11 | --sdt=s8,u8 12 | --batch=shapes_ci 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/harness_reduction_bf16: -------------------------------------------------------------------------------- 1 | # bf16 2 | --reset 3 | 4 | --sdt=bf16 --ddt=bf16,f32 5 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc 6 | --batch=option_set_all_algs 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/harness_reduction_f16: -------------------------------------------------------------------------------- 1 | # f16 2 | --reset 3 | 4 | --sdt=f16 --ddt=f16,f32 5 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc 6 | --batch=option_set_all_algs 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/harness_reduction_f32: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset 3 | 4 | --sdt=f32 --ddt=f32 5 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc 6 | --batch=option_set_all_algs 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/harness_reduction_i8: -------------------------------------------------------------------------------- 1 | # i8 2 | --reset 3 | 4 | --attr-post-ops=,sum+linear:2:1+add:f32,add:f32:per_oc 5 | --sdt=u8 --ddt=u8,s32,f32 6 | --batch=option_set_all_algs_int8 7 | 8 | --sdt=s8 --ddt=s8,s32,f32 9 | --batch=option_set_all_algs_int8 10 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/option_set_all_algs: -------------------------------------------------------------------------------- 1 | # Algorithm coverage based on p and eps validity 2 | --p=1,2 --eps=0.5 3 | --alg=norm_lp_max,norm_lp_sum,norm_lp_power_p_max,norm_lp_power_p_sum 4 | --batch=option_set_all 5 | 6 | --p= --eps= 7 | --alg=sum,mul,max,min,mean 8 | --batch=option_set_all 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/option_set_all_algs_ci: -------------------------------------------------------------------------------- 1 | # Algorithm coverage based on p and eps validity 2 | --p=1,2 --eps=0.5 3 | --alg=norm_lp_max,norm_lp_sum,norm_lp_power_p_max,norm_lp_power_p_sum 4 | --batch=shapes_ci 5 | 6 | --p= --eps= 7 | --alg=sum,mul,max,min,mean 8 | --batch=shapes_ci 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/option_set_all_algs_int8: -------------------------------------------------------------------------------- 1 | # i8 2 | --alg=sum,mul,max,min,mean 3 | --batch=option_set_all 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/option_set_all_algs_int8_ci: -------------------------------------------------------------------------------- 1 | # i8 2 | --alg=sum,mul,max,min,mean 3 | --batch=shapes_ci 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/perf_reduction_gpu: -------------------------------------------------------------------------------- 1 | # Test layers of some key & extended GPU DL Frameworks 2 | --reset 3 | --batch=option_set_fwks_key_gpu 4 | 5 | --reset 6 | --batch=option_set_fwks_ext_gpu 7 | 8 | # Test nested cases 9 | --reset 10 | --batch=shapes_nested_gpu 11 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/shapes_ci: -------------------------------------------------------------------------------- 1 | 5x3x2x9:1x3x2x9 2 | 32x17x2x3:32x17x1x1 3 | 32x17x2x3:1x17x1x1 4 | 15x12x3x5:15x1x1x1 5 | 15x12x3x5:1x1x1x1 6 | 12x12:1x12 7 | 10x16x32:10x1x32 8 | 1x17x64:1x1x64 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/shapes_gpu_all: -------------------------------------------------------------------------------- 1 | # ND single-dim reduction 2 | 10000:1 3 | 8000x32:1x32 4 | 32x4096x16:32x1x16 5 | 2x52x128x36:2x1x128x36 6 | 17x17x15x15x35:1x17x15x15x35 7 | 11x23x16x16x7x2:11x1x16x16x7x2 8 | 9 | # 2-dim reduction 10 | 16x3x3x30:16x1x1x30 11 | 16x128x4x64:16x128x1x1 12 | 128x53x17:128x1x1 13 | 14 | # 3-dim reduction 15 | 4x16x3x4:1x1x1x4 16 | 4x16x3x4:1x1x3x1 17 | 4x16x3x4:1x16x1x1 18 | 4x16x3x4:4x1x1x1 19 | 20 | # Full reduction 21 | 4192x17:1x1 22 | 36x640x4:1x1x1 23 | 16x16x32x16:1x1x1x1 24 | 25 | # Split reduction 26 | 16x16x32x32:1x16x1x32 27 | 16x16x32x32:16x1x32x1 28 | 5x15x7x17x9:1x15x1x17x1 29 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/test_reduction_all: -------------------------------------------------------------------------------- 1 | # all 2 | --reset 3 | 4 | --batch=harness_reduction_f32 5 | --batch=harness_reduction_i8 6 | --batch=test_reduction_bfloat16 7 | --batch=test_reduction_float16 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/test_reduction_bfloat16: -------------------------------------------------------------------------------- 1 | # bf16 2 | --reset 3 | 4 | --batch=harness_reduction_bf16 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/test_reduction_ci: 
-------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --stag=abx,axb --dtag=abx,axb,any 4 | --attr-post-ops=,sum+linear:2:1+add:f32 5 | 6 | --sdt=f32 --ddt=f32 7 | --batch=option_set_all_algs_ci 8 | 9 | --sdt=bf16 --ddt=bf16,f32 10 | --batch=option_set_all_algs_ci 11 | 12 | --sdt=f16 --ddt=f16,f32 13 | --batch=option_set_all_algs_ci 14 | 15 | --sdt=s8 --ddt=s8,s32,f32 16 | --batch=option_set_all_algs_int8_ci 17 | 18 | --sdt=u8 --ddt=u8,s32,f32 19 | --batch=option_set_all_algs_int8_ci 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/test_reduction_float16: -------------------------------------------------------------------------------- 1 | # f16 2 | --reset 3 | 4 | --batch=harness_reduction_f16 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reduction/test_reduction_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --stag=axb --dtag=any 4 | --attr-post-ops=,sum+linear:2:1+add:f32 5 | 6 | --sdt=f32 --ddt=f32 7 | --batch=option_set_all_algs_ci 8 | 9 | --sdt=bf16 --ddt=bf16 10 | --batch=option_set_all_algs_ci 11 | 12 | --sdt=f16 --ddt=f16 13 | --batch=option_set_all_algs_ci 14 | 15 | --sdt=s8 --ddt=s8 16 | --batch=option_set_all_algs_int8_ci 17 | 18 | --sdt=u8 --ddt=u8 19 | --batch=option_set_all_algs_int8_ci 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reorder/harness_reorder_large: -------------------------------------------------------------------------------- 1 | # test if jit kernels properly handle corner cases: 2 | # * large stride problems 3 | # * huge dimensions (UINT_MAX + 1) 4 | --reset 5 | --skip-impl=ref,simple # run only jit impl, won't iterate 6 | --sdt=f32 7 | --ddt=f32 8 | --stag=abx 9 | --dtag=aBx8b 10 | 2x16x19200x19200 11 | 1x4294967296x1 12 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/reorder/harness_reorder_regression: -------------------------------------------------------------------------------- 1 | # Blocked format with tail processing in compensation 2 | --reset 3 | --sdt=s8 --ddt=s8 --oflag=zp_comp:3 4 | --stag=aBxC16b4c --dtag=xcab 5 | 2x2x32x1x3 1x15x32x1 2x17x32x5 6 | 7 | # test if jit kernel applies zero-point 8 | --reset 9 | --skip-impl=ref,simple # ! test jit version only 10 | --sdt=u8 --ddt=f32 11 | --stag=abdc --dtag=abcd 12 | --attr-zero-points=src0:common:1 13 | 1x32x128x33 14 | 15 | # Test bf16 with aBcde4b format 16 | --reset 17 | --skip-impl=simple #skip non-jit version 18 | --sdt=bf16 --ddt=bf16 19 | --stag=aBcde4b --dtag=aBcde4b 20 | 2x24x19x19x19 21 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reorder/harness_reorder_saturation: -------------------------------------------------------------------------------- 1 | # basic checks for saturation 2 | --reset 3 | 4 | --sdt=f32,s32,s8,u8 5 | --ddt=f32,s32,s8,u8 6 | 7 | --attr-scales=src:common:4294967295 8 | --stag=abx,axb,aBx4b,aBx8b,aBx16b 9 | --dtag=abx,axb,aBx4b,aBx8b,aBx16b 10 | 1x17x9x5 2x64x3x3 11 | 12 | # checks for int overflow 13 | --reset 14 | 15 | --sdt=s32 16 | --ddt=f32,s8 17 | 18 | --attr-scales=src:common:4294967295 19 | --attr-zero-points=src:common:1 20 | --stag=abx,axb,aBx4b,aBx8b,aBx16b 21 | --dtag=abx,axb,aBx4b,aBx8b,aBx16b 22 | 1x17x9x5 23 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/reorder/test_reorder_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --sdt=f32,bf16,f16,s32,s8,u8 4 | --ddt=f32,bf16,f16,s32,s8,u8 5 | --attr-scales=,src:per_dim_1+dst:per_dim_1 6 | --attr-zero-points=,src:common:-1+dst:common:2 7 | --attr-post-ops=,sum:0.5 8 | --runtime-dim-mask=0,63 9 | 
--stag=abx 10 | --dtag=axb 11 | --oflag= 12 | 2x16x3x4 1x17x5x3 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/set_all: -------------------------------------------------------------------------------- 1 | --batch=shapes_1d 2 | --batch=shapes_2d 3 | --batch=shapes_3d 4 | --batch=shapes_maskrcnn 5 | 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/shapes_1d: -------------------------------------------------------------------------------- 1 | # random problems 2 | 3 | # upsampling 4 | mb16ic2_iw2_ow6 5 | mb16ic64_iw32_ow64 6 | mb1ic32_iw151_ow300 7 | mb4ic17_iw17_ow20 8 | 9 | # downsampling 10 | mb1ic8_iw14_ow7 11 | mb1ic3_iw20_ow17 12 | mb1ic1_iw21_ow13 13 | mb1ic32_iw32_ow6 14 | mb2ic5_iw42_ow14 15 | mb1ic23_iw525_ow5 16 | 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/shapes_2d: -------------------------------------------------------------------------------- 1 | # random problems 2 | 3 | # upsampling 4 | mb1ic8_ih6oh6_iw24ow24 5 | mb1ic8_ih3oh6_iw7ow21 6 | mb2ic16_ih5oh3_iw10ow11 7 | mb12ic8_ih5oh3_iw7ow11 8 | 9 | # downsampling 10 | mb1ic8_ih14oh14_iw7ow7 11 | mb1ic8_ih14oh6_iw7ow3 12 | mb2ic16_ih15oh73_iw10ow11 13 | mb12ic8_ih15oh3_iw5ow11 14 | 15 | # mixed 16 | mb16ic18_ih14oh6_iw7ow12 17 | mb16ic8_ih4oh63_iw9ow13 18 | 19 | # tails for blocked format 20 | mb4ic17_ih6oh7_iw12ow14 21 | mb4ic23_ih60oh60_iw30ow75 22 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/shapes_3d: -------------------------------------------------------------------------------- 1 | # random problems 2 | 3 | # upsampling 4 | ic8_id2ih4iw6_od4oh8ow12 5 | ic9_id3ih6iw4_od4oh8ow9 6 | 7 | # downsampling 8 | ic16_id6ih6iw6_od3oh3ow3 9 | ic19_id6ih6iw6_od4oh4ow4 10 | 11 | # mixed 12 | mb4ic16_id6ih6iw6_od12oh6ow3 13 | 
mb32ic32_id31ih50iw31_od16oh77ow16 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/shapes_maskrcnn: -------------------------------------------------------------------------------- 1 | mb1ic256ih34iw25oh68ow50n"maskrcnn1" 2 | mb1ic256ih68iw50oh136ow100n"maskrcnn2" 3 | mb1ic256ih136iw100oh272ow200n"maskrcnn3" -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/test_resampling_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # bf16 4 | --mb=2 5 | --sdt=bf16 6 | --ddt=bf16 7 | --dir=FWD_D,BWD_D 8 | --alg=nearest,linear 9 | --tag=abx,axb,aBx8b,aBx16b 10 | --batch=set_all 11 | 12 | # post-ops 13 | --dir=FWD_D 14 | --sdt=bf16 15 | --ddt=bf16 16 | --attr-post-ops=add:bf16 17 | --batch=shapes_ci 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/test_resampling_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --mb=2 4 | --tag=abx,axb 5 | --alg=nearest,linear 6 | 7 | --dir=FWD_D 8 | --attr-post-ops=,sum+add:f32 9 | 10 | --sdt=f32 --ddt=f32,s8 11 | --batch=shapes_ci 12 | 13 | --sdt=bf16 --ddt=bf16 14 | --batch=shapes_ci 15 | 16 | --sdt=f16 --ddt=f16 17 | --batch=shapes_ci 18 | 19 | # backward 20 | --dir=BWD_D 21 | --attr-post-ops= 22 | 23 | --sdt=f32 --ddt=f32 24 | --batch=shapes_ci 25 | 26 | --sdt=bf16 --ddt=bf16 27 | --batch=shapes_ci 28 | 29 | --sdt=f16 --ddt=f16 30 | --batch=shapes_ci 31 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/test_resampling_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # f16 4 | --mb=2 5 | --sdt=f16 6 | --ddt=f16 7 | --dir=FWD_D,BWD_D 8 | --alg=nearest,linear 9 | --tag=abx,axb 10 | --batch=set_all 
11 | 12 | # post ops 13 | --dir=FWD_D 14 | --sdt=f16 15 | --ddt=f16 16 | --attr-post-ops=add:f16 17 | --batch=shapes_ci 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/resampling/test_resampling_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*resampling_ci_2d.* # Use 2d problems only from shapes_ci 4 | --mb=2 5 | --tag=axb 6 | --alg=nearest,linear 7 | 8 | --dir=FWD_D 9 | --attr-post-ops=,sum+add:f32 10 | 11 | --sdt=f32 --ddt=f32 12 | --batch=shapes_ci 13 | 14 | --sdt=bf16 --ddt=bf16 15 | --batch=shapes_ci 16 | 17 | --sdt=f16 --ddt=f16 18 | --batch=shapes_ci 19 | 20 | # backward 21 | --dir=BWD_D 22 | --attr-post-ops= 23 | 24 | --sdt=f32 --ddt=f32 25 | --batch=shapes_ci 26 | 27 | --sdt=bf16 --ddt=bf16 28 | --batch=shapes_ci 29 | 30 | --sdt=f16 --ddt=f16 31 | --batch=shapes_ci 32 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_augru_bf32: -------------------------------------------------------------------------------- 1 | # bf32 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_AUGRU,LBR_AUGRU 6 | --activation=UNDEF 7 | --attr-fpmath=bf16 8 | --prop=FWD_I 9 | 10 | # small problems 11 | --direction=left2right 12 | --batch=option_set_small 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_augru_bfloat16: -------------------------------------------------------------------------------- 1 | # bf16 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_AUGRU,LBR_AUGRU 6 | --activation=UNDEF 7 | --cfg=bf16f32,bf16 8 | --prop=FWD_I,BWD_DW 9 | 10 | # small problems 11 | --direction=left2right 12 | --batch=option_set_small 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_augru_float16: 
-------------------------------------------------------------------------------- 1 | # f16 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_AUGRU,LBR_AUGRU 6 | --activation=UNDEF 7 | --cfg=f16f32,f16 8 | --prop=FWD_I,BWD_DW 9 | 10 | # small problems 11 | --direction=left2right 12 | --batch=option_set_small 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_gru_bf32: -------------------------------------------------------------------------------- 1 | # bf32 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_GRU,LBR_GRU 6 | --activation=UNDEF 7 | --attr-fpmath=bf16 8 | --prop=FWD_I 9 | 10 | # small problems 11 | --direction=left2right,right2left,concat,sum 12 | --batch=option_set_small 13 | 14 | # large problems 15 | --direction=left2right 16 | --batch=option_set_large 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_gru_bfloat16: -------------------------------------------------------------------------------- 1 | # bf16 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_GRU,LBR_GRU 6 | --activation=UNDEF 7 | --cfg=bf16f32,bf16 8 | --prop=FWD_I,BWD_DW 9 | 10 | # small problems 11 | --direction=left2right,right2left,concat,sum 12 | --batch=option_set_small 13 | 14 | # large problems 15 | --direction=left2right 16 | --batch=option_set_large 17 | 18 | --prop=BWD_DW 19 | --flags=O 20 | 21 | # small problems 22 | --direction=left2right,right2left,concat,sum 23 | --batch=option_set_small 24 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_gru_f32: -------------------------------------------------------------------------------- 1 | # f32 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_GRU,LBR_GRU 6 | --activation=UNDEF 7 | --cfg=f32 8 | --prop=FWD_I,BWD_DW 9 | 10 | # small problems 11 | 
--direction=left2right,right2left,concat,sum 12 | --batch=option_set_small 13 | 14 | # large problems 15 | --direction=left2right 16 | --batch=option_set_large 17 | 18 | --prop=BWD_DW 19 | --flags=O 20 | 21 | # small problems 22 | --direction=left2right,right2left,concat,sum 23 | --batch=option_set_small 24 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_gru_float16: -------------------------------------------------------------------------------- 1 | # f16 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_GRU,LBR_GRU 6 | --activation=UNDEF 7 | --cfg=f16f32,f16 8 | --prop=FWD_I,BWD_DW 9 | 10 | # small problems 11 | --direction=left2right,right2left,concat,sum 12 | --batch=option_set_small 13 | 14 | # large problems 15 | --direction=left2right 16 | --batch=option_set_large 17 | 18 | --prop=BWD_DW 19 | --flags=O 20 | 21 | # small problems 22 | --direction=left2right,right2left,concat,sum 23 | --batch=option_set_small 24 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_gru_int8: -------------------------------------------------------------------------------- 1 | # int8 2 | --reset 3 | 4 | --trivial-strides=true 5 | --prop=FWD_I 6 | --alg=VANILLA_GRU 7 | --activation=UNDEF 8 | 9 | # small problems 10 | --cfg=u8u8u8u8,u8u8u8f32,f32u8f32u8,f32u8f32f32 11 | --direction=left2right,right2left,concat,sum 12 | --scaling=common,per_oc 13 | --batch=option_set_small 14 | 15 | # large problems 16 | --cfg=u8u8u8u8 17 | --direction=left2right 18 | --scaling=per_oc 19 | --batch=option_set_large 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_gru_regression: -------------------------------------------------------------------------------- 1 | # int8 SIC != SLC 2 | --reset 3 | --trivial-strides=true --prop=FWD_I --alg=VANILLA_GRU --activation=UNDEF 4 | 
--direction=left2right --cfg=u8u8u8f32 l1t47mb100sic128slc256dhc128dic128 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_lstm_bf32: -------------------------------------------------------------------------------- 1 | # bf32 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --prop=FWD_I 6 | --alg=VANILLA_LSTM 7 | --activation=UNDEF 8 | --attr-fpmath=bf16 9 | --with-peephole=false,true 10 | --with-projection=false,true 11 | 12 | # small problems 13 | --direction=left2right,right2left,concat,sum 14 | --batch=option_set_small 15 | --batch=option_set_lstmp_small 16 | 17 | # large problems 18 | --direction=left2right 19 | --batch=option_set_large 20 | --batch=option_set_lstmp_large 21 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/harness_rnn_bf32: -------------------------------------------------------------------------------- 1 | # bf32 2 | --reset 3 | 4 | --trivial-strides=true,false 5 | --alg=VANILLA_RNN 6 | --attr-fpmath=bf16 7 | --prop=FWD_I 8 | 9 | # small test case to check GEMM and non-GEMM ops accuracy 10 | --direction=left2right,right2left,concat,sum 11 | --activation=RELU,TANH,LOGISTIC 12 | --batch=option_set_small 13 | 14 | # large cases - test linear activations 15 | --direction=left2right 16 | --activation=RELU 17 | --batch=option_set_large 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_gnmt_decoder: -------------------------------------------------------------------------------- 1 | --direction=left2right 2 | l1t1sic512slc768dhc512n"GNMT:decoder_0" 3 | l1t1sic512slc1024dhc512n"GNMT:decoder_1" -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_large: -------------------------------------------------------------------------------- 1 | # option set to run all reasonable large 
shapes 2 | 3 | --skip-nonlinear=true 4 | --tag=tnc:any:tnc,ntc:any:ntc 5 | --batch=shapes_large 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_lstmp_large: -------------------------------------------------------------------------------- 1 | # option set to run all reasonable large lstmp shapes 2 | 3 | --skip-nonlinear=true 4 | --batch=shapes_lstmp_large 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_lstmp_small: -------------------------------------------------------------------------------- 1 | # option set to run all reasonable small lstmp shapes 2 | 3 | --skip-nonlinear=false 4 | --batch=shapes_lstmp_small 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_perf_inference_lb: -------------------------------------------------------------------------------- 1 | # inference_lb -- inference with large batch size 2 | 3 | --alg=VANILLA_LSTM 4 | --activation=UNDEF 5 | 6 | --mb=640 7 | --batch=option_set_gnmt_decoder 8 | 9 | --mb=64 10 | --batch=option_set_gnmt_encoder 11 | 12 | --mb=64 13 | --alg=VANILLA_RNN 14 | --direction=left2right 15 | --activation=TANH 16 | --batch=shapes_deepspeech_2 17 | 18 | --mb=64 19 | --alg=LBR_GRU 20 | --direction=left2right 21 | --activation=UNDEF 22 | --batch=shapes_deepspeech_2 23 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_perf_inference_sb: -------------------------------------------------------------------------------- 1 | # inference_sb -- inference with small batch size 2 | 3 | --alg=VANILLA_LSTM 4 | --activation=UNDEF 5 | 6 | --mb=10 7 | --batch=option_set_gnmt_decoder 8 | 9 | --mb=1 10 | --batch=option_set_gnmt_encoder 11 | 12 | --mb=1 13 | --alg=VANILLA_RNN 14 | --direction=left2right 15 | --activation=TANH 16 | 
--batch=shapes_deepspeech_2 17 | 18 | --mb=1 19 | --alg=LBR_GRU 20 | --direction=left2right 21 | --activation=UNDEF 22 | --batch=shapes_deepspeech_2 23 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_perf_training: -------------------------------------------------------------------------------- 1 | --mb=128 2 | 3 | --alg=VANILLA_LSTM 4 | --activation=UNDEF 5 | --batch=option_set_gnmt_decoder 6 | --batch=option_set_gnmt_encoder 7 | 8 | --alg=VANILLA_RNN 9 | --direction=left2right 10 | --activation=TANH 11 | --batch=shapes_deepspeech_2 12 | 13 | --alg=LBR_GRU 14 | --direction=left2right 15 | --activation=UNDEF 16 | --batch=shapes_deepspeech_2 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_rnnt: -------------------------------------------------------------------------------- 1 | # RNN-T LSTM shapes 2 | 3 | --alg=VANILLA_LSTM 4 | --activation=UNDEF 5 | --direction=left2right 6 | 7 | --batch=shapes_rnn_t 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/option_set_small: -------------------------------------------------------------------------------- 1 | # option set to run all reasonable small shapes 2 | 3 | --skip-nonlinear=false 4 | --tag=tnc:any:tnc,ntc:any:ntc 5 | --batch=shapes_small 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_cpu: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Forward 4 | 5 | --prop=FWD_I 6 | 7 | --cfg=f32 8 | --batch=option_set_perf_inference_lb 9 | --batch=option_set_perf_inference_sb 10 | --batch=option_set_perf_training 11 | 12 | --cfg=u8u8u8u8 13 | --trivial-strides=true 14 | --scaling=per_oc 15 | --batch=option_set_perf_inference_lb 16 | --batch=option_set_perf_inference_sb 17 | 
--batch=option_set_perf_training 18 | 19 | # Backward 20 | 21 | --prop=BWD_DW 22 | --cfg=f32 23 | --trivial-strides= 24 | --scaling= 25 | 26 | --batch=option_set_perf_inference_lb 27 | --batch=option_set_perf_inference_sb 28 | --batch=option_set_perf_training 29 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_inference_lb: -------------------------------------------------------------------------------- 1 | # inference_lb -- inference with large batch size 2 | --reset 3 | 4 | --mb=640 5 | --alg=VANILLA_LSTM 6 | --activation=TANH 7 | --batch=option_set_gnmt_decoder 8 | 9 | --mb=64 10 | --alg=VANILLA_LSTM 11 | --activation=TANH 12 | --batch=option_set_gnmt_encoder 13 | 14 | --mb=64 15 | --alg=VANILLA_RNN,LBR_GRU 16 | --direction=left2right 17 | --batch=shapes_deepspeech_2 -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_inference_sb: -------------------------------------------------------------------------------- 1 | # inference_sb -- inference with small batch size 2 | --reset 3 | 4 | --mb=10 5 | --alg=VANILLA_LSTM 6 | --activation=TANH 7 | --batch=option_set_gnmt_decoder 8 | 9 | --mb=1 10 | --alg=VANILLA_LSTM 11 | --activation=TANH 12 | --batch=option_set_gnmt_encoder 13 | 14 | --mb=1 15 | --alg=VANILLA_RNN,LBR_GRU 16 | --direction=left2right 17 | --batch=shapes_deepspeech_2 18 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_knx: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --prop=FWD_I,BWD_DW 4 | 5 | --cfg=f32 6 | --batch=option_set_perf_inference_lb 7 | --batch=option_set_perf_inference_sb 8 | --batch=option_set_perf_training 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_training: 
-------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --mb=128 4 | --alg=VANILLA_LSTM 5 | --activation=TANH 6 | --batch=option_set_gnmt_decoder 7 | --batch=option_set_gnmt_encoder 8 | 9 | --alg=VANILLA_RNN,LBR_GRU 10 | --direction=left2right 11 | --batch=shapes_deepspeech_2 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_xe: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # Forward 4 | 5 | --prop=FWD_I 6 | --cfg=f16,f32 7 | 8 | --batch=option_set_perf_inference_lb 9 | --batch=option_set_perf_inference_sb 10 | --batch=option_set_perf_training 11 | 12 | # Backward 13 | 14 | --prop=BWD_DW 15 | --cfg=f32 16 | 17 | --batch=option_set_perf_inference_lb 18 | --batch=option_set_perf_inference_sb 19 | --batch=option_set_perf_training 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_xe_hp: -------------------------------------------------------------------------------- 1 | --batch=perf_rnn_xe_lp 2 | 3 | --reset 4 | 5 | # Forward, bf16 6 | 7 | --prop=FWD_I 8 | --cfg=bf16f32 9 | 10 | --batch=option_set_perf_inference_lb 11 | --batch=option_set_perf_inference_sb 12 | --batch=option_set_perf_training 13 | 14 | # Backward 15 | 16 | --prop=BWD_DW 17 | --cfg=bf16f32 18 | 19 | --batch=option_set_perf_inference_lb 20 | --batch=option_set_perf_inference_sb 21 | --batch=option_set_perf_training 22 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/perf_rnn_xe_lp: -------------------------------------------------------------------------------- 1 | --batch=perf_rnn_xe 2 | 3 | --reset 4 | 5 | # Forward, int8 6 | --prop=FWD_I 7 | --cfg=u8u8u8u8 8 | --scaling=per_oc 9 | --trivial-strides=true 10 | 11 | --batch=option_set_perf_inference_lb 12 | --batch=option_set_perf_inference_sb 13 | 
--batch=option_set_perf_training 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_deepspeech_2: -------------------------------------------------------------------------------- 1 | l1t50sic1760n"deepspeech2:0" 2 | l1t100sic1760n"deepspeech2:1" 3 | l1t200sic1760n"deepspeech2:2" -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_inference: -------------------------------------------------------------------------------- 1 | l1t30mb1sic512n"GNMT_enc-inference" 2 | l7t30mb1sic1024n"GNMT_enc-inference" 3 | l8t1mb1sic2048slc1024dhc1024n"GNMT_dec-inference" 4 | l1t1mb1sic2048slc1024dhc1024n"GNMT_dec-inference" 5 | l1t1mb640sic2048slc1024dhc1024n"GNMT_dec-inference" 6 | l1t50mb1sic1760n"deepspeech2-inference" 7 | l1t100mb1sic1760n"deepspeech2-inference" 8 | l1t200mb1sic1760n"deepspeech2-inference" 9 | l1t50mb1sic500n"pytorch_testcase-inference" 10 | l1t629mb1sic128n"paddlepaddle_testcase-inference" 11 | l1t10mb1sic128slc512dhc128n"exp-0" 12 | l10t1mb1sic512slc128dhc128n"exp-1" -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_large: -------------------------------------------------------------------------------- 1 | # large shapes 2 | 3 | l1t1mb63_sic64_n"uniform" 4 | l1t1mb34_sic65_n"uniform:tail" 5 | l1t1mb19_sic64_slc128_n"non-uniform:slc_neq_sic" 6 | l1t1mb12_sic65_dhc130_n"non-uniform:slc_neq_dhc_tail" 7 | l1t1mb6_sic64_slc128_dhc256_n"non-uniform:slc_neq_sic_neq_dhc" 8 | l1t1mb4_sic65_slc130_dhc260_n"non-uniform:slc_neq_sic_neq_dhc_tail" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_large_gru: -------------------------------------------------------------------------------- 1 | l1t1mb65_sic64_n"uniform" 2 | l1t1mb17_sic128_n"uniform" 3 | l1t1mb100_sic65_n"uniform:tail" 4 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_lstmp_large: -------------------------------------------------------------------------------- 1 | # large shapes for lstm w/ projection when dhc != dic 2 | 3 | l1t1mb31_sic64_dic128_n"non-uniform:dhc_neq_dic" 4 | l1t1mb32_sic65_dic130_n"non-uniform:dhc_neq_dic_tail" 5 | l1t1mb12_sic64_slc128_dic128_n"non-uniform:slc_neq_sic_and_dhc_neq_dic" 6 | l1t1mb10_sic65_dhc130_dic260_n"non-uniform:slc_neq_dhc_neq_dic_tail" 7 | l1t1mb3_sic64_slc128_dhc256_dic320_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic" 8 | l1t1mb4_sic65_slc130_dhc260_dic325_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic_tail" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_lstmp_small: -------------------------------------------------------------------------------- 1 | # small shapes for lstm w/ projection when dhc != dic 2 | 3 | l1t1mb9_sic16_dic32_n"non-uniform:dhc_neq_dic" 4 | l1t1mb7_sic17_dic34_n"non-uniform:dhc_neq_dic_tail" 5 | l1t1mb3_sic16_slc32_dic32_n"non-uniform:slc_neq_sic_and_dhc_neq_dic" 6 | l1t1mb4_sic17_dhc34_dic68_n"non-uniform:slc_neq_dhc_neq_dic_tail" 7 | l1t1mb2_sic16_slc32_dhc64_dic80_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic" 8 | l1t1mb3_sic17_slc34_dhc68_dic85_n"non-uniform:slc_neq_sic_neq_dhc_neq_dic_tail" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_rnn_t: -------------------------------------------------------------------------------- 1 | # RNN-T shapes with fixed timestamp = 256 2 | l1t256mb896sic1024slc240dhc1024dic1024n"RNN-T:Encoder_1" 3 | l1t256mb896sic1024slc1024dhc1024dic1024n"RNN-T:Encoder_2" 4 | l1t128mb896sic1024slc2048dhc1024dic1024n"RNN-T:Encoder_3" 5 | l1t128mb896sic1024slc1024dhc1024dic1024n"RNN-T:Encoder_4*2" 6 | l1t1mb896sic320slc320dhc320dic320n"RNN-T:Prediction*768" 7 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_small: -------------------------------------------------------------------------------- 1 | # small shapes 2 | 3 | l8t3mb12_sic16_n"uniform" 4 | l4t3mb20_sic36_n"uniform:unroll_tail" 5 | l1t2mb6_sic16_slc32_n"non-uniform:slc_neq_sic" 6 | l1t1mb7_sic17_dhc34_n"non-uniform:slc_neq_dhc_tail" 7 | l1t1mb3_sic16_slc32_dhc64_n"non-uniform:slc_neq_sic_neq_dhc" 8 | l1t1mb4_sic17_slc34_dhc68_n"non-uniform:slc_neq_sic_neq_dhc_tail" 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_small_gru: -------------------------------------------------------------------------------- 1 | l14t10mb12_sic16_n"uniform" 2 | l10t14mb10_sic17_n"uniform:tail" 3 | l5t7mb7_sic32_n"uniform:unroll" 4 | l1t6mb8_sic36_n"uniform:unroll_tail" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/shapes_training: -------------------------------------------------------------------------------- 1 | l1t1mb128sic512n"GNMT_enc-training" 2 | l2t2mb128sic1024n"GNMT_enc-training" 3 | l8t1mb128sic2048slc1024dhc1024n"GNMT_dec-training" 4 | l1t1mb128sic2048slc1024dhc1024n"GNMT_dec-training" 5 | l1t50mb32sic1760n"deepspeech2-training" 6 | l1t100mb32sic1760n"deepspeech2-training" 7 | l1t200mb32sic1760n"deepspeech2-training" 8 | l1t50mb64sic500n"pytorch_testcase-training" 9 | l1t629mb128sic128n"paddlepaddle_testcase-training" 10 | l1t952mb128sic128n"paddlepaddle_testcase-training" 11 | l1t10mb32sic128slc512dhc128n"exp-0" 12 | l10t1mb32sic512slc128dhc128n"exp-1" -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_augru_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=test_augru_ci 4 | 5 | --batch=test_augru_bfloat16 6 | 7 | --batch=test_augru_bf32_bfloat16 8 
| 9 | --batch=test_augru_float16 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_augru_bf32_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_augru_bf32 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_augru_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_augru_bfloat16 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_augru_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --alg=LBR_AUGRU,VANILLA_AUGRU 4 | --activation=UNDEF 5 | --direction=left2right 6 | --skip-nonlinear=false 7 | 8 | --trivial-strides=true,false 9 | --prop=FWD_I,BWD_DW 10 | --cfg=f32,bf16f32,bf16,f16 11 | --batch=shapes_small 12 | 13 | --trivial-strides=true,false 14 | --prop=FWD_I 15 | --cfg=f32 16 | --attr-fpmath=bf16 17 | --batch=shapes_small 18 | 19 | 20 | # flags 21 | --trivial-strides=true,false 22 | --prop=BWD_DW 23 | --cfg=f32 24 | --flags=O 25 | --batch=shapes_small 26 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_augru_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_augru_float16 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_gru_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_gru_f32 4 | 5 | --batch=test_gru_int8 6 | 7 | --batch=test_gru_bfloat16 8 | 9 | --batch=test_gru_bf32_bfloat16 10 | 11 | --batch=test_gru_float16 12 | 13 | --batch=harness_gru_regression 14 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_gru_bf32_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_gru_bf32 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_gru_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_gru_bfloat16 -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_gru_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_gru_float16 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_gru_int8: -------------------------------------------------------------------------------- 1 | # int8 2 | --reset 3 | 4 | --batch=harness_gru_int8 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_lstm_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=test_lstm_f32 4 | 5 | --batch=test_lstm_int8 6 | 7 | --batch=test_lstm_bfloat16 8 | 9 | --batch=test_lstm_bf32_bfloat16 10 | 11 | --batch=test_lstm_float16 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_lstm_bf32_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_lstm_bf32 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_lstm_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_lstm_bfloat16 4 | 
-------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_lstm_bfloat16_ymm: -------------------------------------------------------------------------------- 1 | # global benchdnn knob, will not be reset again 2 | --cpu-isa-hints=prefer_ymm 3 | 4 | --reset 5 | --batch=test_lstm_bfloat16 6 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_lstm_f32: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_lstm_f32 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_lstm_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_lstm_float16 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_lstm_int8: -------------------------------------------------------------------------------- 1 | # int8 2 | --reset 3 | 4 | --batch=harness_lstm_int8 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_rnn_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_rnn_f32 4 | 5 | --batch=test_rnn_bfloat16 6 | 7 | --batch=test_rnn_bf32_bfloat16 8 | 9 | --batch=test_rnn_float16 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_rnn_bf32_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_rnn_bf32 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_rnn_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | 
--batch=harness_rnn_bfloat16 -------------------------------------------------------------------------------- /tests/benchdnn/inputs/rnn/test_rnn_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --batch=harness_rnn_float16 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/self/test_self_ci: -------------------------------------------------------------------------------- 1 | # Use empty input file to align with other drivers 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/self/test_self_f32: -------------------------------------------------------------------------------- 1 | # Use empty input file to align with other drivers 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/self/test_self_smoke: -------------------------------------------------------------------------------- 1 | # Use empty input file to align with other drivers 2 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/option_set_all: -------------------------------------------------------------------------------- 1 | --group=2,4 2 | 3 | --axis=1 1x12x56x56 1x24x56x56 1x36x56x56 1x68x56x56 4 | --axis=1,2 1x68x56x56 1x136x56x56 1x272x56x56 5 | --axis=1,3,4 1x272x2x56x56 6 | 7 | --group=3 8 | 9 | --axis=1 1x3x224x224 1x24x56x56 1x48x28x28 1x96x14x14 1x192x7x7 10 | --axis=1,2 1x36x225x225 1x72x57x57 1x144x27x27 11 | --axis=1,3,4 1x282x2x57x57 12 | 13 | --group=8 14 | 15 | --axis=1,2 1x24x56x56 1x32x56x56 1x72x56x56 16 | --axis=1,3,4 1x272x2x56x56 17 | 18 | --group=16 19 | 20 | --axis=1,2 1x16x64x64 1x48x64x64 1x128x64x64 21 | --axis=1,3,4 1x272x2x64x64 -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/option_set_min: 
-------------------------------------------------------------------------------- 1 | --axis=1,2 2 | 3 | --group=2,4 1x12x56x56 1x24x56x56 1x272x56x56 4 | 5 | --group=3 1x36x225x225 1x72x57x57 1x144x27x27 6 | 7 | --group=8 1x24x56x56 1x32x56x56 1x72x56x56 8 | 9 | --group=16 1x16x64x64 1x48x64x64 1x128x64x64 -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/perf_shuffle_cpu: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dir=FWD_D 4 | --dt=f32,bf16 5 | --tag=abx,axb,aBx4b,aBx8b,aBx16b 6 | --batch=option_set_perf -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/test_shuffle_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dir=FWD_D 4 | --dt=f32,s32,s8,u8 5 | --tag=abx,axb,aBx4b,aBx8b,aBx16b 6 | --batch=option_set_all 7 | 8 | --dir=BWD_D 9 | --dt=f32 10 | --batch=option_set_min 11 | 12 | # bf16 13 | --batch=test_shuffle_bfloat16 14 | 15 | # f16 16 | --batch=test_shuffle_float16 17 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/test_shuffle_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dt=bf16 4 | 5 | --dir=FWD_D 6 | --tag=abx,axb,aBx16b 7 | --batch=option_set_all 8 | 9 | --dir=BWD_D 10 | --batch=option_set_min 11 | 12 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/test_shuffle_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --group=4 4 | 5 | --dir=FWD_D,BWD_D 6 | --dt=f32,bf16,f16,s32,s8,u8 7 | --tag=abx,axb 8 | --axis=1,2 9 | 2x12x32x17 3x16x36x9 10 | -------------------------------------------------------------------------------- 
/tests/benchdnn/inputs/shuffle/test_shuffle_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dt=f16 4 | 5 | --dir=FWD_D 6 | --tag=abx,axb 7 | --batch=option_set_all 8 | 9 | --dir=BWD_D 10 | --batch=option_set_min 11 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/test_shuffle_gpu: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --group=4 4 | --dir=FWD_D,BWD_D 5 | 6 | --dt=f32,f16,bf16,s8,u8 7 | 8 | --tag=abx,aBx16b 9 | --axis=1,2 1x68x56x56 1x272x56x56 10 | --axis=1,3,4 1x272x2x56x56 11 | 12 | --tag=ABx16a16b 13 | --axis=1,2 32x64x56x56 14 | --axis=1,3,4 32x64x2x56x56 15 | 16 | # blocked with tail 17 | --dt=f32 18 | --tag=aBx16b --axis=1 1x12x56x56 1x36x56x56 19 | 20 | # double block 21 | --allow-enum-tags-only=0 22 | --tag=BA8b4a2b 23 | --group=6 24 | --axis=0,1 25 | 48x48 26 | 27 | # Test CI in Nightly 28 | --reset 29 | --batch=test_shuffle_ci 30 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/shuffle/test_shuffle_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --dir=FWD_D,BWD_D 4 | --dt=f32,bf16,f16,s8,u8 5 | --tag=axb 6 | --group=4 7 | --axis=1 8 | 2x12x32x17 3x16x36x9 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/harness_softmax_regression: -------------------------------------------------------------------------------- 1 | # "vectorized" and "small" reusable kernels (stride 1, aligned) with scale 2 | --reset 3 | --dir=FWD_I 4 | --sdt=u8 5 | --ddt=s8 6 | --attr-scales=dst:common:0.125,src:common:2 7 | --axis=3 1x4x192x256 1x4x16x32 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/set_0d: 
-------------------------------------------------------------------------------- 1 | # 2d dataset 2 | 3 | --batch=shapes_0d 4 | --batch=shapes_nlp 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/shapes_0d: -------------------------------------------------------------------------------- 1 | # 2d dataset 2 | 3 | 96x1000 4 | 256x10 5 | 32x100 6 | 2x113 7 | 128x365 8 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/shapes_2d: -------------------------------------------------------------------------------- 1 | # 4d dataset 2 | 3 | # spatial is 1 4 | 96x1000x1x1 5 | 256x10x1x1 6 | 7 | # regular 4d 8 | 2x19x128x256 9 | 2x16x128x128 10 | 1x8x1024x16 11 | 1x2x64x64 12 | 448x16x28x28 13 | 64x1011x1x1 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/shapes_3d: -------------------------------------------------------------------------------- 1 | # 5d dataset 2 | 3 | # spatial is 1 4 | 96x1024x1x1x1 5 | 256x10x1x1x1 6 | 7 | # regular 5d 8 | 3x17x9x37x19 9 | 2x16x128x2x4 10 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/shapes_ci: -------------------------------------------------------------------------------- 1 | # basic shapes 2 | 3 | 16x16_n"softmax_ci_0d:0" 4 | 255x10_n"softmax_ci_0d:1" 5 | 2x19x17x13_n"softmax_ci_2d:0" # Used in smoke validation, don't change the name 6 | 1x16x2x12_n"softmax_ci_2d:1" # Used in smoke validation, don't change the name 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/shapes_large: -------------------------------------------------------------------------------- 1 | 8192x64 2 | 16384x64 3 | -------------------------------------------------------------------------------- 
/tests/benchdnn/inputs/softmax/shapes_large_axis: -------------------------------------------------------------------------------- 1 | 8x3x8320 2 | 4x3x5600 3 | 1x1x4097 4 | 2x3x9999 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/shapes_nlp: -------------------------------------------------------------------------------- 1 | # nlp 2 | 3 | 240x20 4 | 2560x20 5 | 2688x21 6 | 3712x29 7 | 4480x35 8 | 4736x37 9 | 4864x38 10 | 4992x39 11 | 5120x40 12 | 5248x41 13 | 5376x42 14 | 5504x43 15 | 5632x44 16 | 6144x48 17 | 6272x49 18 | 6400x50 19 | 6784x53 20 | 7296x57 21 | 7424x58 22 | 7808x61 23 | 8192x64 24 | 8448x66 25 | 8576x67 26 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/test_softmax_acl: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | # only FWD_I is supported with ACL 4 | --dir=FWD_I 5 | 6 | # do not run ref 7 | --skip-impl=ref 8 | 9 | --alg=SOFTMAX,LOGSOFTMAX 10 | 11 | --sdt=f32 12 | --ddt=f32 13 | --dtag=any 14 | 15 | --axis=0,1 16 | --batch=shapes_ci 17 | --batch=shapes_nlp 18 | 19 | --stag=abx 20 | --axis=0,1 21 | --batch=set_0d 22 | --axis=1,3 23 | --batch=shapes_2d 24 | --axis=3,4 25 | --batch=shapes_3d 26 | 27 | --stag=axb 28 | --axis=0,1 29 | --batch=shapes_2d 30 | --batch=shapes_3d 31 | 32 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/softmax/test_softmax_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --match=.*softmax_ci_2d.* # Use 2d problems only from shapes_ci 4 | --inplace=false 5 | --stag=axb 6 | --dtag=any 7 | --alg=SOFTMAX,LOGSOFTMAX 8 | --axis=1 9 | 10 | --dir=FWD_D,BWD_D 11 | --sdt=f32,bf16,f16 12 | --ddt=f32,bf16,f16 13 | --batch=shapes_ci 14 | 15 | --dir=FWD_I 16 | --sdt=s8,u8 17 | --ddt=s8,u8 18 | 
--attr-scales=,src:common:64+dst:common:0.5 19 | --batch=shapes_ci 20 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/sum/option_set_fwks_ext_gpu: -------------------------------------------------------------------------------- 1 | --reset --allow-enum-tags-only=0 --sdt=f32:f32 --ddt=f32 --stag=abcd:abcd --dtag=abcd 1113x8x30x64_n"1bca9bd0e78574fe0ab41bbc68b795f9" 2 | --reset --allow-enum-tags-only=0 --sdt=f32:f32 --ddt=f32 --stag=abcd:abcd --dtag=abcd 64x8x50x64_n"8def238b6b453fb393856464ba8a0d4d" 3 | --reset --allow-enum-tags-only=0 --sdt=f32:f32:f32 --ddt=f32 --stag=abc:abc:abc --dtag=abc 64x50x512_n"7d5660f01c9be41fba9d6135abf33d2d" 4 | --reset --allow-enum-tags-only=0 --sdt=f32:f32:f32:f32 --ddt=f32 --stag=ab:ab:ab:ab --dtag=ab 64x512_n"0bab0aacc5f8028faa61019716a8ef50" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/sum/test_sum_all: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | --scales=0.25,1,4 5 | --ddt=f32,s32,s8,u8 6 | --sdt=f32:f32,f32:s32,f32:s8,f32:u8,s32:s8,s32:u8,s8:u8 7 | --dtag=undef,abx,axb,aBx8b,aBx16b 8 | --stag=abx:abx 3x3x16x4 9 | --stag=axb:axb 4x4x2x16 5x5x1x15 10 | --stag=aBx8b:aBx8b 2x8x3x10 1x9x4x7 11 | --stag=aBx16b:aBx16b 1x16x5x11 2x15x6x3 12 | 13 | --ddt=f32,s32 14 | --sdt=f32:s32:s8 15 | --stag=aBx8b:abx:axb,axb:axb:axb 16 | --scales=1.25:3:0.5 16x2x6x4x3 17 | 18 | # bf16 19 | --batch=test_sum_bfloat16 20 | 21 | # f16 22 | --batch=test_sum_float16 23 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/sum/test_sum_bfloat16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --scales=0.25,1,4 4 | --ddt=f32,bf16 5 | --sdt=bf16:bf16 6 | --dtag=undef,abx,axb,aBx16b 7 | --stag=abx:abx 3x3x16x4 8 | --stag=axb:axb 4x4x2x16 5x5x1x15 9 | 
--stag=aBx16b:aBx16b 1x16x5x11 2x15x6x3 10 | 11 | --sdt=bf16:bf16:bf16 12 | --stag=aBx16b:abx:axb,axb:axb:axb 13 | --scales=2:0.25:0.5 16x2x6x4x3 14 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/sum/test_sum_ci: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=true,false 4 | --ddt=f32,bf16,f16,s32,s8,u8 5 | --sdt=f32:f32,f32:bf16,f32:s8,bf16:bf16,f16:f16,s32:s8,u8:u8 6 | --dtag=undef,any,abx,axb 7 | --stag=abx:abx,axb:axb 8 | --scales=0.25:2 9 | 3x17x5x7 4x16x8x10 10 | 11 | --ddt=f32,s8 12 | --sdt=f32:u8:s8 13 | --stag=abx:abx:abx,axb:axb:axb 14 | --scales=0.25:2:0.5 15 | 2x17x5x7x3 4x16x8x10x2 16 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/sum/test_sum_float16: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --scales=0.25,1,4 4 | --ddt=f32,f16 5 | --sdt=f16:f16 6 | --dtag=undef,abx,axb 7 | --stag=abx:abx 3x3x16x4 8 | --stag=axb:axb 4x4x2x16 5x5x1x15 9 | 10 | --sdt=f16:f16:f16 11 | --stag=axb:axb:axb 12 | --scales=2:0.25:0.5 16x2x6x4x3 13 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/sum/test_sum_smoke: -------------------------------------------------------------------------------- 1 | --reset 2 | 3 | --inplace=false 4 | --ddt=f32,bf16,f16,s8,u8 5 | --sdt=f32:f32,bf16:bf16,f16:f16,s8:s8,u8:u8 6 | --dtag=undef 7 | --stag=axb 8 | --scales=0.25 9 | 3x17x5x7 4x16x8x10 10 | 11 | --ddt=f32 12 | --sdt=f32:f32:f32 13 | --stag=axb:axb:axb 14 | 2x17x5x7x3 4x16x8x10x2 15 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/option_set_fwks_ext_gpu: -------------------------------------------------------------------------------- 1 | # 2 | --reset --allow-enum-tags-only=0 --dt=f16 --tag=aBcd16b 
1x4x1080x1920n"text-image-super-resolution-0001_onnx.inf.fp16.ov.b1*1" 3 | # 4 | --reset --allow-enum-tags-only=0 --dt=f16 --tag=ABcd32a16b 32x4x1080x1920n"text-image-super-resolution-0001_onnx.inf.fp16.ov.b32*1" 5 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/option_set_fwks_key_gpu: -------------------------------------------------------------------------------- 1 | # 2 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde32a16b 32x6x13x14x9n"3dgan.tr.fp32.tf.mb256*8" 3 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde32a16b 256x8x25x25x25n"3dgan.tr.fp32.tf.mb256*8" 4 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde16a16b 256x8x25x25x25n"3dgan.tr.fp32.tf.mb256*8" 5 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde16a16b 256x8x23x23x23n"3dgan.tr.fp32.tf.mb256*8" 6 | --reset --allow-enum-tags-only=0 --dt=f32 --tag=ABcde32a16b 256x6x13x14x9n"3dgan.tr.fp32.tf.mb256*4" 7 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/set_dim1_block_3d: -------------------------------------------------------------------------------- 1 | --tag=Abc16a,Abc4a 2 | --batch=shapes_dim1_block_3d -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/set_dim1dim2_block_2d: -------------------------------------------------------------------------------- 1 | --tag=AB48a16b,AB48a32b,BA4b8a8b4a 2 | 3 | --batch=shapes_dim1dim2_block_2d 4 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/set_dim1dim2_block_3d: -------------------------------------------------------------------------------- 1 | # The following tags are omitted as they should be tested in the 4d dim2 dim3 2 | # blocked cases 3 | # --tag=ABc16a16b,ABc4a4b,ABc16b16a,ABc4b16a4b,ABc2b8a4b, 4 | # ABc16b16a4b,ABc16b16a2b,ABc4b4a,ABc8a16b2a,ABc8a8b, 5 | # 
ABc8a4b,ABc8b16a2b,ABc8b8a 6 | 7 | --tag=ABc32a32b,ABc16b32a,ABc16b64a,ABc4b32a4b,ABc4b64a4b,ABc8b32a2b,ABc8b64a2b,ABc4a8b8a4b 8 | --batch=shapes_dim1dim2_block_3d 9 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/set_dim2_block_3d: -------------------------------------------------------------------------------- 1 | --tag=aBc16b,aBc32b,aBc4b,aBc8b 2 | --batch=shapes_dim2_block_3d -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/set_dim2dim3_block_4d: -------------------------------------------------------------------------------- 1 | --tag=aBCd16b16c,aBCd16c16b,aBCd2c4b2c,aBCd4b8c2b,aBCd4c16b4c,aBCd2c8b4c,aBCd16c16b4c,aBCd16c16b2c,aBCd4c4b,aBCd4b4c,aBCd4c8b2c,aBCd8b16c2b,aBCd8b8c,aBCd8b4c,aBCd8c16b2c,aBCd8c8b,aBCd2b4c2b 2 | --batch=shapes_dim2dim3_block_4d 3 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/shapes_dim1_block_3d: -------------------------------------------------------------------------------- 1 | 1x1x1 2 | 1x1x33 3 | 1x33x1 4 | 1x33x33 5 | 3x1x1 6 | 3x1x33 7 | 3x33x1 8 | 3x33x33 9 | 7x1x1 10 | 7x1x33 11 | 7x33x1 12 | 7x33x33 13 | 9x1x1 14 | 9x1x33 15 | 9x33x1 16 | 9x33x33 17 | 19x1x1 18 | 19x1x33 19 | 19x33x1 20 | 19x33x33 21 | 43x1x1 22 | 43x1x33 23 | 43x33x1 24 | 43x33x33 25 | 71x1x1 26 | 71x1x33 27 | 71x33x1 28 | 71x33x33 29 | 128x1x1 30 | 128x1x33 31 | 128x33x1 32 | 128x33x33 33 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/shapes_dim2_block_3d: -------------------------------------------------------------------------------- 1 | 1x1x1 2 | 1x1x33 3 | 1x3x1 4 | 1x3x33 5 | 1x7x1 6 | 1x7x33 7 | 1x9x1 8 | 1x9x33 9 | 1x19x1 10 | 1x19x33 11 | 1x43x1 12 | 1x43x33 13 | 1x71x1 14 | 1x71x33 15 | 1x128x1 16 | 1x128x33 17 | 33x1x1 18 | 33x1x33 19 | 33x3x1 20 | 33x3x33 21 | 33x7x1 22 | 33x7x33 23 | 
33x9x1 24 | 33x9x33 25 | 33x19x1 26 | 33x19x33 27 | 33x43x1 28 | 33x43x33 29 | 33x71x1 30 | 33x71x33 31 | 33x128x1 32 | 33x128x33 33 | -------------------------------------------------------------------------------- /tests/benchdnn/inputs/zeropad/test_zeropad_ci: -------------------------------------------------------------------------------- 1 | --dt=s8,f16,f32,f64 2 | --batch=set_dim1_block_3d 3 | --batch=set_dim2_block_3d 4 | --batch=set_dim1dim2_block_3d 5 | --batch=set_dim2dim3_block_4d 6 | -------------------------------------------------------------------------------- /third_party/.clang-tidy: -------------------------------------------------------------------------------- 1 | Checks: '-*,misc-definitions-in-headers' 2 | CheckOptions: 3 | - { key: HeaderFileExtensions, value: "x" } 4 | -------------------------------------------------------------------------------- /third_party/ittnotify/README.md: -------------------------------------------------------------------------------- 1 | This code is from [Intel(R) Instrumentation and Tracing Technology (ITT) and 2 | Just-In-Time (JIT) API](https://github.com/intel/ittapi) 3 | 4 | tag: 3.22.5 5 | -------------------------------------------------------------------------------- /third_party/spdlog/README.md: -------------------------------------------------------------------------------- 1 | This code is from [spdlog](https://github.com/gabime/spdlog). 
2 | 3 | tag: 1.15.1 4 | -------------------------------------------------------------------------------- /third_party/spdlog/details/windows_include.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifndef NOMINMAX 4 | #define NOMINMAX // prevent windows redefining min/max 5 | #endif 6 | 7 | #ifndef WIN32_LEAN_AND_MEAN 8 | #define WIN32_LEAN_AND_MEAN 9 | #endif 10 | 11 | #include <windows.h> 12 | -------------------------------------------------------------------------------- /third_party/spdlog/fmt/bundled/core.h: -------------------------------------------------------------------------------- 1 | // This file is only provided for compatibility and may be removed in future 2 | // versions. Use fmt/base.h if you don't need fmt::format and fmt/format.h 3 | // otherwise. 4 | 5 | #include "format.h" 6 | -------------------------------------------------------------------------------- /third_party/spdlog/formatter.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #include <spdlog/details/log_msg.h> 7 | #include <spdlog/fmt/fmt.h> 8 | 9 | namespace spdlog { 10 | 11 | class formatter { 12 | public: 13 | virtual ~formatter() = default; 14 | virtual void format(const details::log_msg &msg, memory_buf_t &dest) = 0; 15 | virtual std::unique_ptr<formatter> clone() const = 0; 16 | }; 17 | } // namespace spdlog 18 | -------------------------------------------------------------------------------- /third_party/spdlog/version.h: -------------------------------------------------------------------------------- 1 | // Copyright(c) 2015-present, Gabi Melman & spdlog contributors. 
2 | // Distributed under the MIT License (http://opensource.org/licenses/MIT) 3 | 4 | #pragma once 5 | 6 | #define SPDLOG_VER_MAJOR 1 7 | #define SPDLOG_VER_MINOR 15 8 | #define SPDLOG_VER_PATCH 1 9 | 10 | #define SPDLOG_TO_VERSION(major, minor, patch) (major * 10000 + minor * 100 + patch) 11 | #define SPDLOG_VERSION SPDLOG_TO_VERSION(SPDLOG_VER_MAJOR, SPDLOG_VER_MINOR, SPDLOG_VER_PATCH) 12 | -------------------------------------------------------------------------------- /third_party/xbyak_aarch64/xbyak_aarch64/xbyak_aarch64_version.h: -------------------------------------------------------------------------------- 1 | static const int majorVersion = 1; 2 | static const int minorVersion = 1; 3 | static const int patchVersion = 2; 4 | static int getVersion() { return (majorVersion << 16) + (minorVersion << 8) + patchVersion; } 5 | static const char *getVersionString() { return "1.1.2"; } 6 | --------------------------------------------------------------------------------