├── lib └── libiomp5.so ├── .gitmodules ├── .gitignore ├── src ├── el_init.hpp ├── common │ ├── el_log.hpp │ ├── el_log.cpp │ ├── el_isa.hpp │ └── el_def.hpp ├── elx_stream.hpp ├── elx_int8_conv_wino_bind.hpp ├── elx_conv_wino_bind.hpp ├── el_init.cpp ├── kernel │ ├── elk_conv_wino.hpp │ ├── elk_def.hpp │ ├── elk_u8s8_depthwise_conv_gen.sh │ ├── elk_conv_wino_3x3_3x3_weights.hxx │ ├── elk_conv_wino_4x4_3x3_weights.hxx │ ├── elk_u8s8_depthwise_conv_binder.hxx │ ├── elk_gemm_gen.sh │ ├── elk_u8s8_gemm_gen.sh │ ├── elk_vmg_conv_gen.sh │ └── elk_vmg_conv_binder.hxx ├── elx_conv_wino_gemm.hpp ├── elx_int8_conv_wino_gemm.hpp ├── elx_conv_direct_1x1_bind.hpp ├── elx_int8_conv_direct_depthwise_bind.hpp ├── elx_int8_conv_direct_1x1_bind.hpp ├── elx_conv_direct_vmg.hpp ├── elx_deconv_direct_bind.hpp ├── elx_conv_direct_vmg_bind.hpp ├── elx_stream.cpp ├── elx_deconv_direct.hpp ├── elx_int8_conv_direct_depthwise.hpp ├── elx_int8_conv_direct_depthwise_xopt.hpp ├── elx_conv_direct.hpp └── elx_int8_conv_direct_1x1.hpp ├── tests ├── CMakeLists.txt ├── elt_utils.hpp ├── elt_gflag.hpp └── elt_gflag.cpp ├── scripts ├── best_configs │ ├── resnet-n64-8180-1s-wino-f3_3.sh │ ├── resnet-n64-8180-2s-wino-f5_3.sh │ ├── resnet-n64-8180-1s-wino-f2_3.sh │ ├── resnet-n64-8180-2s-wino-f4_3.sh │ ├── resnet-n64-8180-2s-wino-f3_3.sh │ ├── resnet-n1-8180-1s-wino-f5_3.sh │ ├── resnet-n1-8180-2s-wino-f4_3.sh │ ├── resnet-n1-8180-2s-wino-f5_3.sh │ ├── resnet-n1-8180-2s-wino-f3_3.sh │ ├── resnet-n1-8180-1s.sh │ ├── resnet-n1-8180-1s-wino-f3_3.sh │ ├── resnet-n64-8180-2s-wino-f2_3.sh │ ├── resnet-n64-8180-1s.sh │ ├── resnet-n1-8180-2s-wino-f2_3.sh │ ├── resnet-n64-8180-1s-wino-f5_3.sh │ ├── resnet-n12-8180-1s-wino-int8.sh │ ├── resnet-n64-8180-1s-wino-int8.sh │ ├── resnet-n1-8180-1s-wino-f4_3.sh │ ├── resnet-n64-8180-1s-wino-f4_3.sh │ ├── resnet-n1-8180-1s-wino-f2_3.sh │ ├── vgg-n64-8180-1s-wino-f3_3.sh │ ├── resnet-n1-8180-1s-direct-nhwc.sh │ ├── vgg-n1-8180-1s-wino-f5_3.sh │ ├── 
vgg-n1-8180-2s-wino-f4_3.sh │ ├── vgg-n1-8180-2s-wino-f5_3.sh │ ├── vgg-n64-8180-2s-wino-f3_3.sh │ ├── vgg-n64-8180-2s-wino-f4_3.sh │ ├── resnet-n1-8180-1s-int8-rd.sh │ ├── common.sh │ ├── vgg-n64-8180-2s-wino-f5_3.sh │ ├── vgg-n64-8180-1s-wino-f2_3.sh │ ├── resnet-n1-8180-1s-direct-1x1-int8.sh │ ├── resnet-n64-8180-1s-direct-1x1-int8.sh │ ├── vgg-n1-8180-2s-wino-f2_3.sh │ ├── googlenetv3-n64-8180-2s-wino-f5_3.sh │ ├── googlenetv3-n64-8180-1s-wino-f5_3.sh │ ├── googlenetv3-n64-8180-1s-wino-f2_3.sh │ ├── googlenetv3-n64-8180-1s-wino-f3_3.sh │ ├── googlenetv3-n64-8180-2s-wino-f3_3.sh │ ├── resnet-n64-8180-1s-direct.sh │ ├── vgg-n64-8180-1s-wino-f5_3.sh │ ├── googlenetv3-n64-8180-1s-wino-f4_3.sh │ ├── googlenetv3-n64-8180-2s-wino-f4_3.sh │ ├── vgg-n1-8180-1s-direct-nhwc.sh │ ├── googlenetv3-n1-8180-1s-wino-f2_3.sh │ ├── googlenetv3-n1-8180-1s-wino-f3_3.sh │ ├── googlenetv3-n1-8180-2s-wino-f3_3.sh │ ├── vgg-n64-8180-1s-int8.sh │ ├── googlenetv3-n1-8180-1s-wino-f5_3.sh │ ├── googlenetv3-n1-8180-2s-wino-f5_3.sh │ ├── googlenetv3-n1-8180-1s-wino-f4_3.sh │ ├── resnet-n1-8180-1s-direct.sh │ ├── googlenetv3-n1-8180-2s-wino-f4_3.sh │ ├── vgg-n64-8180-2s-wino-f2_3.sh │ ├── resnet-n1-8180-1s-direct-int8.sh │ ├── vgg-n1-8180-2s-wino-f3_3.sh │ ├── vgg-n64-8180-1s-direct.sh │ ├── vgg-n1-8180-1s-direct.sh │ ├── vgg-n1-8180-1s-wino-f2_3.sh │ ├── resnet-n64-8180-1s-direct-int8.sh │ ├── conv-1x1-int8.sh │ ├── vgg-n1-8180-1s-wino-f3_3.sh │ ├── vgg-n64-8180-1s.sh │ ├── vgg-n1-8180-1s.sh │ ├── vgg-n1-8180-1s-wino-int8.sh │ ├── vgg-n1-8180-1s-int8-calibration.sh │ ├── resnet-n1-8180-1s-wino-int8.sh │ ├── group-conv.sh │ ├── vgg-n1-8180-1s-wino-f4_3.sh │ ├── vgg-n64-8180-1s-wino-f4_3.sh │ └── transformer.sh ├── regression-fp32.sh ├── test_conv_execute_mode.sh └── conv-non-16x.sh ├── NOTATION ├── TODO ├── docs └── fusion.txt ├── README.md ├── include └── euler_reorder.hpp └── cmake └── flags.cmake /lib/libiomp5.so: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/intel/Deep-learning-math-kernel-research/HEAD/lib/libiomp5.so -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "tests/gflags"] 2 | path = tests/gflags 3 | url = https://github.com/gflags/gflags.git 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #tags files 2 | GPATH 3 | GTAGS 4 | GRTAGS 5 | tags 6 | cscope.* 7 | .ycm_extra_conf.py 8 | 9 | #build path 10 | build/* 11 | 12 | .idea 13 | cmake-build-debug 14 | 15 | -------------------------------------------------------------------------------- /src/el_init.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "el_utils.hpp" 4 | 5 | #if __ICC_COMPILER 6 | #include "xmmintrin.h" 7 | #include "pmmintrin.h" 8 | #endif 9 | 10 | namespace euler { 11 | 12 | struct el_global_option { 13 | int log_level = __INFO; 14 | bool verbose = false; // for EULER_VERBOSE 15 | bool initialized = false; 16 | }; 17 | 18 | void el_init(); 19 | 20 | extern el_global_option ego; 21 | 22 | }; // namespace euler 23 | -------------------------------------------------------------------------------- /src/common/el_log.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "el_def.hpp" 4 | #include "el_utils.hpp" 5 | 6 | namespace euler { 7 | 8 | int el_log(int severity, const char *fmt, ...); 9 | 10 | static inline void el_error(const char *msg) { 11 | el_log(__ERROR, "%s", msg); 12 | } 13 | 14 | static inline void el_warn(const char *msg) { 15 | el_log(__WARN, "%s", msg); 16 | } 17 | 18 | static inline void el_info(const char *msg) { 
19 | el_log(__INFO, "%s", msg); // informational severity; __ERROR here would make el_log() abort the process 20 | } 21 | 22 | } // euler 23 | -------------------------------------------------------------------------------- /tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # suppress gflags warning #3280 2 | remove_definitions(-Werror) 3 | remove_definitions(-Wshadow) 4 | 5 | file (GLOB __test_sources 6 | elt_conv.cpp 7 | elt_conv_utils.cpp 8 | elt_gflag.cpp) 9 | 10 | set(gflags_BUILD_STATIC_LIBS ON) 11 | set(GFLAGS_SRC_DIR gflags CACHE STRING "gflags source from submodules") 12 | add_subdirectory("${GFLAGS_SRC_DIR}") 13 | include_directories(gflags) 14 | 15 | add_executable(elt_conv ${__test_sources}) 16 | target_link_libraries(elt_conv ${lib_name} iomp5 gflags) 17 | add_test(NAME elt_conv COMMAND elt_conv) 18 | -------------------------------------------------------------------------------- /src/common/el_log.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "el_utils.hpp" 3 | #include "el_init.hpp" 4 | 5 | namespace euler { 6 | 7 | int el_log(int severity, const char *fmt, ...) 
{ 8 | int ret = 0; 9 | if (severity >= ego.log_level) { 10 | va_list ap; 11 | 12 | fprintf(stdout, "euler-%s, ", log_severity_to_string(severity)); 13 | va_start(ap, fmt); 14 | ret = vfprintf(stdout, fmt, ap); 15 | va_end(ap); 16 | fputs("\n", stdout); 17 | } 18 | 19 | if (severity >= __ERROR && severity != __PERF_TRACE) { 20 | fputs("Euler abort due to error\n", stdout); 21 | abort(); 22 | } 23 | 24 | return ret; 25 | } 26 | 27 | }; // euler 28 | -------------------------------------------------------------------------------- /src/elx_stream.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | namespace euler { 9 | 10 | struct elx_conv_t; 11 | 12 | class elx_stream { 13 | public: 14 | elx_stream(); 15 | ~elx_stream(); 16 | void submit(elx_conv_t *ep); 17 | void wait(elx_conv_t *ep); 18 | int run(); 19 | 20 | private: 21 | elx_stream& operator=(const elx_stream&) = delete; 22 | elx_stream(const elx_stream&) = delete; 23 | 24 | std::queue _stream; 25 | mutable std::mutex _mutex; 26 | std::condition_variable _cond; 27 | std::thread *_threadx; 28 | }; 29 | 30 | extern elx_stream global_stream; 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/elx_int8_conv_wino_bind.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_int8_conv_wino.hpp" 4 | 5 | namespace euler { 6 | 7 | Template_elx_int8_conv_wino_t void 8 | Instance_elx_int8_conv_wino_t::bind_execute_functions() { 9 | #define EXECUTE_CASE(n) \ 10 | case 0x##n: \ 11 | execute_opt_ = &Instance_elx_int8_conv_wino_t::__execute_##n; \ 12 | break 13 | 14 | switch (xopt_) { 15 | EXECUTE_CASE(a133); 16 | EXECUTE_CASE(a161); 17 | EXECUTE_CASE(a173); 18 | default: 19 | el_error("Unimplemented"); 20 | break; 21 | } 22 | } 23 | 24 | } // namespace euler 25 | 
-------------------------------------------------------------------------------- /src/elx_conv_wino_bind.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_conv_wino.hpp" 4 | 5 | namespace euler { 6 | 7 | Template_elx_conv_wino_t void 8 | Instance_elx_conv_wino_t::bind_execute_functions() { 9 | #define EXECUTE_CASE(n) \ 10 | case 0x##n: \ 11 | execute_opt_ = &Instance_elx_conv_wino_t::__execute_##n; \ 12 | break 13 | 14 | switch (xopt_) { 15 | EXECUTE_CASE(a000); 16 | EXECUTE_CASE(a033); 17 | EXECUTE_CASE(a061); 18 | EXECUTE_CASE(a071); 19 | EXECUTE_CASE(a073); 20 | default: 21 | el_error("Unimplemented"); 22 | break; 23 | } 24 | } 25 | 26 | } // namespace euler 27 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-1s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 2S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | # resnet50_res2a_branch2b 12 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=5 --execution-mode=0xa061 --blk-i=4 --blk-o=4 --flt-t=30 $COMMON 13 | sleep 1 14 | # resnet50_res3a_branch2b 15 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=5 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 16 | sleep 1 17 | # resnet50_res4a_branch2b 18 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=5 --execution-mode=0xa061 --blk-i=8 --blk-o=4 --flt-t=29 --pat-o=4 $COMMON 19 | sleep 1 20 | # resnet50_res5a_branch2b 21 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=5 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 22 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-2s-wino-f5_3.sh: 
-------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 2S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | # resnet50_res2a_branch2b, 11.8T 12 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=7 --execution-mode=0xa061 --blk-i=4 --blk-o=4 --flt-t=15 $COMMON 13 | sleep 1 14 | # resnet50_res3a_branch2b, 11.9T 15 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=7 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=21 $COMMON 16 | sleep 1 17 | # resnet50_res4a_branch2b, 10.0T 18 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 19 | sleep 1 20 | # resnet50_res5a_branch2b, 6.7T 21 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 22 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-1s-wino-f2_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 2S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | # resnet50_res2a_branch2b 12 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=4 --execution-mode=0xa061 --blk-i=4 --blk-o=2 --flt-t=29 $COMMON 13 | sleep 1 14 | # resnet50_res3a_branch2b 15 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=4 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=25 $COMMON 16 | sleep 1 17 | # resnet50_res4a_branch2b 18 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=4 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=28 --pat-o=1 $COMMON 19 | sleep 1 20 | # resnet50_res5a_branch2b 21 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 
--tile-size=4 --execution-mode=0xa061 --blk-i=32 --blk-o=4 --flt-o=2 --flt-t=14 --pat-o=4 $COMMON 22 | 23 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-2s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 2S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | # resnet50_res2a_branch2b, 11.8T 12 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=4 --blk-o=4 --flt-t=28 $COMMON 13 | sleep 1 14 | # resnet50_res3a_branch2b, 11.9T 15 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=28 $COMMON 16 | sleep 1 17 | # resnet50_res4a_branch2b, 10.0T 18 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=8 --blk-o=4 --flt-t=19 --pat-o=4 $COMMON 19 | sleep 1 20 | # resnet50_res5a_branch2b, 6.7T 21 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=6 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 22 | -------------------------------------------------------------------------------- /src/el_init.cpp: -------------------------------------------------------------------------------- 1 | #include "el_utils.hpp" 2 | #include "el_init.hpp" 3 | 4 | namespace euler { 5 | 6 | el_global_option ego; 7 | 8 | __attribute__((constructor)) void el_init() { 9 | 10 | if (ego.initialized) 11 | return; 12 | 13 | #if __ICC_COMPILER 14 | _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); 15 | _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); 16 | #endif 17 | 18 | // PERF_TRACE 19 | auto env_verbose = ::getenv("EULER_VERBOSE"); 20 | if (env_verbose != nullptr && env_verbose[0] == '1') { 21 | ego.verbose = 1; 22 | } 23 | 24 | auto env_log_level = ::getenv("EULER_LOG_LEVEL"); 25 | 
if (env_log_level != nullptr) { 26 | ego.log_level = atoi(env_log_level); 27 | } 28 | 29 | if (ego.verbose > 0) 30 | el_log(__INFO, "Version: %s, MT_RUNTIME: %s", 31 | XSTRINGIFY(EULER_VERSION), mt_runtime_to_string(MT_RUNTIME)); 32 | 33 | ego.initialized = true; 34 | } 35 | 36 | } // namespace euler 37 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-2s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 2S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | # resnet50_res2a_branch2b, 11.8T 12 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=5 --execution-mode=0xa061 --blk-i=4 --blk-o=4 --flt-t=28 $COMMON 13 | sleep 1 14 | # resnet50_res3a_branch2b, 11.9T 15 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=5 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 16 | sleep 1 17 | # resnet50_res4a_branch2b, 10.0T 18 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=5 --execution-mode=0xa061 --blk-i=8 --blk-o=4 --flt-t=23 --pat-o=4 $COMMON 19 | sleep 1 20 | # resnet50_res5a_branch2b, 6.7T 21 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=5 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 22 | 23 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-wino-f5_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 5.2T 10 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=7 --execution-mode=0xa061 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=11 --pat-o=2 --output-as-blocked=true $COMMON 
11 | sleep 1 12 | # resnet50_res3a_branch2b, 4.4T 13 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=9 $COMMON 14 | sleep 1 15 | # resnet50_res4a_branch2b, 4.2T 16 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=7 --execution-mode=0xa000 --blk-i=4 --blk-o=4 --flt-o=2 --flt-t=9 $COMMON 17 | sleep 1 18 | # resnet50_res5a_branch2b, 2.5T 19 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=4 --flt-t=4 $COMMON 20 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-2s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 8.4T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=6 --execution-mode=0xa061 --blk-i=4 --blk-o=2 --flt-t=7 --pat-o=2 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # resnet50_res3a_branch2b, 6.3T 13 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=6 --execution-mode=0xa061 --blk-i=8 --blk-o=1 --flt-t=7 --pat-o=8 $COMMON 14 | sleep 1 15 | # resnet50_res4a_branch2b, 5.3T 16 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=6 --execution-mode=0xa000 --output-as-blocked=true --blk-i=8 --blk-o=2 --flt-t=4 $COMMON 17 | sleep 1 18 | # resnet50_res5a_branch2b, 3.6T 19 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=6 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=4 $COMMON 20 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-2s-wino-f5_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # 
batch-size: 1 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 8.4T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=7 --execution-mode=0xa061 --blk-i=4 --blk-o=2 --flt-t=6 --pat-o=2 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # resnet50_res3a_branch2b, 6.3T 13 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=5 --output-as-blocked=true $COMMON 14 | sleep 1 15 | # resnet50_res4a_branch2b, 5.3T 16 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=4 --flt-t=3 --output-as-blocked=true $COMMON 17 | sleep 1 18 | # resnet50_res5a_branch2b, 3.6T 19 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=4 --flt-t=4 $COMMON 20 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-2s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 8.4T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=5 --execution-mode=0xa061 --blk-i=4 --blk-o=2 --flt-t=13 --pat-o=2 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # resnet50_res3a_branch2b, 6.3T 13 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=5 --execution-mode=0xa061 --blk-i=8 --blk-o=1 --flt-t=15 --pat-o=8 --output-as-blocked=true $COMMON 14 | sleep 1 15 | # resnet50_res4a_branch2b, 5.3T 16 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=5 --execution-mode=0xa000 --blk-i=8 --blk-o=1 --flt-t=9 --output-as-blocked=true $COMMON 17 | sleep 1 18 | # resnet50_res5a_branch2b, 3.6T 19 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 
-H7 -n1 --tile-size=5 --execution-mode=0xa000 --blk-i=8 --blk-o=4 --flt-t=9 $COMMON 20 | 21 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --f16c-opt=1" 9 | 10 | # resnet50_res2a_branch2b, 5.2T 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=6 --execution-mode=0xa033 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 --output-as-blocked=true $COMMON 12 | sleep 1 13 | # resnet50_res3a_branch2b, 7.0 - 7.5T 14 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=6 --execution-mode=0xa033 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=7 --pat-o=1 $COMMON 15 | sleep 1 16 | # resnet50_res4a_branch2b, 4.2T 17 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=4 --execution-mode=0xa033 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=13 $COMMON 18 | sleep 1 19 | # resnet50_res5a_branch2b, 2.5T 20 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=4 --execution-mode=0xa033 --blk-i=16 --blk-o=1 --flt-o=2 --flt-t=8 $COMMON 21 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 5.2T 10 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=5 --execution-mode=0xa061 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=7 --pat-o=2 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # resnet50_res3a_branch2b, 4.4T 13 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=5 
--execution-mode=0xa061 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=8 --pat-i=1 --pat-o=4 --output-as-blocked=true $COMMON 14 | sleep 1 15 | # resnet50_res4a_branch2b, 4.2T 16 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=5 --execution-mode=0xa000 --blk-i=16 --blk-o=16 --flt-o=1 --flt-t=25 $COMMON 17 | sleep 1 18 | # resnet50_res5a_branch2b, 2.5T 19 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=5 --execution-mode=0xa000 --blk-i=16 --blk-o=32 --flt-o=1 --flt-t=9 $COMMON 20 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-2s-wino-f2_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 2S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | # resnet50_res2a_branch2b, 11.8T tflops:9.7289 12 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=4 --execution-mode=0xa061 --blk-i=4 --blk-o=2 --flt-t=29 $COMMON 13 | sleep 1 14 | # resnet50_res3a_branch2b, 11.9T tflops:10.9736 15 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=4 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=25 $COMMON 16 | sleep 1 17 | # resnet50_res4a_branch2b, 10.0Ti tflops:10.6828 18 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=4 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=28 --pat-o=1 $COMMON 19 | sleep 1 20 | # resnet50_res5a_branch2b, 6.7T tflops:6.97863 21 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=4 --execution-mode=0xa061 --blk-i=32 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=4 $COMMON 22 | -------------------------------------------------------------------------------- /scripts/regression-fp32.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # asymmetric padding, k3 s2 4 | NSOCKETS=1 
./scripts/run.sh -c -i3 -h224 -o32 -H112 -k3 -K3 -s2 -S2 -p1 -P1 -n1 -adirect --execution-mode=0xc060 --blk-i=1 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=1 --input-format=nchw --output-format=nChw16c --weights-format=hwio -v1 5 | NSOCKETS=1 ./scripts/run.sh -c -i3 -h224 -o32 -H112 -k3 -K3 -s2 -S2 -p0 -P0 -n1 -adirect --execution-mode=0xc060 --blk-i=1 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=1 --input-format=nchw --output-format=nChw16c --weights-format=hwio -v1 6 | 7 | # asymmetric padding, k7 s2 8 | NSOCKETS=1 ./scripts/run.sh -c -i3 -h224 -o64 -H112 -k7 -K7 -s2 -S2 -p3 -P3 -n1 -adirect --execution-mode=0xc060 --blk-i=1 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 --input-format=nchw --output-format=nChw16c --weights-format=hwio -v1 9 | NSOCKETS=1 ./scripts/run.sh -c -i3 -h224 -o64 -H112 -k7 -K7 -s2 -S2 -p2 -P2 -n1 -adirect --execution-mode=0xc060 --blk-i=1 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 --input-format=nchw --output-format=nChw16c --weights-format=hwio -v1 10 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-1s.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 64 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --f16c-opt=1" 9 | 10 | # resnet50_res2a_branch2b, 5.2T 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=2 --output-as-blocked=true $COMMON 12 | sleep 1 13 | # resnet50_res3a_branch2b, 7.0 - 7.5T 14 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=4 $COMMON 15 | sleep 1 16 | # resnet50_res4a_branch2b, 4.2T 17 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=16 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=8 
$COMMON 18 | sleep 1 19 | # resnet50_res5a_branch2b, 2.5T 20 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=6 --execution-mode=0xa033 --blk-i=8 --blk-o=8 --flt-t=26 --pat-i=1 --pat-o=1 $COMMON 21 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-2s-wino-f2_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 8.4T tflops: 8.27005 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=4 --execution-mode=0xa061 --blk-i=4 --blk-o=4 --flt-t=14 --pat-o=1 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # resnet50_res3a_branch2b, 6.3Tv tflops:7.44817 13 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=4 --execution-mode=0xa061 --blk-i=8 --blk-o=2 --flt-t=7 --pat-o=2 --output-as-blocked=true $COMMON 14 | sleep 1 15 | # resnet50_res4a_branch2b, 5.3T tflops:5.73528 16 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=4 --execution-mode=0xa000 --blk-i=2 --blk-o=8 --flt-t=7 --output-as-blocked=true $COMMON 17 | sleep 1 18 | # resnet50_res5a_branch2b, 3.6T tflops:4.3533 19 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=4 --execution-mode=0xa000 --blk-i=16 --blk-o=4 --flt-t=8 $COMMON 20 | 21 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-1s-wino-f5_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 2S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | 12 | # resnet50_res2a_branch2b 13 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=7 --execution-mode=0xa061 --blk-i=4 --blk-o=4 --flt-t=15 
$COMMON 14 | sleep 1 15 | # resnet50_res3a_branch2b 16 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=7 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=21 $COMMON 17 | sleep 1 18 | # resnet50_res4a_branch2b 19 | #NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 20 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=7 --execution-mode=0xa073 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=4 --pat-o=8 $COMMON 21 | sleep 1 22 | # resnet50_res5a_branch2b 23 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=7 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-t=29 $COMMON 24 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n12-8180-1s-wino-int8.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --f16c-opt=0" 9 | 10 | # resnet50_res2a_branch2b 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n12 --tile-size=6 --execution-mode=0xa161 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 --output-as-blocked=true $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 12 | # resnet50_res3a_branch2b 13 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n12 --tile-size=6 --execution-mode=0xa161 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=2 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 14 | # resnet50_res4a_branch2b 15 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n12 --tile-size=6 --execution-mode=0xa133 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 16 | # resnet50_res5a_branch2b 17 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n12 --tile-size=6 --execution-mode=0xa133 --blk-i=32 --blk-o=2 
--flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 18 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-1s-wino-int8.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --f16c-opt=0" 9 | 10 | # resnet50_res2a_branch2b 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=6 --execution-mode=0xa161 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 --output-as-blocked=true $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 12 | # resnet50_res3a_branch2b 13 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=6 --execution-mode=0xa161 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=2 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 14 | # resnet50_res4a_branch2b 15 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=6 --execution-mode=0xa161 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=4 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 16 | # resnet50_res5a_branch2b 17 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=6 --execution-mode=0xa133 --blk-i=32 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 18 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 5.2T 10 | #NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=6 --execution-mode=0xa061 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=7 --pat-o=2 
--output-as-blocked=true $COMMON 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=6 --execution-mode=0xa061 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=2 --output-as-blocked=true $COMMON 12 | sleep 1 13 | # resnet50_res3a_branch2b, 7.0 - 7.5T 14 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=6 --execution-mode=0xa073 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=7 --pat-i=1 --pat-o=4 --output-as-blocked=true $COMMON 15 | sleep 1 16 | # resnet50_res4a_branch2b, 4.2T 17 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=6 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-o=1 --flt-t=16 $COMMON 18 | sleep 1 19 | # resnet50_res5a_branch2b, 2.5T 20 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=6 --execution-mode=0xa000 --blk-i=8 --blk-o=8 --flt-o=2 --flt-t=4 $COMMON 21 | -------------------------------------------------------------------------------- /tests/elt_utils.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include 5 | #include "euler.hpp" 6 | 7 | 8 | namespace euler { 9 | namespace test { 10 | 11 | class timer { 12 | typedef std::chrono::high_resolution_clock Time; 13 | typedef std::chrono::duration Duration; 14 | 15 | public: 16 | timer(): duration_(0.0) { } 17 | void start() { start_ = Time::now(); } 18 | void stop() 19 | { 20 | Time::time_point end = Time::now(); 21 | double d = Duration(end - start_).count(); 22 | duration_ += d; 23 | } 24 | double duration() { return duration_; } 25 | void report_tflops(std::string& name, size_t num_iters, size_t num_ops) 26 | { 27 | double ms = duration_ / num_iters; 28 | double tflops = num_ops / ms / 1e9; 29 | std::cout << name << ": num_iters=" << num_iters 30 | << ", num_ops=" << num_ops << ", ms=" << ms 31 | << ", tflops=" << tflops << "\n"; 32 | } 33 | 34 | private: 35 | Time::time_point start_; 36 | double duration_; 37 | }; 38 | 39 | inline void 
error(const char *msg) 40 | { 41 | printf("Euler:test::Error: %s\n", msg); 42 | abort(); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/kernel/elk_conv_wino.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_conv.hpp" 4 | 5 | namespace euler { 6 | 7 | // A: tile size 8 | // K: kernel size 9 | // V: vector size 10 | // I: ISA 11 | // format: C/D/E/F 12 | // is_border 13 | // with_bias: has bias 14 | // with_relu 15 | // with_ip_sum 16 | 17 | template 19 | struct elk_conv_wino_trans_input { 20 | static void execute(elx_param_t &ep, TinputType *tinput, 21 | InputType *input, int hA_start, int hA_end, int wA_start, int wA_end); 22 | }; 23 | 24 | template 27 | struct elk_conv_wino_trans_output { 28 | static void execute(elx_param_t &ep, OutputType *output, 29 | ToutputType *toutput, BiasType *bias, int hOA_end, int wOA_end); 30 | }; 31 | 32 | template 34 | struct elk_conv_wino_trans_weights { 35 | static void execute( 36 | TweightsType atweights[A][A][V][V], WeightsType aweights[K][K][V][V]); 37 | }; 38 | 39 | } // namespace euler 40 | -------------------------------------------------------------------------------- /src/elx_conv_wino_gemm.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kernel/elk_gemm_binder.hxx" 4 | 5 | namespace euler { 6 | 7 | template 8 | class elx_conv_wino_gemm_t { 9 | public: 10 | using TinputType = typename GarrayTypes::InputType; 11 | using TweightsType = typename GarrayTypes::WeightsType; 12 | using ToutputType = typename GarrayTypes::OutputType; 13 | 14 | elx_conv_wino_gemm_t() {}; 15 | virtual ~elx_conv_wino_gemm_t() {}; 16 | void setup(elx_param_t *ep); 17 | 18 | // FP32 GEMM 19 | void execute(ToutputType *toutput, TinputType *tinput, 20 | TweightsType *tweights, int _t2, int Tz, int _I4 = 0); 21 | 22 | void execute_na(ToutputType 
*toutput, TinputType *tinput, 23 | TweightsType *tweights, int _t2, int Tz, int _I4); 24 | 25 | void execute(ToutputType *toutput, TinputType *tinput, 26 | TweightsType *tweights, int _I4 = 0); 27 | 28 | void execute_na(ToutputType *toutput, TinputType *tinput, 29 | TweightsType *tweights, int _I4 = 0); 30 | 31 | private: 32 | void bind_kernel_functions(); 33 | 34 | using ker_type = typename gemm_kernel_binder::kgemm; 35 | ker_type *ker_gemm_; 36 | ker_type *ker_gemm0_; 37 | 38 | int attr_; 39 | int mthr_; 40 | elx_param_t *ep = nullptr; 41 | }; 42 | 43 | } // namespace euler 44 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-1s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | #/bin/bash 4 | 5 | # Resnet50 6 | # batch size: 64 7 | # SKX 8180 1S 8 | 9 | source ./scripts/best_configs/common.sh $@ 10 | 11 | 12 | # resnet50_res2a_branch2b 13 | #NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=4 --blk-o=4 --flt-t=28 $COMMON 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=2 --output-as-blocked=true $COMMON 15 | sleep 1 16 | # resnet50_res3a_branch2b 17 | #NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=8 --blk-o=8 --flt-t=28 $COMMON 18 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=13 --pat-o=4 $COMMON 19 | sleep 1 20 | # resnet50_res4a_branch2b 21 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 --tile-size=6 --execution-mode=0xa061 --blk-i=8 --blk-o=4 --flt-t=19 --pat-o=4 $COMMON 22 | sleep 1 23 | # resnet50_res5a_branch2b 24 | #NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=6 --execution-mode=0xa000
--blk-i=8 --blk-o=8 --flt-t=29 $COMMON 25 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 --tile-size=6 --execution-mode=0xa033 --blk-i=8 --blk-o=8 --flt-t=26 --pat-i=1 --pat-o=1 $COMMON 26 | -------------------------------------------------------------------------------- /NOTATION: -------------------------------------------------------------------------------- 1 | Naming conventions 2 | ================== 3 | 4 | Files, functions, classes: 5 | eld: descriptor 6 | elx: execution function (stateless) 7 | elk: kernel function (stateless) 8 | elt: test 9 | 10 | Numbering: 11 | Cardinal number 12 | // dimension (plain) 13 | n, ic, oc, ih, iw, oh, ow, kh, kw, t 14 | // blocked dimension 15 | ic2, oc2, ih2, iw2, oh2, ow2, t2 // blocked 16 | ic3, oc3, t3, ... // 2nd level blocked 17 | // blocking unit, normally IV = OV = I = O 18 | V, A, T 19 | // 2nd/3rd level blocking unit 20 | I2, O2, I3, O3 21 | Ordinal number 22 | // dimension 23 | _n, _ic, _oc, _ih, _iw, _oh, _ow, _kh, _kw, ... 24 | // blocked dimension 25 | _ic2, _oc2, _ih2, _iw2, _oh2, _ow2, _t2 // blocked 26 | _ic3, _oc3, _t3, ...
// 2nd level blocked 27 | // blocking unit 28 | _V, _T, _IV, _OV, _hA, _wA 29 | // 2nd/3rd level blocking unit 30 | _I2, _O2, _I3, _O3 31 | template: 32 | T: typename 33 | V: vector-size 34 | A: tile-size 35 | K: kernel-size 36 | 37 | Tensor: 38 | // pointer type 39 | input, output, weights, bias 40 | // md-array 41 | ainput, aoutput, aweights 42 | // tensor for transform (from or to) 43 | tinput, toutput, tweights 44 | // transformed md-array 45 | atinput, atoutput, atweights 46 | -------------------------------------------------------------------------------- /src/elx_int8_conv_wino_gemm.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "kernel/elk_u8s8_gemm_binder.hxx" 4 | 5 | namespace euler { 6 | 7 | template 8 | class elx_int8_conv_wino_gemm_t { 9 | public: 10 | using TinputType = typename GarrayTypes::InputType; 11 | using TweightsType = typename GarrayTypes::WeightsType; 12 | using ToutputType = typename GarrayTypes::OutputType; 13 | 14 | elx_int8_conv_wino_gemm_t() {}; 15 | virtual ~elx_int8_conv_wino_gemm_t() {}; 16 | void setup(elx_param_t *ep); 17 | 18 | // INT8 GEMM 19 | void execute(ToutputType *toutput, uint8_t *tinput, int8_t *tweights, 20 | float *src_scale, float *weights_scale, float *weights_shift, 21 | int _t2, int Tz, int _I4 = 0); 22 | 23 | void execute_na(ToutputType *toutput, uint8_t *tinput, int8_t *tweights, 24 | float *src_scale, float *weights_scale, float *weights_shift, 25 | int _t2, int Tz, int _I4 = 0); 26 | 27 | void execute_na(ToutputType *toutput, uint8_t *tinput, int8_t *tweights, 28 | float *src_scale, float *src_shift, float *weights_scale, 29 | float *weights_shift, int _I4 = 0); 30 | 31 | private: 32 | void bind_kernel_functions(); 33 | 34 | using i8_ker_type = typename u8s8_gemm_kernel_binder::kgemm; 35 | 36 | i8_ker_type *ker_u8s8_gemm_; 37 | i8_ker_type *ker_u8s8_gemm0_; 38 | 39 | int attr_; 40 | int mthr_; 41 | elx_param_t *ep = nullptr; 42 | }; 43 | 
44 | } // namespace euler 45 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-wino-f2_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | export NSOCKETS=1 9 | # resnet50_res2a_branch2b, 5.2T tflops=4.83548 10 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=4 --execution-mode=0xa061 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=2 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # resnet50_res3a_branch2b, 4.4T tflops:3.68377 13 | #NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=4 --execution-mode=0xa061 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=7 --pat-o=4 --output-as-blocked=true $COMMON 14 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=4 --execution-mode=0xa061 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=2 $COMMON 15 | sleep 1 16 | # resnet50_res4a_branch2b, 4.2T tflops:3.18039 17 | #NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=4 --execution-mode=0xa000 --blk-i=16 --blk-o=16 --flt-o=1 --flt-t=25 $COMMON 18 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=4 --execution-mode=0xa000 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=13 $COMMON 19 | sleep 1 20 | # resnet50_res5a_branch2b, 2.5T tflops:2.27461 21 | #NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=4 --execution-mode=0xa000 --blk-i=32 --blk-o=32 --flt-o=1 --flt-t=16 $COMMON 22 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=4 --execution-mode=0xa000 --blk-i=16 --blk-o=1 --flt-o=2 --flt-t=8 $COMMON 23 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-1s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 |
3 | # VGG 4 | # batch-size: 64 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | 10 | # vgg19_conv1_2 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --tile-size=5 --blk-i=4 --blk-o=4 --flt-t=30 --execution-mode=0xa061 --output-as-blocked=true $COMMON 12 | sleep 1 13 | # vgg19_conv2_1 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --tile-size=5 --blk-i=4 --blk-o=8 --flt-t=26 --execution-mode=0xa061 $COMMON 15 | sleep 1 16 | # vgg19_conv2_2 17 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=26 --execution-mode=0xa061 $COMMON 18 | sleep 1 19 | # vgg19_conv3_1 20 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=26 --pat-o=2 --execution-mode=0xa061 $COMMON 21 | sleep 1 22 | # vgg19_conv3_2 23 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=26 --pat-o=2 --execution-mode=0xa061 $COMMON 24 | sleep 1 25 | # vgg19_conv4_1 26 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=25 --execution-mode=0xa000 --streaming-input=2 $COMMON 27 | sleep 1 28 | # vgg19_conv4_2 29 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --tile-size=5 --blk-i=8 --blk-o=4 --flt-t=25 --execution-mode=0xa000 --streaming-input=2 $COMMON 30 | sleep 1 31 | # vgg19_conv5_1 32 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n64 --tile-size=5 --blk-i=8 --blk-o=4 --flt-t=25 --execution-mode=0xa000 --streaming-input=2 $COMMON 33 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-direct-nhwc.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --input-format=nhwc
--output-format=nhwc --f16c-opt=1" 9 | 10 | # resnet50_res2a_branch2b, 5.2T 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 -adirect --execution-mode=0xc060 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 $COMMON 12 | sleep 1 13 | # resnet50_res3a_branch2b, 7.0 - 7.5T 14 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 -adirect --execution-mode=0xc060 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 $COMMON 15 | sleep 1 16 | # resnet50_res4a_branch2b, 4.2T 17 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 -adirect --execution-mode=0xc060 --blk-i=16 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON --f16c-opt=1 18 | sleep 1 19 | # resnet50_res5a_branch2b, 2.5T 20 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 -adirect --execution-mode=0xc060 --blk-i=16 --blk-o=1 --flt-o=2 --flt-t=7 --pat-i=2 $COMMON --f16c-opt=1 21 | sleep 1 22 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H28 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc060 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON --f16c-opt=1 23 | sleep 1 24 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H14 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc060 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON --f16c-opt=1 25 | sleep 1 26 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H7 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc070 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=7 --pat-i=4 $COMMON --f16c-opt=1 27 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s-wino-f5_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # vgg19_conv1_2, 7.0T 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=4 --flt-t=15 --tile-size=7 --execution-mode=0xa061 --output-as-blocked=true $COMMON 12 | 
sleep 1 13 | # vgg19_conv2_1, 7.3T 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=8 --flt-t=19 --tile-size=7 --execution-mode=0xa061 $COMMON 15 | sleep 1 16 | # vgg19_conv2_2, 7.4T 17 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=8 --flt-t=19 --tile-size=7 --execution-mode=0xa061 $COMMON 18 | sleep 1 19 | # vgg19_conv3_1, 7.3T 20 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=8 --flt-t=18 --tile-size=7 --execution-mode=0xa000 --output-as-blocked=true $COMMON 21 | sleep 1 22 | # vgg19_conv3_2, 7.3T 23 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=8 --flt-t=12 --tile-size=7 --execution-mode=0xa000 $COMMON 24 | sleep 1 25 | # vgg19_conv4_1, 6.6T 26 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=8 --flt-t=18 --tile-size=7 --execution-mode=0xa000 --streaming-input=2 $COMMON 27 | sleep 1 28 | # vgg19_conv4_2, 6.4T 29 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=18 --tile-size=7 --execution-mode=0xa000 --streaming-input=2 $COMMON 30 | sleep 1 31 | # vgg19_conv5_1, 5.2T 32 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=4 --flt-t=9 --tile-size=7 --execution-mode=0xa000 $COMMON 33 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-2s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # vgg19_conv1_2, 12.8T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=4 --flt-t=28 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # vgg19_conv2_1, 13.5T 13 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=4 --flt-t=14 --tile-size=6 
--pat-o=2 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # vgg19_conv2_2, 14.3T 16 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=8 --flt-t=14 --tile-size=6 --execution-mode=0xa061 $COMMON 17 | sleep 1 18 | # vgg19_conv3_1, 13.0T 19 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=4 --flt-t=14 --pat-o=4 --tile-size=6 --execution-mode=0xa061 $COMMON 20 | sleep 1 21 | # vgg19_conv3_2, 12.6T 22 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=4 --flt-t=14 --pat-o=4 --tile-size=6 --execution-mode=0xa061 $COMMON 23 | sleep 1 24 | # vgg19_conv4_1, 10.2T 25 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=13 --tile-size=6 --execution-mode=0xa000 --streaming-input=1 $COMMON 26 | sleep 1 27 | # vgg19_conv4_2, 10.5T 28 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=26 --tile-size=6 --execution-mode=0xa000 --streaming-input=1 $COMMON 29 | sleep 1 30 | # vgg19_conv5_1, 7.3T 31 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=4 --flt-t=16 --tile-size=6 --execution-mode=0xa000 $COMMON 32 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-2s-wino-f5_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # vgg19_conv1_2, 12.8T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=4 --flt-t=19 --tile-size=7 --execution-mode=0xa061 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # vgg19_conv2_1, 13.5T 13 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=4 --flt-t=19 --pat-o=2 --tile-size=7 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # vgg19_conv2_2, 14.3T 16 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h112 -o128 
-H112 -n1 --blk-i=8 --blk-o=4 --flt-t=19 --pat-o=2 --tile-size=7 --execution-mode=0xa061 $COMMON 17 | sleep 1 18 | # vgg19_conv3_1, 13.0T 19 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=4 --flt-t=18 --tile-size=7 --execution-mode=0xa000 --streaming-input=2 $COMMON 20 | sleep 1 21 | # vgg19_conv3_2, 12.6T 22 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=8 --flt-t=12 --pat-o=1 --tile-size=7 --execution-mode=0xa000 $COMMON 23 | sleep 1 24 | # vgg19_conv4_1, 10.2T 25 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=18 --tile-size=7 --execution-mode=0xa000 --streaming-input=2 $COMMON 26 | sleep 1 27 | # vgg19_conv4_2, 10.5T 28 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=18 --tile-size=7 --execution-mode=0xa000 --streaming-input=2 $COMMON 29 | sleep 1 30 | # vgg19_conv5_1, 7.3T 31 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=4 --flt-t=9 --tile-size=7 --execution-mode=0xa000 $COMMON 32 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-2s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 64 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # vgg19_conv1_2, 12.2T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --tile-size=5 --blk-i=4 --blk-o=4 --flt-t=30 --execution-mode=0xa061 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # vgg19_conv2_1, 13.3T 13 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --tile-size=5 --blk-i=4 --blk-o=8 --flt-t=26 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # vgg19_conv2_2, 14.2T 16 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=26 --execution-mode=0xa061 $COMMON 17 | sleep 1 18 | # vgg19_conv3_1, 
13.9T 19 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=26 --pat-o=2 --execution-mode=0xa061 $COMMON 20 | sleep 1 21 | # vgg19_conv3_2, 11.2T 22 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=26 --pat-o=2 --execution-mode=0xa061 $COMMON 23 | sleep 1 24 | # vgg19_conv4_1, 10.0T 25 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --tile-size=5 --blk-i=8 --blk-o=8 --flt-t=31 --execution-mode=0xa000 --streaming-input=2 $COMMON 26 | sleep 1 27 | # vgg19_conv4_2, 10.8T 28 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --tile-size=5 --blk-i=8 --blk-o=4 --flt-t=31 --execution-mode=0xa000 --streaming-input=2 $COMMON 29 | sleep 1 30 | # vgg19_conv5_1, 10.6T 31 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n64 --tile-size=5 --blk-i=8 --blk-o=4 --flt-t=31 --execution-mode=0xa000 --streaming-input=2 $COMMON 32 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-2s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 64 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # vgg19_conv1_2, 12.2T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --tile-size=6 --blk-i=4 --blk-o=4 --flt-t=28 --execution-mode=0xa061 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # vgg19_conv2_1, 13.3T 13 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=4 --blk-o=8 --flt-t=28 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # vgg19_conv2_2, 14.2T 16 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=28 --execution-mode=0xa061 $COMMON 17 | sleep 1 18 | # vgg19_conv3_1, 13.9T 19 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=28 
--pat-o=2 --execution-mode=0xa061 $COMMON 20 | sleep 1 21 | # vgg19_conv3_2, 11.2T 22 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=28 --pat-o=2 --execution-mode=0xa061 $COMMON 23 | sleep 1 24 | # vgg19_conv4_1, 10.0T 25 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=31 --execution-mode=0xa000 --streaming-input=2 $COMMON 26 | sleep 1 27 | # vgg19_conv4_2, 10.8T 28 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=4 --flt-t=31 --execution-mode=0xa000 --streaming-input=2 $COMMON 29 | sleep 1 30 | # vgg19_conv5_1, 10.6T 31 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n64 --tile-size=6 --blk-i=8 --blk-o=4 --flt-t=28 --execution-mode=0xa000 --streaming-input=2 $COMMON 32 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-int8-rd.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 5.2T 10 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=6 --execution-mode=0xa161 --blk-i=1 --blk-o=1 --flt-o=2 --flt-t=7 --pat-o=2 --output-as-blocked=true --input-data-file=tests/data/ResNet50/res2_0_branch2c/op11_input__res2_0_branch2b_bn --weights-data-file=tests/data/ResNet50/res2_0_branch2c/op11_weights__res2_0_branch2c_w -b0 $COMMON 11 | sleep 1 12 | # resnet50_res3a_branch2b, 7.0 - 7.5T 13 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=6 --execution-mode=0xa161 --blk-i=2 --blk-o=1 --flt-o=2 --flt-t=7 --pat-o=4 --input-data-file=tests/data/ResNet50/res3_0_branch2c/op43_input__res3_0_branch2b_bn --weights-data-file=tests/data/ResNet50/res3_0_branch2c/op43_weights__res3_0_branch2c_w -b0 $COMMON 14 | sleep 1 15 | # 
resnet50_res4a_branch2b, 4.2T 16 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=5 --execution-mode=0xa133 --blk-i=4 --blk-o=2 --flt-o=4 --flt-t=5 --input-data-file=tests/data/ResNet50/res4_0_branch2c/op85_input__res4_0_branch2b_bn --weights-data-file=tests/data/ResNet50/res4_0_branch2c/op85_weights__res4_0_branch2c_w -b0 $COMMON 17 | sleep 1 18 | # resnet50_res5a_branch2b, 2.5T 19 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=4 --execution-mode=0xa133 --blk-i=8 --blk-o=1 --flt-o=1 --flt-t=26 --input-data-file=tests/data/ResNet50/res5_0_branch2c/op147_input__res5_0_branch2b_bn --weights-data-file=tests/data/ResNet50/res5_0_branch2c/op147_weights__res5_0_branch2c_w -b0 $COMMON 20 | -------------------------------------------------------------------------------- /scripts/best_configs/common.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | function usage() { 4 | cat < 4 | #include 5 | 6 | #define SSE42 0x00100000 // ecx[bit 20] 7 | #define AVX 0x10000000 // ecx[bit 28] 8 | #define AVX2 0x00000020 // ebx[bit 5] 9 | #define AVX512F 0x00010000 // ebx[bit 16] 10 | #define AVX512BW 0x40000000 // ebx[bit 30] 11 | #define AVX512VNNI 0x00000800 // ecx[bit 11] 12 | 13 | enum { 14 | isa_undef = 0, 15 | sse42, 16 | avx, 17 | avx2, 18 | avx512_common, 19 | avx512_core, 20 | avx512_core_vnni 21 | }; 22 | 23 | struct cpuid_regs { 24 | unsigned int eax; 25 | unsigned int ebx; 26 | unsigned int ecx; 27 | unsigned int edx; 28 | }; 29 | 30 | static cpuid_regs regs0, regs1; 31 | static void __attribute__ ((constructor)) get_cpuinfo(void) { 32 | __cpuid(0x1, regs0.eax, regs0.ebx, regs0.ecx, regs0.edx); 33 | __cpuid_count(0x7, 0, regs1.eax, regs1.ebx, regs1.ecx, regs1.edx); 34 | } 35 | 36 | static inline bool cpu_has(int isa) 37 | { 38 | switch (isa) { 39 | case sse42: 40 | return !!(regs0.ecx & SSE42); 41 | case avx: 42 | return !!(regs0.ecx & AVX); 43 | case avx2: 44 | return 
!!(regs1.ebx & AVX2); 45 | case avx512_common: 46 | return !!(regs1.ebx & AVX512F); 47 | case avx512_core: 48 | return !!(regs1.ebx & AVX512F) && !!(regs1.ebx & AVX512BW); 49 | #if defined(WITH_VNNI) 50 | case avx512_core_vnni: 51 | return !!(regs1.ebx & AVX512F) && !!(regs1.ebx & AVX512BW) 52 | && !!(regs1.ecx & AVX512VNNI); 53 | #endif 54 | default: 55 | return false; 56 | } 57 | return false; 58 | } 59 | 60 | // CPU V length by byte 61 | static inline int cpu_vector_length() { 62 | if (cpu_has(avx512_common)) 63 | return 64; // zmm 64 | else if (cpu_has(avx)) 65 | return 32; // ymm 66 | else if (cpu_has(sse42)) 67 | return 16; // xmm 68 | else 69 | return 8; 70 | } 71 | -------------------------------------------------------------------------------- /scripts/best_configs/googlenetv3-n1-8180-1s-wino-f2_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # googlenet_v3 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # googlenet_v3:conv_1_1_conv2d, ['ic', '32', 'ih', '149', 'oc', '32', 'oh', '147', 'kh', '3'] 10 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h149 -o32 -H147 -n1 --blk-i=2 --blk-o=2 --flt-t=28 --tile-size=4 --execution-mode=0xa061 -p0 -P0 $COMMON 11 | sleep 1 12 | # googlenet_v3:conv_2_2_conv2d, ['ic', '32', 'ih', '147', 'oc', '64', 'oh', '147', 'kh', '3', 'ph', '1'] 13 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h147 -o64 -H147 -n1 --blk-i=2 --blk-o=4 --flt-t=28 --tile-size=4 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # googlenet_v3:conv_4_4_conv2d, ['ic', '80', 'ih', '73', 'oc', '192', 'oh', '71', 'kh', '3'] 16 | NSOCKETS=1 ./scripts/run.sh -c -i80 -h73 -o192 -H71 -n1 --blk-i=5 --blk-o=2 --pat-o=3 --flt-t=24 --tile-size=4 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 17 | sleep 1 18 | # googlenet_v3:mixed_tower_1_conv_1_conv2d, ['ic', '64', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 19 | NSOCKETS=1 ./scripts/run.sh 
-c -i64 -h35 -o96 -H35 -n1 --blk-i=4 --blk-o=3 --flt-t=12 --pat-o=2 --tile-size=4 --execution-mode=0xa061 --output-as-blocked=true $COMMON 20 | sleep 1 21 | # googlenet_v3:mixed_tower_1_conv_2_conv2d, ['ic', '96', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 22 | NSOCKETS=1 ./scripts/run.sh -c -i96 -h35 -o96 -H35 -n1 --blk-i=6 --blk-o=1 --flt-t=12 --pat-o=2 --tile-size=4 --execution-mode=0xa061 --output-as-blocked=true $COMMON 23 | sleep 1 24 | # googlenet_v3:mixed_9_tower_1_conv_1_conv2d, ['ic', '448', 'ih', '8', 'oc', '384', 'oh', '8', 'kh', '3', 'ph', '1'] 25 | NSOCKETS=1 ./scripts/run.sh -c -i448 -h8 -o384 -H8 -n1 --blk-i=14 --blk-o=4 --flt-t=8 --tile-size=4 --execution-mode=0xa000 --streaming-input=0 $COMMON 26 | sleep 1 27 | -------------------------------------------------------------------------------- /scripts/best_configs/googlenetv3-n1-8180-1s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # googlenet_v3 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # googlenet_v3:conv_1_1_conv2d, ['ic', '32', 'ih', '149', 'oc', '32', 'oh', '147', 'kh', '3'] 10 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h149 -o32 -H147 -n1 --blk-i=2 --blk-o=2 --flt-t=22 --tile-size=5 --execution-mode=0xa061 -p0 -P0 $COMMON 11 | sleep 1 12 | # googlenet_v3:conv_2_2_conv2d, ['ic', '32', 'ih', '147', 'oc', '64', 'oh', '147', 'kh', '3', 'ph', '1'] 13 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h147 -o64 -H147 -n1 --blk-i=2 --blk-o=4 --flt-t=22 --tile-size=5 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # googlenet_v3:conv_4_4_conv2d, ['ic', '80', 'ih', '73', 'oc', '192', 'oh', '71', 'kh', '3'] 16 | NSOCKETS=1 ./scripts/run.sh -c -i80 -h73 -o192 -H71 -n1 --blk-i=5 --blk-o=2 --pat-o=3 --flt-t=21 --tile-size=5 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 17 | sleep 1 18 | # googlenet_v3:mixed_tower_1_conv_1_conv2d, ['ic', '64', 'ih', '35', 'oc', 
'96', 'oh', '35', 'kh', '3', 'ph', '1'] 19 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h35 -o96 -H35 -n1 --blk-i=4 --blk-o=3 --flt-t=11 --pat-o=2 --tile-size=5 --execution-mode=0xa061 --output-as-blocked=true $COMMON 20 | sleep 1 21 | # googlenet_v3:mixed_tower_1_conv_2_conv2d, ['ic', '96', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 22 | NSOCKETS=1 ./scripts/run.sh -c -i96 -h35 -o96 -H35 -n1 --blk-i=6 --blk-o=1 --flt-t=11 --pat-o=2 --tile-size=5 --execution-mode=0xa061 --output-as-blocked=true $COMMON 23 | sleep 1 24 | # googlenet_v3:mixed_9_tower_1_conv_1_conv2d, ['ic', '448', 'ih', '8', 'oc', '384', 'oh', '8', 'kh', '3', 'ph', '1'] 25 | NSOCKETS=1 ./scripts/run.sh -c -i448 -h8 -o384 -H8 -n1 --blk-i=14 --blk-o=4 --flt-t=9 --tile-size=5 --execution-mode=0xa000 --streaming-input=0 $COMMON 26 | sleep 1 27 | -------------------------------------------------------------------------------- /scripts/best_configs/googlenetv3-n1-8180-2s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # googlenet_v3 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # googlenet_v3:conv_1_1_conv2d, ['ic', '32', 'ih', '149', 'oc', '32', 'oh', '147', 'kh', '3'] 11 | NSOCKETS=2 ./scripts/run.sh -c -i32 -h149 -o32 -H147 -n1 --blk-i=2 --blk-o=2 --flt-t=22 --tile-size=5 --execution-mode=0xa061 -p0 -P0 $COMMON 12 | sleep 1 13 | # googlenet_v3:conv_2_2_conv2d, ['ic', '32', 'ih', '147', 'oc', '64', 'oh', '147', 'kh', '3', 'ph', '1'] 14 | NSOCKETS=2 ./scripts/run.sh -c -i32 -h147 -o64 -H147 -n1 --blk-i=2 --blk-o=4 --flt-t=22 --tile-size=5 --execution-mode=0xa061 $COMMON 15 | sleep 1 16 | # googlenet_v3:conv_4_4_conv2d, ['ic', '80', 'ih', '73', 'oc', '192', 'oh', '71', 'kh', '3'] 17 | NSOCKETS=2 ./scripts/run.sh -c -i80 -h73 -o192 -H71 -n1 --blk-i=5 --blk-o=6 --pat-o=2 --flt-t=21 --tile-size=5 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 18 | sleep 1 19 
| # googlenet_v3:mixed_tower_1_conv_1_conv2d, ['ic', '64', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 20 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h35 -o96 -H35 -n1 --blk-i=4 --blk-o=3 --flt-t=6 --pat-o=2 --tile-size=5 --execution-mode=0xa061 --output-as-blocked=true $COMMON 21 | sleep 1 22 | # googlenet_v3:mixed_tower_1_conv_2_conv2d, ['ic', '96', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 23 | NSOCKETS=2 ./scripts/run.sh -c -i96 -h35 -o96 -H35 -n1 --blk-i=6 --blk-o=1 --flt-t=6 --pat-o=2 --tile-size=5 --execution-mode=0xa061 --output-as-blocked=true $COMMON 24 | sleep 1 25 | # googlenet_v3:mixed_9_tower_1_conv_1_conv2d, ['ic', '448', 'ih', '8', 'oc', '384', 'oh', '8', 'kh', '3', 'ph', '1'] 26 | NSOCKETS=2 ./scripts/run.sh -c -i448 -h8 -o384 -H8 -n1 --blk-i=14 --blk-o=4 --flt-t=9 --tile-size=5 --execution-mode=0xa000 --streaming-input=0 $COMMON 27 | sleep 1 28 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-1s-int8.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 64 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | COMMON="$COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 -v0" 10 | 11 | # vgg19_conv1_2, 12 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=11 --tile-size=6 --execution-mode=0xa161 --pat-o=1 --output-as-blocked=true $COMMON 13 | 14 | sleep 1 15 | # vgg19_conv2_1, 16 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --blk-i=4 --blk-o=4 --flt-o=2 --flt-t=11 --tile-size=6 --execution-mode=0xa161 --pat-o=1 $COMMON 17 | 18 | sleep 1 19 | # vgg19_conv2_2, 20 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=11 --tile-size=6 --execution-mode=0xa161 --pat-o=2 $COMMON 21 | 22 | sleep 1 23 | # vgg19_conv3_1, 24 | NSOCKETS=1 ./scripts/run.sh -c
-i128 -h56 -o256 -H56 -n64 --blk-i=8 --blk-o=4 --flt-o=2 --flt-t=11 --pat-i=1 --pat-o=2 --tile-size=6 --execution-mode=0xa161 $COMMON 25 | 26 | sleep 1 27 | # vgg19_conv3_2, 28 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=11 --pat-i=1 --pat-o=4 --tile-size=6 --execution-mode=0xa161 $COMMON 29 | 30 | sleep 1 31 | # vgg19_conv4_1, 6.6T 32 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=11 --tile-size=6 --execution-mode=0xa161 --pat-o=8 $COMMON 33 | 34 | sleep 1 35 | # vgg19_conv4_2, 6.4T 36 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --blk-i=32 --blk-o=1 --flt-o=2 --flt-t=11 --tile-size=6 --execution-mode=0xa161 --pat-o=16 $COMMON 37 | 38 | sleep 1 39 | # vgg19_conv5_1, 5.2T 40 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n64 --blk-i=32 --blk-o=1 --flt-o=2 --flt-t=11 --tile-size=6 --execution-mode=0xa161 --pat-o=16 $COMMON 41 | -------------------------------------------------------------------------------- /scripts/best_configs/googlenetv3-n1-8180-1s-wino-f5_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # googlenet_v3 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # googlenet_v3:conv_1_1_conv2d, ['ic', '32', 'ih', '149', 'oc', '32', 'oh', '147', 'kh', '3'] 11 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h149 -o32 -H147 -n1 --blk-i=2 --blk-o=2 --flt-t=17 --tile-size=7 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 12 | sleep 1 13 | # googlenet_v3:conv_2_2_conv2d, ['ic', '32', 'ih', '147', 'oc', '64', 'oh', '147', 'kh', '3', 'ph', '1'] 14 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h147 -o64 -H147 -n1 --blk-i=2 --blk-o=4 --flt-t=17 --tile-size=7 --execution-mode=0xa061 --output-as-blocked=true $COMMON 15 | sleep 1 16 | # googlenet_v3:conv_4_4_conv2d, ['ic', '80', 'ih', '73', 'oc', '192', 'oh', '71', 'kh', '3'] 17 | 
NSOCKETS=1 ./scripts/run.sh -c -i80 -h73 -o192 -H71 -n1 --blk-i=5 --blk-o=2 --pat-o=3 --flt-t=9 --tile-size=7 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 18 | sleep 1 19 | # googlenet_v3:mixed_tower_1_conv_1_conv2d, ['ic', '64', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 20 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h35 -o96 -H35 -n1 --blk-i=4 --blk-o=3 --flt-t=13 --tile-size=7 --execution-mode=0xa000 $COMMON 21 | sleep 1 22 | # googlenet_v3:mixed_tower_1_conv_2_conv2d, ['ic', '96', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 23 | NSOCKETS=1 ./scripts/run.sh -c -i96 -h35 -o96 -H35 -n1 --blk-i=6 --blk-o=1 --flt-t=6 --tile-size=7 --execution-mode=0xa000 --output-as-blocked=true $COMMON 24 | sleep 1 25 | # googlenet_v3:mixed_9_tower_1_conv_1_conv2d, ['ic', '448', 'ih', '8', 'oc', '384', 'oh', '8', 'kh', '3', 'ph', '1'] 26 | NSOCKETS=1 ./scripts/run.sh -c -i448 -h8 -o384 -H8 -n1 --blk-i=7 --blk-o=6 --flt-t=4 --tile-size=7 --execution-mode=0xa000 --streaming-input=0 $COMMON 27 | sleep 1 28 | -------------------------------------------------------------------------------- /scripts/best_configs/googlenetv3-n1-8180-2s-wino-f5_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # googlenet_v3 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # googlenet_v3:conv_1_1_conv2d, ['ic', '32', 'ih', '149', 'oc', '32', 'oh', '147', 'kh', '3'] 11 | NSOCKETS=2 ./scripts/run.sh -c -i32 -h149 -o32 -H147 -n1 --blk-i=2 --blk-o=2 --flt-t=17 --tile-size=7 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 12 | sleep 1 13 | # googlenet_v3:conv_2_2_conv2d, ['ic', '32', 'ih', '147', 'oc', '64', 'oh', '147', 'kh', '3', 'ph', '1'] 14 | NSOCKETS=2 ./scripts/run.sh -c -i32 -h147 -o64 -H147 -n1 --blk-i=2 --blk-o=4 --flt-t=17 --tile-size=7 --execution-mode=0xa061 $COMMON 15 | sleep 1 16 | # googlenet_v3:conv_4_4_conv2d, ['ic', '80', 
'ih', '73', 'oc', '192', 'oh', '71', 'kh', '3'] 17 | NSOCKETS=2 ./scripts/run.sh -c -i80 -h73 -o192 -H71 -n1 --blk-i=5 --blk-o=1 --pat-o=12 --flt-t=9 --tile-size=7 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 18 | sleep 1 19 | # googlenet_v3:mixed_tower_1_conv_1_conv2d, ['ic', '64', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 20 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h35 -o96 -H35 -n1 --blk-i=4 --blk-o=3 --flt-t=6 --tile-size=7 --execution-mode=0xa000 --output-as-blocked=true $COMMON 21 | sleep 1 22 | # googlenet_v3:mixed_tower_1_conv_2_conv2d, ['ic', '96', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 23 | NSOCKETS=2 ./scripts/run.sh -c -i96 -h35 -o96 -H35 -n1 --blk-i=6 --blk-o=1 --flt-t=6 --tile-size=7 --execution-mode=0xa000 --output-as-blocked=true $COMMON 24 | sleep 1 25 | # googlenet_v3:mixed_9_tower_1_conv_1_conv2d, ['ic', '448', 'ih', '8', 'oc', '384', 'oh', '8', 'kh', '3', 'ph', '1'] 26 | NSOCKETS=2 ./scripts/run.sh -c -i448 -h8 -o384 -H8 -n1 --blk-i=7 --blk-o=6 --flt-t=4 --tile-size=7 --execution-mode=0xa000 --streaming-input=0 $COMMON 27 | sleep 1 28 | -------------------------------------------------------------------------------- /src/elx_conv_direct_1x1_bind.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_conv_direct_1x1.hpp" 4 | 5 | namespace euler { 6 | 7 | Template_elx_conv_direct_1x1_t void 8 | Instance_elx_conv_direct_1x1_t::bind_execute_functions() 9 | { 10 | #define BIND_KERNEL(S, F) \ 11 | gemm_kernel_binder::bind(O, T, func); 12 | 13 | auto bind_kernel = [&](int O, int T, 14 | gemm_kernel_binder::kgemm **func) { 15 | switch (xopt_) { 16 | case (a061p2): 17 | if (ep.input_fmt == nhwc) { 18 | if (ep.ws == 1) 19 | BIND_KERNEL(1, GKF_FCF) 20 | else if (ep.ws == 2) 21 | BIND_KERNEL(2, GKF_FCF) 22 | } else { 23 | BIND_KERNEL(1, GKF_CCC) 24 | } 25 | break; 26 | case (a061p1): 27 | if (ep.input_fmt == nhwc) 28 | BIND_KERNEL(1, 
GKF_FCF) 29 | else 30 | BIND_KERNEL(1, GKF_CCC) 31 | break; 32 | case (a061): 33 | BIND_KERNEL(1, GKF_CCD) 34 | break; 35 | case (a060): 36 | BIND_KERNEL(1, GKF_DCD) 37 | break; 38 | default: 39 | el_error("Unknown xopt"); 40 | break; 41 | } 42 | }; 43 | 44 | bind_kernel(ep.O, ep.T, &ker_gemm_I_O_T_); 45 | bind_kernel(ep.O, ep.Tr, &ker_gemm_I_O_Tr_); 46 | 47 | switch (xopt_) { 48 | case a060: 49 | execute_opt_ = &Instance_elx_conv_direct_1x1_t::__execute_a060; 50 | break; 51 | case a061: 52 | execute_opt_ = &Instance_elx_conv_direct_1x1_t::__execute_a061; 53 | break; 54 | case a061p1: 55 | execute_opt_ = &Instance_elx_conv_direct_1x1_t::__execute_a061p1; 56 | break; 57 | case a061p2: 58 | execute_opt_ = &Instance_elx_conv_direct_1x1_t::__execute_a061p2; 59 | break; 60 | default: 61 | el_error("Unimplemented xopt"); 62 | break; 63 | } 64 | } 65 | 66 | } // namespace euler 67 | -------------------------------------------------------------------------------- /scripts/best_configs/googlenetv3-n1-8180-1s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # googlenet_v3 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # googlenet_v3:conv_1_1_conv2d, ['ic', '32', 'ih', '149', 'oc', '32', 'oh', '147', 'kh', '3'] 11 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h149 -o32 -H147 -n1 --blk-i=2 --blk-o=2 --flt-t=25 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 12 | sleep 1 13 | # googlenet_v3:conv_2_2_conv2d, ['ic', '32', 'ih', '147', 'oc', '64', 'oh', '147', 'kh', '3', 'ph', '1'] 14 | NSOCKETS=1 ./scripts/run.sh -c -i32 -h147 -o64 -H147 -n1 --blk-i=2 --blk-o=4 --flt-t=28 --tile-size=6 --execution-mode=0xa061 $COMMON 15 | sleep 1 16 | # googlenet_v3:conv_4_4_conv2d, ['ic', '80', 'ih', '73', 'oc', '192', 'oh', '71', 'kh', '3'] 17 | NSOCKETS=1 ./scripts/run.sh -c -i80 -h73 -o192 -H71 -n1 --blk-i=5 --blk-o=2 --pat-o=3 --flt-t=12 
--tile-size=6 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 18 | sleep 1 19 | # googlenet_v3:mixed_tower_1_conv_1_conv2d, ['ic', '64', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 20 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h35 -o96 -H35 -n1 --blk-i=4 --blk-o=3 --flt-t=6 --pat-o=2 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true $COMMON 21 | sleep 1 22 | # googlenet_v3:mixed_tower_1_conv_2_conv2d, ['ic', '96', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 23 | NSOCKETS=1 ./scripts/run.sh -c -i96 -h35 -o96 -H35 -n1 --blk-i=6 --blk-o=3 --flt-t=3 --pat-o=2 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true $COMMON 24 | sleep 1 25 | # googlenet_v3:mixed_9_tower_1_conv_1_conv2d, ['ic', '448', 'ih', '8', 'oc', '384', 'oh', '8', 'kh', '3', 'ph', '1'] 26 | NSOCKETS=1 ./scripts/run.sh -c -i448 -h8 -o384 -H8 -n1 --blk-i=14 --blk-o=4 --flt-t=4 --tile-size=6 --execution-mode=0xa000 --streaming-input=1 $COMMON 27 | sleep 1 28 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-direct.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --f16c-opt=1" 9 | 10 | # resnet_50:conv1 11 | NSOCKETS=1 ./scripts/run.sh -c -i3 -h224 -o64 -H112 -k7 -K7 -s2 -S2 -p3 -P3 -n1 -adirect --execution-mode=0xc060 --blk-i=1 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=1 $COMMON --input-format=nchw --weights-format=hwio 12 | sleep 1 13 | # resnet50_res2a_branch2b, 5.2T 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 -adirect --execution-mode=0xc060 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 $COMMON 15 | sleep 1 16 | # resnet50_res3a_branch2b, 7.0 - 7.5T 17 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 -adirect --execution-mode=0xc070 --blk-i=2 --blk-o=1 --flt-o=2 
--flt-t=14 --pat-i=4 --pat-o=1 $COMMON 18 | sleep 1 19 | # resnet50_res4a_branch2b, 4.2T 20 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 -adirect --execution-mode=0xc070 --blk-i=4 --blk-o=1 --flt-o=1 --flt-t=14 --pat-i=4 $COMMON 21 | sleep 1 22 | # resnet50_res5a_branch2b, 2.5T 23 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 -adirect --execution-mode=0xc070 --blk-i=4 --blk-o=1 --flt-o=1 --flt-t=7 --pat-i=8 $COMMON 24 | sleep 1 25 | # resnet_50_sparse:res2c_branch2b 26 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H28 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc060 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON 27 | sleep 1 28 | # resnet_50_sparse:res3d_branch2b 29 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H14 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc070 --blk-i=2 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=4 $COMMON 30 | sleep 1 31 | # resnet_50_sparse:res4f_branch2b 32 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H7 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc070 --blk-i=2 --blk-o=1 --flt-o=1 --flt-t=7 --pat-i=8 $COMMON 33 | -------------------------------------------------------------------------------- /scripts/best_configs/googlenetv3-n1-8180-2s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # googlenet_v3 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # googlenet_v3:conv_1_1_conv2d, ['ic', '32', 'ih', '149', 'oc', '32', 'oh', '147', 'kh', '3'] 11 | NSOCKETS=2 ./scripts/run.sh -c -i32 -h149 -o32 -H147 -n1 --blk-i=2 --blk-o=2 --flt-t=25 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 12 | sleep 1 13 | # googlenet_v3:conv_2_2_conv2d, ['ic', '32', 'ih', '147', 'oc', '64', 'oh', '147', 'kh', '3', 'ph', '1'] 14 | NSOCKETS=2 ./scripts/run.sh -c -i32 -h147 -o64 -H147 -n1 --blk-i=2 --blk-o=4 --flt-t=28 --tile-size=6 --execution-mode=0xa061 $COMMON 15 | sleep 
1 16 | # googlenet_v3:conv_4_4_conv2d, ['ic', '80', 'ih', '73', 'oc', '192', 'oh', '71', 'kh', '3'] 17 | NSOCKETS=2 ./scripts/run.sh -c -i80 -h73 -o192 -H71 -n1 --blk-i=5 --blk-o=3 --pat-o=2 --flt-t=12 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true -p0 -P0 $COMMON 18 | sleep 1 19 | # googlenet_v3:mixed_tower_1_conv_1_conv2d, ['ic', '64', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 20 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h35 -o96 -H35 -n1 --blk-i=4 --blk-o=3 --flt-t=3 --pat-o=2 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true $COMMON 21 | sleep 1 22 | # googlenet_v3:mixed_tower_1_conv_2_conv2d, ['ic', '96', 'ih', '35', 'oc', '96', 'oh', '35', 'kh', '3', 'ph', '1'] 23 | NSOCKETS=2 ./scripts/run.sh -c -i96 -h35 -o96 -H35 -n1 --blk-i=6 --blk-o=3 --flt-t=3 --pat-o=2 --tile-size=6 --execution-mode=0xa061 --output-as-blocked=true $COMMON 24 | sleep 1 25 | # googlenet_v3:mixed_9_tower_1_conv_1_conv2d, ['ic', '448', 'ih', '8', 'oc', '384', 'oh', '8', 'kh', '3', 'ph', '1'] 26 | NSOCKETS=2 ./scripts/run.sh -c -i448 -h8 -o384 -H8 -n1 --blk-i=14 --blk-o=24 --flt-t=4 --tile-size=6 --execution-mode=0xa000 --streaming-input=1 $COMMON 27 | sleep 1 28 | -------------------------------------------------------------------------------- /src/elx_int8_conv_direct_depthwise_bind.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_int8_conv_direct_depthwise.hpp" 4 | 5 | namespace euler { 6 | 7 | Template_elx_int8_conv_direct_depthwise_t void 8 | Instance_elx_int8_conv_direct_depthwise_t::bind_execute_functions() 9 | { 10 | #define BIND_CONV_KERNEL(S, F, K) \ 11 | if (K == 3) { \ 12 | u8s8_depthwise_conv_kernel_binder::bind(O, T, func); \ 13 | } 14 | 15 | auto bind_conv_kernel = [&](int O, int T, 16 | u8s8_depthwise_conv_kernel_binder::kconv **func, 17 | int K) { 18 | switch (xopt_) { 19 | case (0xc160): 20 | if (ep.input_fmt == nChw16c && ep.output_fmt == nChw16c) { 21 | 
if (ep.ws == 1) { 22 | BIND_CONV_KERNEL(1, GKF_DCD, K); 23 | } else if (ep.ws == 2) { 24 | BIND_CONV_KERNEL(2, GKF_DCD, K); 25 | } else { 26 | el_error("Stride > 2 not yet bounded"); 27 | } 28 | } else { 29 | el_error("direct_depthwise: int8: kernel fmt not supported"); 30 | } 31 | break; 32 | default: 33 | el_error("Unknown xopt"); 34 | break; 35 | } 36 | }; 37 | 38 | if (xopt_ == 0xc160) { 39 | bind_conv_kernel(ep.O, ep.T, &ker_conv_, ep.kw); 40 | bind_conv_kernel(ep.O, ep.Tr, &ker_conv_Tr_, ep.kw); 41 | } 42 | 43 | #define EXECUTE_CASE(n) \ 44 | case 0x##n: \ 45 | execute_opt_ = &Instance_elx_int8_conv_direct_depthwise_t::__execute_##n; \ 46 | break 47 | 48 | switch (xopt_) { 49 | EXECUTE_CASE(c160); 50 | default: 51 | el_error("direct_depthwise: int8: Unimplemented xopt"); 52 | break; 53 | } 54 | } 55 | 56 | } // namespace euler 57 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-2s-wino-f2_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 64 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # vgg19_conv1_2, 12.2T tflops=10.5702 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --tile-size=4 --blk-i=4 --blk-o=4 --flt-t=28 --tile-size=4 --execution-mode=0xa061 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # vgg19_conv2_1, 13.3T tflops=11.0838 13 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --tile-size=4 --blk-i=4 --blk-o=8 --flt-t=30 --tile-size=4 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # vgg19_conv2_2, 14.2T tflops=11.572 16 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --tile-size=4 --blk-i=8 --blk-o=8 --flt-t=27 --tile-size=4 --execution-mode=0xa061 $COMMON 17 | sleep 1 18 | # vgg19_conv3_1, 13.9T tflops=12.0013 19 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n64 --tile-size=4 --blk-i=8 --blk-o=16 --flt-t=29 
--pat-o=1 --tile-size=4 --execution-mode=0xa061 $COMMON 20 | sleep 1 21 | # vgg19_conv3_2, 11.2T tflops=12.0409 22 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --tile-size=4 --blk-i=8 --blk-o=8 --flt-t=29 --pat-o=1 --tile-size=4 --execution-mode=0xa061 $COMMON 23 | sleep 1 24 | # vgg19_conv4_1, 10.0T tflops=8.37297 25 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --tile-size=4 --blk-i=8 --blk-o=16 --flt-t=22 --tile-size=4 --execution-mode=0xa000 --streaming-input=2 $COMMON 26 | sleep 1 27 | # vgg19_conv4_2, 10.8T tflops=9.77669 28 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --tile-size=4 --blk-i=8 --blk-o=32 --flt-t=23 --tile-size=4 --execution-mode=0xa000 --streaming-input=2 $COMMON 29 | sleep 1 30 | # vgg19_conv5_1, 10.6T tflops=9.47513 31 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n64 --tile-size=4 --blk-i=8 --blk-o=8 --flt-t=25 --tile-size=4 --execution-mode=0xa000 --streaming-input=2 $COMMON 32 | -------------------------------------------------------------------------------- /src/kernel/elk_def.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __ELK_DEF_HPP__ 2 | #define __ELK_DEF_HPP__ 3 | 4 | #include "el_utils.hpp" 5 | 6 | namespace euler { 7 | 8 | // Transform kernel format 9 | // C: compact 10 | // Input: A * A * V 11 | // Output: (A - K + 1) * (A - K + 1) * V 12 | // D: blocked 13 | // Input: I2, ih, iw, V, Vx 14 | // Output: O1, O, oh, ow, V 15 | // E: nchw 16 | // Input: I2, V, ih, iw 17 | // Output: O2, V, ih, iw 18 | // F: nhwc 19 | // Input: ih, iw, I2, V 20 | // Output: oh, ow, O1, O, V 21 | const int TKF_COMPACT = 0xC; 22 | const int TKF_BLOCKED = 0xD; 23 | const int TKF_NCHW = 0xE; 24 | const int TKF_NHWC = 0xF; 25 | 26 | // GEMM kernel format 27 | // Input - weights - output 28 | // B: compact b 29 | // Weights: O1, Ir, O, oV 30 | // C: compact c 31 | // Input: I2, T, S, V, Vx 32 | // Weights: O1, I2, V, O, V, Vx 33 | // Output: O1, O, T, V 
34 | // shift: O1, O, V 35 | // weights_scale: O1, O, V 36 | // D: blocked 37 | // Input: I2, ih, iw, V, Vx 38 | // Weights: O1, O, ic2, V, V, Vx 39 | // Output: O1, O, oh, ow, V 40 | // shift: O1, O, V 41 | // weights_scale: O1, O, V 42 | // E: nchw 43 | // Input: I2, V, ih, iw 44 | // F: nhwc 45 | // Input: ih, iw, I2, V 46 | // Output: oh, ow, O1, O, V 47 | const int GKF_CCC = 0xccc; 48 | const int GKF_CCD = 0xccd; 49 | const int GKF_DCD = 0xdcd; 50 | const int GKF_DDD = 0xddd; 51 | const int GKF_EBD = 0xebd; 52 | const int GKF_FCF = 0xfcf; 53 | const int GKF_FBD = 0xfbd; 54 | const int GKF_FBF = 0xfbf; 55 | const int GKF_DCF = 0xdcf; 56 | const int GKF_FCD = 0xfcd; 57 | 58 | // Conv padding: 59 | // symmetric-padding: pl = pr 60 | // lean-right-padding: pl = 2, pr = 3 61 | // lean-left-padding: pl = 3, pr = 2 62 | const int GKP_LLP_MASK = (0x1 << 7); 63 | const int GKP_S_MASK = ((0x1 << 7) - 1); 64 | 65 | const int S2_LLP = 2 | GKP_LLP_MASK; 66 | 67 | } // namespace euler 68 | 69 | #endif // __ELK_DEF_HPP__ 70 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-direct-int8.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b, 5.2T 10 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 -adirect --execution-mode=0xc160 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=8 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 11 | sleep 1 12 | # resnet50_res3a_branch2b, 7.0 - 7.5T 13 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 -adirect --execution-mode=0xc160 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=11 --pat-i=1 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 14 | #sleep 1 15 | # resnet50_res4a_branch2b, 4.2T 16 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 -adirect
--execution-mode=0xc160 --blk-i=16 --blk-o=1 --flt-o=1 --flt-t=14 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 17 | #sleep 1 18 | # resnet50_res5a_branch2b, 2.5T 19 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 -adirect --execution-mode=0xc160 --blk-i=32 --blk-o=1 --flt-o=1 --flt-t=7 --pat-i=1 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 20 | #sleep 1 21 | # resnet_50_sparse:res2c_branch2b 22 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H28 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc160 --blk-i=4 --blk-o=1 --flt-o=1 --flt-t=14 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 23 | sleep 1 24 | # resnet_50_sparse:res3d_branch2b 25 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H14 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc160 --blk-i=8 --blk-o=1 --flt-o=1 --flt-t=14 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 26 | #sleep 1 27 | # resnet_50_sparse:res4f_branch2b 28 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H7 -k3 -s2 -S2 -n1 -adirect --execution-mode=0xc160 --blk-i=16 --blk-o=1 --flt-o=1 --flt-t=14 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 29 | -------------------------------------------------------------------------------- /src/elx_int8_conv_direct_1x1_bind.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_int8_conv_direct_1x1.hpp" 4 | 5 | namespace euler { 6 | 7 | Template_elx_int8_conv_direct_1x1_t void 8 | Instance_elx_int8_conv_direct_1x1_t::bind_execute_functions() { 9 | #define BIND_KERNEL(S, F) u8s8_gemm_kernel_binder::bind(O, T, func); 10 | 11 | auto bind_kernel = [&](int O, int T, 12 | u8s8_gemm_kernel_binder::kgemm **func) { 14 | if (ep.ws == 1) { 15 | if (ep.input_fmt == nChw16c && ep.output_fmt == nChw16c) 16 | BIND_KERNEL(1, GKF_DCD) 17 | else if (ep.input_fmt == nhwc && ep.output_fmt == nChw16c) 18 | BIND_KERNEL(1, GKF_FCD) 19 | else if (ep.input_fmt == 
nChw16c && ep.output_fmt == nhwc) 20 | BIND_KERNEL(1, GKF_DCF) 21 | else 22 | BIND_KERNEL(1, GKF_FCF) 23 | } else if (ep.ws == 2) { 24 | if (ep.input_fmt == nChw16c && ep.output_fmt == nChw16c) { 25 | BIND_KERNEL(2, GKF_DCD) 26 | } else if (ep.input_fmt == nhwc && ep.output_fmt == nChw16c) { 27 | BIND_KERNEL(2, GKF_FCD) 28 | } else if (ep.input_fmt == nChw16c && ep.output_fmt == nhwc) { 29 | BIND_KERNEL(2, GKF_DCF) 30 | } else { // nhwc -> nhwc 31 | BIND_KERNEL(2, GKF_FCF) 32 | } 33 | } else { 34 | el_error("ws > 2 not enabled"); 35 | } 36 | }; 37 | 38 | bind_kernel(ep.O, ep.T, &ker_u8s8_gemm_I_O_T_); 39 | bind_kernel(ep.O, ep.Tr, &ker_u8s8_gemm_I_O_Tr_); 40 | 41 | #define EXECUTE_CASE(n) \ 42 | case 0x##n: \ 43 | execute_opt_ = &Instance_elx_int8_conv_direct_1x1_t::__execute_##n; \ 44 | break 45 | 46 | switch (xopt_) { 47 | EXECUTE_CASE(a160); 48 | default: 49 | el_error("Unimplemented xopt"); 50 | break; 51 | } 52 | } 53 | 54 | } // namespace euler 55 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | Wino: 2 | - Tile-size: 6 3 | 4 | Blocking: 5 | - tail oc/ic handling 6 | 7 | Build: 8 | - arch-specific compiling option 9 | 10 | Fusion: 11 | - relu 12 | - sum 13 | 14 | OMP: 15 | - kmp_malloc(?) allocate from thread-local heap 16 | - omp_in_final() 17 | - omp_get_num_procs 18 | 19 | Modularity (2018-07-05): Done 20 | - trans_input/trans_output: common function for offset computing 21 | - code reorg, break transform kernels into small kernel files: 22 | elk_conv_wino_4x4_3x3_input.hxx 23 | ... 
24 | - Memory size calculation for different xopt 25 | 26 | 27 | Plain format (nchw/oihw) support (2018-7-11): Done 28 | - fused reorder 29 | - scatter/gather 30 | - double buffer (drop as no effect) 31 | - enable plain fmt for 0xa0e0/0xa0e1 32 | - enable plain fmt for 0xa073 33 | - fix performance regression in block16: format as template parameter 34 | - format-as-blocked (esp. to avoid false sharing in fused output reorder) 35 | 36 | 37 | IC/OC != 16x (2018-7-16): Done 38 | - Support DNN first layer (drop as it can not achieve good performance with Winograd) 39 | IC < 16, OC = 16x, nchw + Oihw16o => nChw16o 40 | - Support blocked format with padded tensor 41 | IC|OC != 16x, nChw16c + OIhw16i16o => nChw16c 42 | - Support plain format in format-as path 43 | IC|OC != 16x, nchw + oihw => nchw 44 | - Support plain format in format-is path 45 | IC|OC != 16x, nchw + oihw => nchw 46 | 47 | MD-Array (2018-7-22): Done 48 | - Cross platform/compiler MD-array 49 | - Improve MD-Array performance for ICC 50 | 51 | Conv_1x1 (2018-8-1): Done 52 | - Uni-stride: kernel=1x1, stride=1, padding=0 53 | - Stride=2, padding=0 54 | - Blocked format 55 | - plain format, IC/OC != 16x 56 | - TODO: code clean and Perf tuning 57 | - TODO: padding support 58 | 59 | GEMM kernel (2018-8-23): Done 60 | - Rewrite gemm kernel with better readability and modularity 61 | - Apply new gemm kernel to conv1x1 62 | - Apply new gemm kernel to Winograd 63 | - Perf tuning for Winograd: xopt-A072 64 | 65 | - Refactoring elx_conv_wino_t 66 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-2s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 1 5 | # SKX 8180 2S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # vgg19_conv1_2, 12.8T 10 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=4 --flt-t=17 --tile-size=5 
--execution-mode=0xa061 --output-as-blocked=true $COMMON 11 | sleep 1 12 | # vgg19_conv2_1, 13.5T 13 | NSOCKETS=2 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=8 --flt-t=26 --tile-size=5 --execution-mode=0xa061 $COMMON 14 | sleep 1 15 | # vgg19_conv2_2, 14.3T 16 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=8 --flt-t=26 --tile-size=5 --execution-mode=0xa061 $COMMON 17 | sleep 1 18 | # vgg19_conv3_1, 13.0T 19 | NSOCKETS=2 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=4 --flt-t=26 --pat-o=4 --tile-size=5 --execution-mode=0xa061 $COMMON 20 | sleep 1 21 | # vgg19_conv3_2, 12.6T 22 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=4 --flt-t=26 --pat-o=4 --tile-size=5 --execution-mode=0xa061 $COMMON 23 | sleep 1 24 | # vgg19_conv4_1, 9.5T 25 | # NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=15 --pat-o=8 --tile-size=5 --execution-mode=0xa061 $COMMON 26 | # vgg19_conv4_1, 9.7T 27 | # NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=8 --flt-t=15 --pat-i=2 --pat-o=4 --tile-size=5 --execution-mode=0xa073 $COMMON 28 | # vgg19_conv4_1, 10.2T 29 | NSOCKETS=2 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=25 --tile-size=5 --execution-mode=0xa000 --streaming-input=1 $COMMON 30 | sleep 1 31 | # vgg19_conv4_2, 10.5T 32 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=26 --tile-size=5 --execution-mode=0xa000 --streaming-input=1 $COMMON 33 | sleep 1 34 | # vgg19_conv5_1, 7.3T 35 | NSOCKETS=2 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=4 --flt-t=25 --tile-size=5 --execution-mode=0xa000 $COMMON 36 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-1s-direct.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # 
batch-size: 64 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # vgg19_conv1_1, 10 | NSOCKETS=1 ./scripts/run.sh -c -i3 -h224 -o64 -H224 -n64 --blk-i=1 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON --input-format=nchw --weights-format=hwio 11 | 12 | sleep 1 13 | # vgg19_conv1_2, 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON 15 | 16 | sleep 1 17 | # vgg19_conv2_1, 18 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON 19 | 20 | sleep 1 21 | # vgg19_conv2_2, 22 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=1 --pat-o=1 $COMMON 23 | 24 | sleep 1 25 | # vgg19_conv3_1, 26 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n64 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 -adirect --execution-mode=0xc060 $COMMON 27 | 28 | sleep 1 29 | # vgg19_conv3_2, 30 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=2 --pat-o=1 -adirect --execution-mode=0xc060 $COMMON 31 | 32 | sleep 1 33 | # vgg19_conv4_1, 6.6T 34 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=2 --pat-o=1 $COMMON 35 | 36 | sleep 1 37 | # vgg19_conv4_2, 6.4T 38 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --blk-i=16 --blk-o=1 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=2 --pat-o=1 $COMMON 39 | 40 | sleep 1 41 | # vgg19_conv5_1, 5.2T 42 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n64 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=2 --pat-o=1 $COMMON --f16c-opt=1 43 |
-------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s-direct.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --f16c-opt=0" 9 | 10 | # vgg19_conv1_1, 11 | NSOCKETS=1 ./scripts/run.sh -c -i3 -h224 -o64 -H224 -n1 --blk-i=1 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON --input-format=nchw --weights-format=hwio 12 | 13 | sleep 1 14 | # vgg19_conv1_2, 15 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON 16 | 17 | sleep 1 18 | # vgg19_conv2_1, 19 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON 20 | 21 | sleep 1 22 | # vgg19_conv2_2, 23 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=2 --pat-o=1 $COMMON 24 | 25 | sleep 1 26 | # vgg19_conv3_1, 27 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=2 --pat-o=1 -adirect --execution-mode=0xc060 $COMMON 28 | 29 | sleep 1 30 | # vgg19_conv3_2, 31 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=4 --pat-o=1 -adirect --execution-mode=0xc060 $COMMON 32 | 33 | sleep 1 34 | # vgg19_conv4_1, 6.6T 35 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=2 --blk-o=1 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=8 --pat-o=1 $COMMON 36 | 37 | sleep 1 38 | # vgg19_conv4_2, 6.4T 39 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=8 --pat-o=1 
$COMMON 40 | 41 | sleep 1 42 | # vgg19_conv5_1, 5.2T 43 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=2 --blk-o=1 --flt-o=1 --flt-t=14 -adirect --execution-mode=0xc060 --pat-i=16 --pat-o=1 $COMMON 44 | -------------------------------------------------------------------------------- /docs/fusion.txt: -------------------------------------------------------------------------------- 1 | ## winograd 2 | +------------+-------------------------------+-------------------------------------+ 3 | | winograd | Blocked format | Plain format | 4 | +------------+---------+---------------------+---------+---------------------------+ 5 | | Fusion type| Support | Fusion point | Support | Fusion point | 6 | +------------+---------+---------------------+---------+---------------------------+ 7 | | ReLU | Y | Trans output kernel | Y | Trans output kernel | 8 | +------------+---------+---------------------+---------+---------------------------+ 9 | | Sum(IP) | Y | Trans output kernel | Y | Trans output(after kernel)| 10 | +------------+---------+---------------------+---------+---------------------------+ 11 | | Sum+ReLU | Y | Trans output kernel | N | | 12 | +------------+---------+---------------------+---------+---------------------------+ 13 | 14 | ## Conv1x1 15 | +------------+-------------------------------+-----------------------------------------+ 16 | | Conv 1x1 | Blocked format | Plain format | 17 | +------------+---------+---------------------+---------+-------------------------------+ 18 | | Fusion type| Support | Fusion point | Support | Fusion point | 19 | +------------+---------+---------------------+---------+-------------------------------+ 20 | | ReLU | Y | OTJ kernel | Y | OTJ kernel | 21 | +------------+---------+---------------------+---------+-------------------------------+ 22 | | Sum(IP) | Y | OTJ kernel | Y | Trans output(after OTJ kernel)| 23 | +------------+---------+---------------------+---------+-------------------------------+ 24 | | 
Sum+ReLU | Y | OTJ kernel | N | | 25 | +------------+---------+---------------------+---------+-------------------------------+ 26 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s-wino-f2_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # vgg19_conv1_2, 7.0T tflops= 5.50285 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --tile-size=4 --pat-o=1 --execution-mode=0xa061 --output-as-blocked=true $COMMON 12 | sleep 1 13 | # vgg19_conv2_1, 7.3T tflops=5.85272 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --tile-size=4 --pat-o=4 --execution-mode=0xa061 $COMMON 15 | sleep 1 16 | # vgg19_conv2_2, 7.4T tflops=5.92653 17 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=4 --tile-size=4 --execution-mode=0xa061 -x1 $COMMON 18 | sleep 1 19 | # vgg19_conv3_1, 7.3T tflops=6.08779 20 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=8 --tile-size=4 --execution-mode=0xa061 --output-as-blocked=true $COMMON 21 | sleep 1 22 | # vgg19_conv3_2, 7.3T tflops=6.33066 23 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=8 --tile-size=4 --execution-mode=0xa061 $COMMON 24 | sleep 1 25 | # vgg19_conv4_1, 6.6T tflops=6.13859 26 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=16 --flt-t=28 --tile-size=4 --execution-mode=0xa000 --streaming-input=1 $COMMON 27 | sleep 1 28 | # vgg19_conv4_2, 6.4T tflops=6.11829 29 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=8 --flt-t=25 --tile-size=4 --execution-mode=0xa000 
--streaming-input=1 $COMMON 30 | sleep 1 31 | # vgg19_conv5_1, 5.2T tflops=4.81118 32 | #NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=8 --flt-t=25 --tile-size=4 --execution-mode=0xa000 $COMMON 33 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=4 --flt-o=1 --flt-t=26 --tile-size=4 --execution-mode=0xa033 --pat-o=1 $COMMON 34 | 35 | -------------------------------------------------------------------------------- /scripts/best_configs/resnet-n64-8180-1s-direct-int8.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 64 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON " 9 | 10 | NSOCKETS=1 ./scripts/run.sh -c -i3 -h224 -o64 -H112 -k7 -K7 -s2 -S2 -p3 -P3 -n64 -adirect --execution-mode=0xc160 --blk-i=1 --blk-o=2 --flt-o=2 --flt-t=14 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32z --sampling-kind=2 --input-format=nhwc --output-format=nChw16c --weights-format=OIhw16i16o 11 | # resnet50_res2a_branch2b, 5.2T 12 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n64 -adirect --execution-mode=0xc160 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 13 | sleep 1 14 | # resnet50_res3a_branch2b, 7.0 - 7.5T 15 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n64 -adirect --execution-mode=0xc160 --blk-i=8 --blk-o=4 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 16 | sleep 1 17 | # resnet50_res4a_branch2b, 4.2T 18 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n64 -adirect --execution-mode=0xc160 --blk-i=16 --blk-o=4 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 19 | sleep 1 20 | # resnet50_res5a_branch2b, 2.5T 21 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n64 -adirect --execution-mode=0xc160 --blk-i=32 --blk-o=1 --flt-o=2 --flt-t=7 
--pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 22 | sleep 1 23 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H28 -k3 -s2 -S2 -n64 -adirect --execution-mode=0xc160 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 24 | sleep 1 25 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H14 -k3 -s2 -S2 -n64 -adirect --execution-mode=0xc160 --blk-i=8 --blk-o=4 --flt-o=2 --flt-t=7 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 26 | sleep 1 27 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H7 -k3 -s2 -S2 -n64 -adirect --execution-mode=0xc160 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=7 --pat-i=1 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 28 | -------------------------------------------------------------------------------- /src/kernel/elk_u8s8_depthwise_conv_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build conv kernel instantiation 4 | # 5 | 6 | src_file=$1; dst_dir=$2; cc=$3; enable_user_fp16=$4 7 | 8 | if [ ! -f $src_file ] || [ ! 
-d $dst_dir ]; then 9 | echo "Invalid src_file=$src_file or dst_dir=$dst_dir" >&2 10 | exit -1 11 | fi 12 | 13 | __u8s8_depthwise_kconv_generate_inst__() { 14 | ktype=$1; dtype=$2; otype=$3; V=$4; Vx=$5; I=$6; S=$7; F=$8; 15 | 16 | cat <<@ > $dst_dir/elk_${ktype}_${dtype}_${otype}_${V}_${Vx}_${I}_${S}_${F}.cpp 17 | // _generated_kernel_file_ 18 | // 19 | #include "$src_file" 20 | 21 | using namespace euler; 22 | 23 | namespace euler { 24 | 25 | #undef E 26 | #define E(O, T, K) \\ 27 | ${ktype}_kernel_binder::conv_ker_cls::conv 29 | ${ktype}_kernel_binder::kconv 30 | *${ktype}_kernel_binder::kconv_${dtype}_${otype}_${V}_${Vx}_${I}_${S}_${F}[1][32][1] = 31 | { // 8 32 | { // 32 33 | { E(1, 1, 3) }, 34 | { E(1, 2, 3) }, 35 | { E(1, 3, 3) }, 36 | { E(1, 4, 3) }, 37 | { E(1, 5, 3) }, 38 | { E(1, 6, 3) }, 39 | { E(1, 7, 3) }, 40 | { E(1, 8, 3) }, 41 | { E(1, 9, 3) }, 42 | { E(1, 10, 3) }, 43 | { E(1, 11, 3) }, 44 | { E(1, 12, 3) }, 45 | { E(1, 13, 3) }, 46 | { E(1, 14, 3) }, 47 | { E(1, 15, 3) }, 48 | { E(1, 16, 3) }, 49 | { E(1, 17, 3) }, 50 | { E(1, 18, 3) }, 51 | { E(1, 19, 3) }, 52 | { E(1, 20, 3) }, 53 | { E(1, 21, 3) }, 54 | { E(1, 22, 3) }, 55 | { E(1, 23, 3) }, 56 | { E(1, 24, 3) }, 57 | { E(1, 25, 3) }, 58 | { E(1, 26, 3) }, 59 | { E(1, 27, 3) }, 60 | { E(1, 28, 3) }, 61 | { E(1, 29, 3) }, 62 | { E(1, 30, 3) }, 63 | { E(1, 31, 3) }, 64 | }, 65 | }; 66 | 67 | } // namespace 68 | @ 69 | } 70 | 71 | if [ $enable_user_fp16 == "ON" ]; then 72 | eval $($cc -DENABLE_USER_FP16 -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 73 | else 74 | eval $($cc -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 75 | fi 76 | -------------------------------------------------------------------------------- /scripts/best_configs/conv-1x1-int8.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ./scripts/best_configs/common.sh $@ 4 | COMMON="$COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 " 5 | 6 | # clx low 
bin, 28c 7 | 8 | # direct 1x1, s=1 9 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H28 -W28 -k1 -K1 -s1 -S1 -p0 -P0 -n1 -adirect_1x1 --execution-mode=0xa160 --blk-i=16 --blk-o=2 --pat-o=1 --flt-o=2 --flt-t=14 --pat-i=4 $COMMON 10 | # 10.8 tflops, ref (7.6) 11 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H28 -W28 -k1 -K1 -s1 -S1 -p0 -P0 -n64 -adirect_1x1 --execution-mode=0xa160 --blk-i=64 --blk-o=16 --pat-o=1 --flt-o=2 --flt-t=12 --pat-i=1 $COMMON 12 | # 10.9 tflops(12) 13 | 14 | # direct, s=1 15 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H28 -W28 -k1 -K1 -s1 -S1 -p0 -P0 -n1 -adirect --execution-mode=0xa160 --blk-i=16 --blk-o=1 --pat-o=2 --flt-o=2 --flt-t=14 --pat-i=4 $COMMON 16 | # 10.0, ref (7.6tflops) 17 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H28 -W28 -k1 -K1 -s1 -S1 -p0 -P0 -n64 -adirect --execution-mode=0xa160 --blk-i=64 --blk-o=4 --pat-o=2 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON 18 | # 9.22 tflops, ref(12 tflops) 19 | 20 | # direct 1x1, s=2 21 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H14 -W14 -k1 -K1 -s2 -S2 -p0 -P0 -n1 -adirect_1x1 --execution-mode=0xa160 --blk-i=32 --blk-o=2 --pat-o=1 --flt-o=2 --flt-t=14 --pat-i=2 $COMMON 22 | # 7.0 tflops, ref (7.7 tflops) 23 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H14 -W14 -k1 -K1 -s2 -S2 -p0 -P0 -n64 -adirect_1x1 --execution-mode=0xa160 --blk-i=64 --blk-o=2 --pat-o=1 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON 24 | # 10.03 tflops. 
ref (9.7 tflops) 25 | 26 | # direct s=2 27 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H14 -W14 -k1 -K1 -s2 -S2 -p0 -P0 -n1 -adirect --execution-mode=0xa160 --blk-i=16 --blk-o=2 --pat-o=1 --flt-o=2 --flt-t=14 --pat-i=4 $COMMON 28 | # 6.64 (7.7) 29 | NSOCKETS=1 ./scripts/run.sh -c -i1024 -h28 -w28 -o2048 -H14 -W14 -k1 -K1 -s2 -S2 -p0 -P0 -n64 -adirect --execution-mode=0xa160 --blk-i=64 --blk-o=2 --pat-o=1 --flt-o=2 --flt-t=14 --pat-i=1 $COMMON 30 | # 8.9 (9.7) 31 | 32 | 33 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s-wino-f3_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # vgg19_conv1_2, 7.0T 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --tile-size=5 --execution-mode=0xa061 --output-as-blocked=true $COMMON 12 | sleep 1 13 | # vgg19_conv2_1, 7.3T 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=13 --tile-size=5 --pat-o=4 --execution-mode=0xa061 $COMMON 15 | sleep 1 16 | # vgg19_conv2_2, 7.4T 17 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=13 --tile-size=5 --execution-mode=0xa061 --pat-o=4 $COMMON 18 | sleep 1 19 | # vgg19_conv3_1, 7.3T 20 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=13 --tile-size=5 --pat-o=8 --execution-mode=0xa061 --output-as-blocked=true $COMMON 21 | sleep 1 22 | # vgg19_conv3_2, 7.3T 23 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=13 --pat-i=1 --pat-o=8 --tile-size=5 --execution-mode=0xa061 $COMMON 24 | sleep 1 25 | # vgg19_conv4_1, 5.6T 26 | # NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=8 
--flt-t=15 --pat-o=4 --tile-size=5 --execution-mode=0xa061 $COMMON 27 | # vgg19_conv4_1, 5.3T 28 | # NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=8 --flt-t=28 --pat-o=4 --tile-size=5 --execution-mode=0xa061 --tile-size=4 $COMMON 29 | # vgg19_conv4_1, 5.5T 30 | # vgg19_conv4_1, 6.6T 31 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=25 --tile-size=5 --execution-mode=0xa000 --streaming-input=1 $COMMON 32 | sleep 1 33 | # vgg19_conv4_2, 6.4T 34 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=26 --tile-size=5 --execution-mode=0xa000 --streaming-input=1 $COMMON 35 | sleep 1 36 | # vgg19_conv5_1, 5.2T 37 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=4 --flt-t=25 --tile-size=5 --execution-mode=0xa000 $COMMON 38 | 39 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-1s.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 64 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | COMMON="$COMMON --f16c-opt=1" 9 | 10 | # vgg19_conv1_1 11 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv1_1 -i3 -h224 -o64 -H224 -n64 --blk-i=1 --blk-o=2 --flt-o=2 --flt-t=14 --tile-size=6 --execution-mode=0xa061 --pat-o=1 --output-as-blocked=true $COMMON --streaming-output=2 12 | sleep 1 13 | # vgg19_conv1_2 14 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv1_2 -i64 -h224 -o64 -H224 -n64 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --tile-size=6 --execution-mode=0xa061 --pat-o=1 --output-as-blocked=true $COMMON 15 | sleep 1 16 | # vgg19_conv2_1 17 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv2_1 -i64 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=4 --blk-o=2 --flt-o=1 --flt-t=28 --execution-mode=0xa061 --pat-o=4 $COMMON 18 | sleep 1 19 | # vgg19_conv2_2 20 | NSOCKETS=1 ./scripts/run.sh -c 
--name=vgg19_conv2_2 -i128 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=8 --blk-o=1 --flt-t=28 --execution-mode=0xa061 --pat-o=8 $COMMON 21 | sleep 1 22 | # vgg19_conv3_1 23 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv3_1 -i128 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=8 --execution-mode=0xa061 $COMMON 24 | sleep 1 25 | # vgg19_conv3_2 26 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv3_2 -i256 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=2 --flt-o=1 --flt-t=28 --pat-i=1 --pat-o=4 --execution-mode=0xa061 $COMMON 27 | sleep 1 28 | # vgg19_conv4_1 29 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv4_1 -i256 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=2 --flt-t=28 --execution-mode=0xa061 --pat-i=1 --pat-o=8 $COMMON 30 | sleep 1 31 | # vgg19_conv4_2 32 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv4_2 -i512 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=2 --flt-t=28 --execution-mode=0xa061 --pat-i=1 --pat-o=8 $COMMON 33 | sleep 1 34 | # vgg19_conv5_1 35 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv5_1 -i512 -h14 -o512 -H14 -n64 --tile-size=7 --blk-i=16 --blk-o=4 --flt-o=1 --flt-t=21 --execution-mode=0xa061 --pat-i=1 --pat-o=8 $COMMON 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | DISCONTINUATION OF PROJECT 2 | 3 | This project will no longer be maintained by Intel. 4 | 5 | Intel has ceased development and contributions including, but not limited to, maintenance, bug fixes, new releases, or updates, to this project. 6 | 7 | Intel no longer accepts patches to this project. 8 | 9 | If you have an ongoing need to use this project, are interested in independently developing it, or would like to maintain patches for the open source software community, please create your own fork of this project. 
10 | 11 | Contact: webadmin@linux.intel.com 12 | Deep Learning Math Kernel Research (Euler) 13 | ============================================================ 14 | 15 | Experimental DNN math kernel based on C++11 and Intel intrinsic instructions. 16 | 17 | ## System Requirements 18 | Intel Core-X series processor with Intel(R) AVX-512 instruction set extensions 19 | Intel Xeon Scalable processor (Skylake, Cascade Lake, ...) 20 | 21 | ## Prerequisites 22 | Linux x86_64 OS 23 | CMake >= 3.0 24 | Gflags >= 2.0 25 | [ICC >= 18.0](https://software.intel.com/en-us/c-compilers) 26 | (ICC >= 19.0 for Intel DL Boost (VNNI) support on Intel Cascade Lake) 27 | 28 | ## Build 29 | ; ICC/ICX 30 | source /opt/intel/compilers_and_libraries/linux/bin/compilervars.sh -arch intel64 -platform linux 31 | mkdir -p build && cd build 32 | ; ICC 33 | cmake .. -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_CXX_FLAGS=-xCore-AVX512 -DWITH_TEST=ON 34 | ; ICX 35 | cmake .. -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DCMAKE_CXX_FLAGS=-xCore-AVX512 -DWITH_TEST=ON 36 | make -j 37 | cd - 38 | 39 | ; CMake build option to enable VNNI support (default: OFF) 40 | -DWITH_VNNI=ON 41 | ; CMake build option to enable Intel TBB threading runtime (default: OMP) 42 | -DMT_RUNTIME=TBB 43 | ; CMake build option to enable FP16 user inputs (default: OFF) 44 | -DENABLE_USER_FP16=ON 45 | 46 | ## Run Tests 47 | cd /path/to/euler/root 48 | ./scripts/best_configs/vgg-n1-8180-1s.sh 49 | 50 | ## Link to Euler 51 | CFLAGS += -I/path/to/euler/include 52 | #include "euler.hpp" 53 | LDFLAGS += -lel 54 | 55 | ## License 56 | Apache License Version 2.0. 
57 | -------------------------------------------------------------------------------- /src/kernel/elk_conv_wino_3x3_3x3_weights.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include "el_intrin.hpp" 4 | #include "elk_def.hpp" 5 | #include "el_utils.hpp" 6 | #include "elk_conv_wino.hpp" 7 | 8 | namespace euler { 9 | 10 | template 11 | struct elk_conv_wino_trans_weights { 13 | constexpr static int A = 5; 14 | constexpr static int K = 3; 15 | constexpr static int I = ISA_AVX512; 16 | 17 | static void execute( 18 | float atweights[A][A][V][V], WeightsType aweights[K][K][V][V]) 19 | { 20 | ENABLE_AVX512F(); 21 | 22 | __m M[5][3]; 23 | 24 | auto z0 = _mm::set1_ps(2.0f); 25 | auto z1 = _mm::set1_ps(0.5f); 26 | 27 | for (int _V = 0; _V < 16; ++_V) { 28 | #pragma unroll 29 | for (int i = 0; i < 3; i++) { 30 | __m f0, f1, f2; 31 | 32 | if (std::is_same::value) { 33 | f0 = _mm::load_ps(aweights[0][i][_V]); 34 | f1 = _mm::load_ps(aweights[1][i][_V]); 35 | f2 = _mm::load_ps(aweights[2][i][_V]); 36 | } else { 37 | f0 = _mm::cvtph_ps(_mm::load_si256((__m256i *)aweights[0][i][_V])); 38 | f1 = _mm::cvtph_ps(_mm::load_si256((__m256i *)aweights[1][i][_V])); 39 | f2 = _mm::cvtph_ps(_mm::load_si256((__m256i *)aweights[2][i][_V])); 40 | } 41 | 42 | auto t0 = f0 * z0; 43 | auto t1 = f0 + f2; 44 | 45 | M[0][i] = t0; 46 | M[1][i] = f1 - t1; 47 | M[2][i] = f1 + t1; 48 | M[3][i] = f2 * z1 + t0 - f1; 49 | M[4][i] = f2; 50 | } 51 | 52 | #pragma unroll 53 | for (int i = 0; i < 5; i++) { 54 | auto f0 = M[i][0]; 55 | auto f1 = M[i][1]; 56 | auto f2 = M[i][2]; 57 | 58 | auto t0 = f0 * z0; 59 | auto t1 = f0 + f2; 60 | 61 | *(__m *)atweights[i][0][_V] = t0; 62 | *(__m *)atweights[i][1][_V] = f1 - t1; 63 | *(__m *)atweights[i][2][_V] = f1 + t1; 64 | *(__m *)atweights[i][3][_V] = f2 * z1 + t0 - f1; 65 | *(__m *)atweights[i][4][_V] = f2; 66 | } 67 | } 68 | } 69 | }; // elk_conv_wino_trans_weights 70 | } // namespace euler 71 | 
-------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | COMMON="$COMMON --f16c-opt=1" 10 | 11 | # vgg19_conv1_1, 12 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv1_1 -i3 -h224 -o64 -H224 -n1 --blk-i=1 --blk-o=2 --flt-o=2 --flt-t=14 --tile-size=6 --execution-mode=0xa061 --pat-o=1 --output-as-blocked=true -v0 $COMMON --streaming-output=2 13 | 14 | sleep 1 15 | # vgg19_conv1_2, 16 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv1_2 -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --tile-size=6 --execution-mode=0xa061 --pat-o=1 --output-as-blocked=true -v0 $COMMON --streaming-output=2 17 | 18 | sleep 1 19 | # vgg19_conv2_1, 20 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv2_1 -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=1 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa061 --pat-o=8 $COMMON 21 | 22 | sleep 1 23 | # vgg19_conv2_2, 24 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv2_2 -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=1 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa061 --pat-o=8 $COMMON 25 | 26 | sleep 1 27 | # vgg19_conv3_1, 28 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv3_1 -i128 -h56 -o256 -H56 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa033 $COMMON 29 | 30 | sleep 1 31 | # vgg19_conv3_2, 32 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv3_2 -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=8 --flt-o=1 --flt-t=28 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa033 $COMMON 33 | 34 | sleep 1 35 | # vgg19_conv4_1, 6.6T 36 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv4_1 -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=13 --tile-size=6 --execution-mode=0xa033 
--streaming-input=1 $COMMON 37 | 38 | sleep 1 39 | # vgg19_conv4_2, 6.4T 40 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv4_2 -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=13 --tile-size=6 --execution-mode=0xa033 $COMMON 41 | 42 | sleep 1 43 | # vgg19_conv5_1, 5.2T 44 | NSOCKETS=1 ./scripts/run.sh -c --name=vgg19_conv5_1 -i512 -h14 -o512 -H14 -n1 --blk-i=32 --blk-o=1 --flt-o=2 --flt-t=14 --tile-size=4 --execution-mode=0xa033 --pat-o=1 $COMMON 45 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s-wino-int8.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # vgg19_conv1_2, 6.73T 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=4 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa161 --pat-o=1 --output-as-blocked=true $COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 12 | 13 | # vgg19_conv2_1, 8.32T 14 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=4 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa161 --pat-o=2 $COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 15 | 16 | # vgg19_conv2_2, 9.53T 17 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=4 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa161 --pat-o=2 $COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 18 | 19 | # vgg19_conv3_1, 9.95T 20 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=2 --flt-o=1 --flt-t=28 --pat-i=1 --pat-o=4 --tile-size=6 --execution-mode=0xa161 $COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 21 | 22 | # vgg19_conv3_2, 9.48T 23 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=16 --blk-o=4 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa133 $COMMON 
--sampling-kind=2 --data-type-cfg=U8F32S8F32 24 | 25 | # vgg19_conv4_1, 9.80T 26 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=7 --tile-size=6 --execution-mode=0xa133 --streaming-input=1 $COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 27 | 28 | # vgg19_conv4_2, 11.32T 29 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=7 --tile-size=6 --execution-mode=0xa133 $COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 30 | 31 | # vgg19_conv5_1, 7.01T 32 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=32 --blk-o=1 --flt-o=2 --flt-t=5 --tile-size=5 --execution-mode=0xa133 $COMMON --sampling-kind=2 --data-type-cfg=U8F32S8F32 -v0 33 | -------------------------------------------------------------------------------- /include/euler_reorder.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include "euler.hpp" 6 | 7 | namespace euler { 8 | 9 | // Reorder 10 | template 11 | struct EULER_API reorder { 12 | reorder(Type *dst, Type *src, Args...) 
{ 13 | assert(!!"reorder not implemented\n"); 14 | abort(); 15 | } 16 | }; 17 | 18 | template struct EULER_API reorder { 19 | reorder(Type *dst, Type *src, int n, int c, int h, int w); 20 | }; 21 | 22 | template struct EULER_API reorder { 23 | reorder(Type *dst, Type *src, int n, int c, int h, int w); 24 | }; 25 | 26 | template struct EULER_API reorder { 27 | reorder(Type *dst, Type *src, int n, int c, int h, int w); 28 | }; 29 | 30 | template struct EULER_API reorder { 31 | reorder(Type *dst, Type *src, int n, int c, int h, int w); 32 | }; 33 | 34 | template struct EULER_API reorder { 35 | reorder(Type *dst, Type *src, int o, int i, int h, int w); 36 | }; 37 | 38 | template struct EULER_API reorder { 39 | reorder(Type *dst, Type *src, int g, int o, int i, int h, int w); 40 | }; 41 | 42 | template struct EULER_API reorder { 43 | reorder(Type *dst, Type *src, int o, int i, int h, int w); 44 | }; 45 | 46 | template struct EULER_API reorder { 47 | reorder(Type *dst, Type *src, int g, int o, int i, int h, int w); 48 | }; 49 | 50 | template struct EULER_API reorder { 51 | reorder(Type *dst, Type *src, int o, int i, int h, int w); 52 | }; 53 | 54 | template struct EULER_API reorder { 55 | reorder(Type *dst, Type *src, int g, int o, int i, int h, int w); 56 | }; 57 | 58 | template struct EULER_API reorder { 59 | reorder(Type *dst, Type *src, int o, int i, int h, int w); 60 | }; 61 | 62 | template struct EULER_API reorder { 63 | reorder(Type *dst, Type *src, int g, int o, int i, int h, int w); 64 | }; 65 | 66 | } // namespace euler 67 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s-int8-calibration.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # vgg19_conv1_2, 6.73T 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 
--blk-i=1 --blk-o=1 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa161 --pat-o=4 --output-as-blocked=true -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 12 | 13 | sleep 1 14 | # vgg19_conv2_1, 8.32T 15 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa161 --pat-o=8 -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 16 | 17 | sleep 1 18 | # vgg19_conv2_2, 9.53T 19 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=2 --blk-o=1 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa161 --pat-o=8 -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 20 | 21 | sleep 1 22 | # vgg19_conv3_1, 9.95T 23 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=2 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=8 --tile-size=6 --execution-mode=0xa161 -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 24 | 25 | sleep 1 26 | # vgg19_conv3_2, 9.48T 27 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=4 --blk-o=8 --flt-o=1 --flt-t=28 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa133 -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 28 | 29 | sleep 1 30 | # vgg19_conv4_1, 9.80T 31 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=4 --blk-o=1 --flt-o=4 --flt-t=6 --tile-size=6 --execution-mode=0xa133 --streaming-input=1 -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 32 | 33 | sleep 1 34 | # vgg19_conv4_2, 11.32T 35 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=13 --tile-size=6 --execution-mode=0xa133 -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 36 | 37 | sleep 1 38 | # vgg19_conv5_1, 7.01T 39 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=13 --tile-size=5 --execution-mode=0xa133 -v0 --tinput-cali-s=1.0 --tinput-cali-z=1.0 $COMMON 40 | 
-------------------------------------------------------------------------------- /scripts/best_configs/resnet-n1-8180-1s-wino-int8.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # Resnet50 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | # resnet50_res2a_branch2b 6.75T 10 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o64 -H56 -n1 --tile-size=6 --execution-mode=0xa161 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=7 --pat-o=1 --output-as-blocked=true $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 11 | # resnet50_res3a_branch2b 5.62T 12 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h28 -o128 -H28 -n1 --tile-size=6 --execution-mode=0xa133 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=13 --pat-o=1 $COMMON --streaming-output=2 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 13 | # resnet50_res4a_branch2b 4.95T 14 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h14 -o256 -H14 -n1 --tile-size=4 --execution-mode=0xa133 --blk-i=16 --blk-o=2 --flt-o=2 --flt-t=10 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 15 | # resnet50_res5a_branch2b 3.19T 16 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o512 -H7 -n1 --tile-size=4 --execution-mode=0xa133 --blk-i=32 --blk-o=1 --flt-o=2 --flt-t=8 $COMMON --data-type-cfg=U8F32S8F32 --sampling-kind=2 17 | 18 | 19 | # a161 Ir case modified from resnet50_res2a_branch2b 20 | NSOCKETS=1 ./scripts/run.sh -c -i66 -h56 -o64 -H56 -n1 --tile-size=6 --execution-mode=0xa161 --blk-i=5 --blk-o=1 --flt-o=2 --flt-t=7 --pat-o=1 --output-as-blocked=true $COMMON --input-format=nchw --weights-format=oihw --data-type-cfg=U8F32S8F32 --sampling-kind=2 21 | 22 | # a161 Or case modified from resnet50_res2a_branch2b 23 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h56 -o66 -H56 -n1 --tile-size=6 --execution-mode=0xa161 --blk-i=4 --blk-o=1 --flt-o=1 --flt-t=7 --pat-o=1 --output-as-blocked=true $COMMON --input-format=nchw --weights-format=oihw --output-format=nchw 
--data-type-cfg=U8F32S8F32 --sampling-kind=2 24 | 25 | # a133 Ir case modified from resnet50_res5a_branch2b 26 | NSOCKETS=1 ./scripts/run.sh -c -i516 -h7 -o512 -H7 -n1 --tile-size=4 --execution-mode=0xa133 --blk-i=33 --blk-o=1 --flt-o=2 --flt-t=8 $COMMON --input-format=nchw --weights-format=oihw --data-type-cfg=U8F32S8F32 --sampling-kind=2 27 | 28 | # a133 Or case modified from resnet50_res5a_branch2b 29 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h7 -o516 -H7 -n1 --tile-size=4 --execution-mode=0xa133 --blk-i=32 --blk-o=1 --flt-o=1 --flt-t=8 $COMMON --input-format=nchw --weights-format=oihw --output-format=nchw --data-type-cfg=U8F32S8F32 --sampling-kind=2 30 | -------------------------------------------------------------------------------- /src/elx_conv_direct_vmg.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __ELX_CONV_DIRECT_VMG_HPP__ 2 | #define __ELX_CONV_DIRECT_VMG_HPP__ 3 | 4 | #include "euler.hpp" 5 | #include "el_def.hpp" 6 | #include "el_utils.hpp" 7 | #include "el_allocator.hpp" 8 | #include "elx_conv.hpp" 9 | #include "kernel/elk_vmg_conv_binder.hxx" 10 | 11 | namespace euler { 12 | 13 | #define Template_elx_conv_direct_vmg_t \ 14 | template 15 | 16 | #define Instance_elx_conv_direct_vmg_t \ 17 | elx_conv_direct_vmg_t 18 | 19 | Template_elx_conv_direct_vmg_t class elx_conv_direct_vmg_t : public elx_conv_t { 20 | using InputType = typename UserTypes::InputType; 21 | using WeightsType = typename UserTypes::WeightsType; 22 | using OutputType = typename UserTypes::OutputType; 23 | using BiasType = typename UserTypes::BiasType; 24 | 25 | // t-buffer type 26 | using TinputType = typename TarrayTypes::InputType; 27 | using TweightsType = typename TarrayTypes::WeightsType; 28 | using ToutputType = typename TarrayTypes::OutputType; 29 | 30 | public: 31 | elx_conv_direct_vmg_t(eld_conv_t &dc); 32 | virtual ~elx_conv_direct_vmg_t(); 33 | 34 | virtual void execute(void *output, void *input, void *weights, void *bias); 
35 | 36 | private: 37 | void __execute_c060(OutputType *output, InputType *input, 38 | WeightsType *weights, BiasType *bias); 39 | 40 | void trans_weights_to_compact(TweightsType *tweights, WeightsType *weights); 41 | 42 | void conv_c060(OutputType *output, InputType *input, TweightsType *weights, 43 | BiasType *bias, int _I4, int _O4, int _ht, int _wt); 44 | 45 | void set_scratch_buffers(void *base); 46 | void set_workspace_buffers(void *base); 47 | int prepare_execute_opt(); 48 | void bind_execute_functions(); 49 | 50 | // TODO: optimize it 51 | vmg_conv_kernel_binder::kconv *ker_conv_; 52 | vmg_conv_kernel_binder::kconv *ker_conv_Tr_; 53 | 54 | void (elx_conv_direct_vmg_t::*execute_opt_)( 55 | OutputType *, InputType *, WeightsType *, BiasType *); 56 | 57 | int C; 58 | bool is_first_run_; 59 | bool inference_acc_; 60 | 61 | size_t tweights_size_; 62 | TweightsType *tweights_; 63 | size_t toutput_size_; 64 | ToutputType *toutput_; 65 | unsigned int xopt_; 66 | int attr_; 67 | int mthr_; 68 | }; 69 | 70 | } // namespace euler 71 | #endif // __ELX_CONV_DIRECT_VMG_HPP__ 72 | -------------------------------------------------------------------------------- /src/elx_deconv_direct_bind.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_deconv_direct.hpp" 4 | 5 | namespace euler { 6 | 7 | Template_elx_deconv_direct_t void 8 | Instance_elx_deconv_direct_t::bind_execute_functions() 9 | { 10 | #define BIND_CONV_KERNEL(S, F, K) \ 11 | if (K == 3) { \ 12 | conv_kernel_binder::bind(O, T, func); \ 13 | } else if (K == 5) { \ 14 | conv_kernel_binder::bind(O, T, func); \ 15 | } else if (K == 7) { \ 16 | conv_kernel_binder::bind(O, T, func); \ 17 | } 18 | 19 | auto bind_conv_kernel = [&](int O, int T, 20 | conv_kernel_binder::kconv **func, int K) { 21 | switch (xopt_) { 22 | case (0xa060): 23 | if (ep.input_fmt == nchw) { 24 | if (ep.ws == 1) { 25 | BIND_CONV_KERNEL(1, GKF_EBD, K); 26 | } else if (ep.ws == 
2) { 27 | BIND_CONV_KERNEL(2, GKF_EBD, K); 28 | } else { 29 | el_error("Stride > 2 not yet bounded"); 30 | } 31 | } else if (ep.input_fmt == nhwc) { 32 | if (ep.ws == 1) { 33 | BIND_CONV_KERNEL(1, GKF_FCF, K); 34 | } else if (ep.ws == 2) { 35 | BIND_CONV_KERNEL(2, GKF_FCF, K); 36 | } else { 37 | el_error("Stride > 2 not yet bounded"); 38 | } 39 | } else { 40 | if (ep.ws == 1) { 41 | BIND_CONV_KERNEL(1, GKF_DCD, K); 42 | } else if (ep.ws == 2) { 43 | BIND_CONV_KERNEL(2, GKF_DCD, K); 44 | } else { 45 | el_error("Stride > 2 not yet bounded"); 46 | } 47 | } 48 | break; 49 | default: 50 | el_error("Unknown xopt"); 51 | break; 52 | } 53 | }; 54 | 55 | bind_conv_kernel(ep.O, ep.T, &ker_conv_, ep.kw); 56 | bind_conv_kernel(ep.O, ep.Tr, &ker_conv_Tr_, ep.kw); 57 | 58 | #define EXECUTE_CASE(n) \ 59 | case 0x##n: \ 60 | execute_opt_ = &Instance_elx_deconv_direct_t::__execute_##n; \ 61 | break 62 | 63 | switch (xopt_) { 64 | EXECUTE_CASE(a060); 65 | default: 66 | el_error("Unimplemented xopt"); 67 | break; 68 | } 69 | } 70 | 71 | } // namespace euler 72 | -------------------------------------------------------------------------------- /src/elx_conv_direct_vmg_bind.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_conv_direct_vmg.hpp" 4 | 5 | namespace euler { 6 | 7 | Template_elx_conv_direct_vmg_t void 8 | Instance_elx_conv_direct_vmg_t::bind_execute_functions() { 9 | #define BIND_CONV_KERNEL(S, F, K, G) \ 10 | if (K == 3) { \ 11 | if (G == 1) { \ 12 | vmg_conv_kernel_binder::bind(O, T, func); \ 13 | } else if (G == 2) { \ 14 | vmg_conv_kernel_binder::bind(O, T, func); \ 15 | } else if (G == 4) { \ 16 | vmg_conv_kernel_binder::bind(O, T, func); \ 17 | } else if (G == 8) { \ 18 | vmg_conv_kernel_binder::bind(O, T, func); \ 19 | } else if (G == 16) { \ 20 | vmg_conv_kernel_binder::bind(O, T, func); \ 21 | } \ 22 | } else { \ 23 | el_error("Unimplemented: VMG conv for K != 3"); \ 24 | } 25 | 26 | auto 
bind_conv_kernel = [&](int O, int T, 27 | vmg_conv_kernel_binder::kconv **func, 28 | int K, int G) { 29 | switch (xopt_) { 30 | case (0xc060): 31 | if (ep.input_fmt == nhwc) { 32 | if (ep.ws == 1) { 33 | BIND_CONV_KERNEL(1, GKF_FCF, K, G); 34 | } else { 35 | el_error("Stride > 1 not yet bounded"); 36 | } 37 | } else { 38 | if (ep.ws == 1) { 39 | BIND_CONV_KERNEL(1, GKF_DCD, K, G); 40 | } else { 41 | el_error("Stride > 1 not yet bounded"); 42 | } 43 | } 44 | break; 45 | default: 46 | el_error("Unknown xopt"); 47 | break; 48 | } 49 | }; 50 | 51 | bind_conv_kernel(ep.O, ep.T, &ker_conv_, ep.kw, ep.G); 52 | bind_conv_kernel(ep.O, ep.Tr, &ker_conv_Tr_, ep.kw, ep.G); 53 | #define EXECUTE_CASE(n) \ 54 | case 0x##n: \ 55 | execute_opt_ = &Instance_elx_conv_direct_vmg_t::__execute_##n; \ 56 | break 57 | 58 | switch (xopt_) { 59 | EXECUTE_CASE(c060); 60 | default: 61 | el_error("Unimplemented xopt"); 62 | break; 63 | } 64 | } 65 | 66 | } // namespace euler 67 | -------------------------------------------------------------------------------- /src/elx_stream.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include "elx_stream.hpp" 5 | #include "elx_conv.hpp" 6 | #include "el_init.hpp" 7 | 8 | #define gettid() syscall(SYS_gettid) 9 | 10 | namespace euler { 11 | 12 | elx_stream global_stream; 13 | 14 | struct elx_eol_t : public elx_conv_t { 15 | public: 16 | elx_eol_t(eld_conv_t &dc) : elx_conv_t(dc) { 17 | ep.eager_mode = false; 18 | ep.stream_sync = true; 19 | ep.shared_workspace_enabled = false; 20 | 21 | on_destroy_ = ELX_EVENT_EXIT; // signal exit 22 | } 23 | 24 | virtual void execute(void *output, void *input, void *weights, void *bias) {} 25 | virtual ~elx_eol_t() {} 26 | 27 | private: 28 | virtual void set_workspace_buffers(void *base) {} 29 | virtual void set_scratch_buffers(void *base) {} 30 | }; 31 | 32 | int set_cpu_affinity() { 33 | // TODO 34 | return 0; 35 | } 36 | 37 | 
elx_stream::elx_stream() { 38 | _threadx = new std::thread([&]{ 39 | set_cpu_affinity(); 40 | // executor thread 41 | while (run()); 42 | }); 43 | _threadx->detach(); 44 | } 45 | 46 | elx_stream::~elx_stream() { 47 | 48 | eld_conv_t dummy; 49 | elx_eol_t eol(dummy); 50 | 51 | delete _threadx; 52 | } 53 | 54 | void elx_stream::submit(elx_conv_t *ex) { 55 | // user thread 56 | if (ex->ep.stream_sync) 57 | ex->mu_.lock(); 58 | std::unique_lock mlock(_mutex); 59 | _stream.push(ex); 60 | mlock.unlock(); 61 | _cond.notify_one(); 62 | } 63 | 64 | int elx_stream::run() { 65 | std::unique_lock mlock(_mutex); 66 | while(_stream.empty()) { 67 | _cond.wait(mlock); 68 | } 69 | euler::elx_conv_t *ex = _stream.front(); 70 | _stream.pop(); 71 | mlock.unlock(); 72 | 73 | int ret = 1; 74 | if (ex != nullptr) { 75 | int event = ex->on_destroy(); 76 | if (event != ELX_EVENT_NORMAL) { 77 | if (event == ELX_EVENT_TEARDOWN) 78 | ex->teardown(); 79 | else if (event == ELX_EVENT_EXIT) 80 | ret = 0; 81 | } else { 82 | if (ego.verbose) { 83 | ex->execute_verbose(ex->output_, ex->input_, ex->weights_, ex->bias_); 84 | } else { 85 | ex->execute(ex->output_, ex->input_, ex->weights_, ex->bias_); 86 | } 87 | } 88 | if (ex->ep.stream_sync) { 89 | ex->mu_.unlock(); 90 | } 91 | } 92 | return ret; 93 | } 94 | 95 | void elx_stream::wait(elx_conv_t *ex) { 96 | // user thread 97 | std::lock_guard mlock(ex->mu_); 98 | } 99 | 100 | } 101 | -------------------------------------------------------------------------------- /scripts/best_configs/group-conv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ./scripts/best_configs/common.sh $@ 4 | COMMON="$COMMON --f16c-opt=1" 5 | 6 | # direct: a060 7 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i128 -h56 -o128 -H56 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=28 -p1 -P1 -adirect --execution-mode=0xa060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nhwc --output-format=nhwc -b1 8 | 9 | NSOCKETS=1 
./scripts/run.sh -c -g32 -i256 -h28 -o256 -H28 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=28 -p1 -P1 -adirect --execution-mode=0xa060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nhwc --output-format=nhwc -b1 10 | 11 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i512 -h14 -o512 -H14 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=14 -p1 -P1 -adirect --execution-mode=0xa060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nChw16c --output-format=nChw16c -b1 12 | 13 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i1024 -h7 -o1024 -H7 -n1 --blk-i=2 --blk-o=1 --flt-o=2 --flt-t=7 -p1 -P1 -adirect --execution-mode=0xa060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nChw16c --output-format=nChw16c -b1 14 | 15 | 16 | # direct: c060 17 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i128 -h56 -o128 -H56 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=14 -p1 -P1 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nhwc --output-format=nhwc -b1 18 | 19 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i256 -h28 -o256 -H28 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=14 -p1 -P1 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nhwc --output-format=nhwc -b1 20 | 21 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i512 -h14 -o512 -H14 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=14 -p1 -P1 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nChw16c --output-format=nChw16c -b1 22 | 23 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i1024 -h7 -o1024 -H7 -n1 --blk-i=2 --blk-o=1 --flt-o=2 --flt-t=7 -p1 -P1 -adirect --execution-mode=0xc060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nChw16c --output-format=nChw16c -b1 24 | 25 | # direct_vmg 26 | NSOCKETS=1 ./scripts/run.sh -c -g32 -i128 -h56 -o128 -H56 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=14 -p1 -P1 -adirect_vmg --execution-mode=0xc060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nChw16c --output-format=nChw16c -b1 27 | 28 | NSOCKETS=1 
./scripts/run.sh -c -g32 -i256 -h28 -o256 -H28 -n1 --blk-i=1 --blk-o=1 --flt-o=1 --flt-t=14 -p1 -P1 -adirect_vmg --execution-mode=0xc060 --pat-o=1 $COMMON --weights-format=ghwio --input-format=nChw16c --output-format=nChw16c -b1 29 | 30 | 31 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n1-8180-1s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG19 4 | # batch-size: 1 5 | # SKX 8180 1S 6 | 7 | 8 | source ./scripts/best_configs/common.sh $@ 9 | 10 | # vgg19_conv1_2, 11 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --tile-size=6 --execution-mode=0xa061 --pat-o=1 --output-as-blocked=true $COMMON 12 | sleep 1 13 | # vgg19_conv2_1, 14 | #NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=1 --flt-o=2 --flt-t=14 --tile-size=6 --execution-mode=0xa061 --pat-o=4 $COMMON 15 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n1 --blk-i=4 --blk-o=1 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa061 --pat-o=8 $COMMON 16 | sleep 1 17 | # vgg19_conv2_2, 18 | #NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --tile-size=6 --execution-mode=0xa061 --pat-o=4 $COMMON 19 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n1 --blk-i=8 --blk-o=1 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa061 --pat-o=8 $COMMON 20 | sleep 1 21 | # vgg19_conv3_1, 22 | #NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=8 --tile-size=6 --execution-mode=0xa061 $COMMON 23 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n1 --blk-i=4 --blk-o=2 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa033 $COMMON 24 | sleep 1 25 | # vgg19_conv3_2, 26 | #NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 
--flt-t=14 --pat-i=2 --pat-o=8 --tile-size=6 --execution-mode=0xa073 $COMMON 27 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n1 --blk-i=8 --blk-o=8 --flt-o=1 --flt-t=28 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa033 $COMMON 28 | sleep 1 29 | # vgg19_conv4_1, 6.6T 30 | #NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=25 --tile-size=6 --execution-mode=0xa000 --streaming-input=1 $COMMON 31 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=13 --tile-size=6 --execution-mode=0xa033 --streaming-input=1 $COMMON 32 | sleep 1 33 | # vgg19_conv4_2, 6.4T 34 | #NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=4 --flt-t=26 --tile-size=6 --execution-mode=0xa000 --streaming-input=1 $COMMON 35 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n1 --blk-i=8 --blk-o=2 --flt-o=2 --flt-t=13 --tile-size=6 --execution-mode=0xa033 $COMMON 36 | sleep 1 37 | # vgg19_conv5_1, 5.2T 38 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n1 --blk-i=8 --blk-o=4 --flt-t=16 --tile-size=6 --execution-mode=0xa000 $COMMON 39 | -------------------------------------------------------------------------------- /scripts/conv-non-16x.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # conv: wino: 4 | # a061 5 | # with_relu=0 with_ip_sum=0 6 | NSOCKETS=1 ./scripts/run.sh -c -i127 -h56 -o255 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=1 --tile-size=6 --execution-mode=0xa061 --input-format=nchw --weights-format=oihw --output-format=nchw -v1 7 | # with_relu=1 with_ip_sum=0 8 | NSOCKETS=1 ./scripts/run.sh -c -i127 -h56 -o255 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-o=1 --tile-size=6 --execution-mode=0xa061 --input-format=nchw --weights-format=oihw --output-format=nchw -r1 -v1 9 | # with_relu=0 with_ip_sum=1 10 | NSOCKETS=1 ./scripts/run.sh -c -i127 -h56 -o255 -H56 -n1 --blk-i=8 --blk-o=1 
--flt-o=2 --flt-t=14 --pat-o=1 --tile-size=6 --execution-mode=0xa061 --input-format=nchw --weights-format=oihw --output-format=nchw --with-ip-sum=1 -v1 11 | 12 | # a073 13 | # with_relu=0 with_ip_sum=0 14 | NSOCKETS=1 ./scripts/run.sh -c -i255 -h56 -o255 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa073 --input-format=nchw --weights-format=oihw --output-format=nchw -v1 15 | # with_relu=0 with_ip_sum=1 16 | NSOCKETS=1 ./scripts/run.sh -c -i255 -h56 -o255 -H56 -n1 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=1 --tile-size=6 --execution-mode=0xa073 --input-format=nchw --weights-format=oihw --output-format=nchw --with-ip-sum=1 -v1 17 | 18 | # conv: direct 1x1: 19 | # a063 20 | # with_relu=0 with_ip_sum=0 21 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i255 -h56 -o511 -H28 -k1 -K1 -s2 -S2 -p0 -P0 -b1 -adirect_1x1 -v0 --blk-i=16 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=16 --execution-mode=0xa063 -v1 22 | # with_relu=1 with_ip_sum=0 23 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i255 -h56 -o511 -H28 -k1 -K1 -s2 -S2 -p0 -P0 -b1 -adirect_1x1 -v0 --blk-i=16 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=16 --execution-mode=0xa063 -r1 -v1 24 | # with_relu=0 with_ip_sum=1 25 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i255 -h56 -o511 -H28 -k1 -K1 -s2 -S2 -p0 -P0 -b1 -adirect_1x1 -v0 --blk-i=16 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=16 --execution-mode=0xa063 --with-ip-sum=1 -v1 26 | 27 | # a062 28 | # with_relu=0 with_ip_sum=0 29 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1023 -o1023 -h19 -k1 -K1 -H19 -p0 -P0 -s1 -b1 -r0 -v0 -adirect_1x1 --blk-i=32 --flt-o=2 --flt-t=13 --execution-mode=0xa062 --pat-o=32 -v1 30 | # with_relu=1 with_ip_sum=0 31 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1023 -o1023 -h19 -k1 -K1 -H19 -p0 -P0 -s1 -b1 -r0 -v0 -adirect_1x1 --blk-i=32 --flt-o=2 --flt-t=13 --execution-mode=0xa062 --pat-o=32 -r1 -v1 32 | # with_relu=0 with_ip_sum=1 33 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1023 -o1023 -h19 -k1 -K1 -H19 -p0 -P0 -s1 -b1 
-r0 -v0 -adirect_1x1 --blk-i=32 --flt-o=2 --flt-t=13 --execution-mode=0xa062 --pat-o=32 --with-ip-sum=1 -v1 34 | -------------------------------------------------------------------------------- /scripts/best_configs/vgg-n64-8180-1s-wino-f4_3.sh: -------------------------------------------------------------------------------- 1 | #/bin/bash 2 | 3 | # VGG 4 | # batch-size: 64 5 | # SKX 8180 1S 6 | 7 | source ./scripts/best_configs/common.sh $@ 8 | 9 | 10 | # vgg19_conv1_2 11 | #NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --tile-size=6 --blk-i=4 --blk-o=4 --flt-t=28 --execution-mode=0xa061 --output-as-blocked=true $COMMON 12 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h224 -o64 -H224 -n64 --blk-i=4 --blk-o=2 --flt-o=1 --flt-t=28 --tile-size=6 --execution-mode=0xa061 --pat-o=2 --output-as-blocked=true $COMMON 13 | sleep 1 14 | # vgg19_conv2_1 15 | #NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=4 --blk-o=8 --flt-t=28 --execution-mode=0xa061 $COMMON 16 | NSOCKETS=1 ./scripts/run.sh -c -i64 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=4 --blk-o=2 --flt-o=1 --flt-t=28 --execution-mode=0xa061 --pat-o=4 $COMMON 17 | sleep 1 18 | # vgg19_conv2_2 19 | #NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=28 --execution-mode=0xa061 $COMMON 20 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h112 -o128 -H112 -n64 --tile-size=6 --blk-i=8 --blk-o=1 --flt-t=28 --execution-mode=0xa061 --pat-o=8 $COMMON 21 | sleep 1 22 | # vgg19_conv3_1 23 | #NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=28 --pat-o=2 --execution-mode=0xa061 $COMMON 24 | NSOCKETS=1 ./scripts/run.sh -c -i128 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=1 --flt-o=2 --flt-t=14 --pat-i=1 --pat-o=8 --execution-mode=0xa073 $COMMON 25 | sleep 1 26 | # vgg19_conv3_2 27 | #NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=28 --pat-o=2
--execution-mode=0xa061 $COMMON 28 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h56 -o256 -H56 -n64 --tile-size=6 --blk-i=8 --blk-o=4 --flt-o=1 --flt-t=28 --pat-i=2 --pat-o=4 --execution-mode=0xa073 $COMMON 29 | sleep 1 30 | # vgg19_conv4_1 31 | #NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=8 --flt-t=31 --execution-mode=0xa000 --streaming-input=2 $COMMON 32 | NSOCKETS=1 ./scripts/run.sh -c -i256 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=4 --flt-t=28 --execution-mode=0xa073 --pat-i=2 --pat-o=8 $COMMON 33 | sleep 1 34 | # vgg19_conv4_2 35 | #NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=4 --flt-t=31 --execution-mode=0xa000 --streaming-input=2 $COMMON 36 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h28 -o512 -H28 -n64 --tile-size=6 --blk-i=8 --blk-o=4 --flt-t=28 --execution-mode=0xa073 --pat-i=4 --pat-o=8 $COMMON 37 | sleep 1 38 | # vgg19_conv5_1 39 | NSOCKETS=1 ./scripts/run.sh -c -i512 -h14 -o512 -H14 -n64 --tile-size=6 --blk-i=8 --blk-o=4 --flt-t=28 --execution-mode=0xa000 --streaming-input=2 $COMMON 40 | -------------------------------------------------------------------------------- /src/common/el_def.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #define MT_RUNTIME_OMP (1) 7 | #define MT_RUNTIME_TBB (2) 8 | 9 | namespace euler { 10 | 11 | enum { 12 | ISA_GENERIC = 0, 13 | ISA_AVX2 = 256, 14 | ISA_AVX512 = 512 15 | }; 16 | 17 | enum { 18 | __ALL = 0, 19 | __TRACE = 1, 20 | __DEBUG = 2, 21 | __INFO = 3, 22 | __WARN = 4, 23 | __ERROR = 5, 24 | __FATAL = 6, 25 | __PERF_TRACE = 999, // ensure EULER_VERBOSE always work once set 26 | }; 27 | 28 | constexpr size_t PAGE_SIZE = 4096; 29 | 30 | // int8 quantization 31 | constexpr float EL_INT8_MAX = 127.0f; 32 | constexpr float EL_UINT8_MAX = 255.0f; 33 | 34 | // wino fusion 35 | constexpr uint32_t FUS_MASK = 0xF0; 36 | constexpr uint32_t 
FUS_I = 0x10; 37 | constexpr uint32_t FUS_O = 0x20; 38 | 39 | // kernel attr 40 | constexpr uint32_t AT_BIAS_MASK { 1 << 0 }; 41 | constexpr uint32_t AT_RELU_MASK { 1 << 1 }; 42 | constexpr uint32_t AT_INP_SUM_MASK { 1 << 2 }; // inplace sum 43 | constexpr uint32_t AT_CLEAR_OUTPUT_MASK { 1 << 3 }; 44 | constexpr uint32_t AT_STREAMING_OUTPUT_MASK { 1 << 4 }; 45 | constexpr uint32_t AT_RESTORE_OUTPUT_MASK { 1 << 5 }; 46 | constexpr uint32_t AT_Ir_MASK { 1 << 6 }; 47 | constexpr uint32_t AT_Or_MASK { 1 << 7 }; 48 | constexpr uint32_t AT_FMAOPT_MASK { 1 << 8 }; // FMA optimization 49 | 50 | template struct ConvImplTypes { 51 | static_assert(sizeof...(Types) == 4, 52 | "Inner types input/weights/output/bias scale data type"); 53 | using InputType = typename std::tuple_element<0, std::tuple>::type; 54 | using WeightsType = typename std::tuple_element<1, std::tuple>::type; 55 | using OutputType = typename std::tuple_element<2, std::tuple>::type; 56 | using BiasType = typename std::tuple_element<3, std::tuple>::type; 57 | }; 58 | 59 | namespace conv_impl { 60 | using FP32 = ConvImplTypes; 61 | using FP32_F16b = ConvImplTypes; 62 | using FP32_F16w = ConvImplTypes; 63 | using FP32_F16o = ConvImplTypes; 64 | using FP32_F16iwo = ConvImplTypes; 65 | using FP32_F16wob = ConvImplTypes; 66 | using INT8_F32 = ConvImplTypes; 67 | using INT8_F16b = ConvImplTypes; 68 | using INT8_F16o = ConvImplTypes; 69 | using INT8_F16ob = ConvImplTypes; 70 | using INT8_INT8o = ConvImplTypes; 71 | using INT8_UINT8o = ConvImplTypes; 72 | }; 73 | 74 | } // namespace euler 75 | -------------------------------------------------------------------------------- /src/kernel/elk_conv_wino_4x4_3x3_weights.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "el_intrin.hpp" 5 | #include "elk_def.hpp" 6 | #include "el_utils.hpp" 7 | #include "elk_conv_wino.hpp" 8 | 9 | namespace euler { 10 | 11 | template 12 | struct 
elk_conv_wino_trans_weights { 14 | constexpr static int A = 6; 15 | constexpr static int K = 3; 16 | constexpr static int I = ISA_AVX512; 17 | 18 | static void execute( 19 | float atweights[A][A][V][V], WeightsType aweights[K][K][V][V]) 20 | { 21 | ENABLE_AVX512F(); 22 | 23 | alignas(64) float M[6][3][16]; 24 | 25 | auto z0 = _mm::set1_ps(0.26890756302521f); 26 | auto z1 = _mm::set1_ps(-0.688403361344538f); 27 | auto z2 = _mm::set1_ps(0.119514472455649f); 28 | auto z3 = _mm::set1_ps(1.13777777777778f); 29 | auto z4 = _mm::set1_ps(0.430252100840336f); 30 | auto z5 = _mm::set1_ps(0.179271708683473f); 31 | 32 | for (int _V = 0; _V < 16; _V++) { 33 | #pragma unroll 34 | for (int i = 0; i < 3; i++) { 35 | __m f0, f1, f2; 36 | if (std::is_same::value) { 37 | f0 = _mm::load_ps(aweights[0][i][_V]); 38 | f1 = _mm::load_ps(aweights[1][i][_V]); 39 | f2 = _mm::load_ps(aweights[2][i][_V]); 40 | } else { 41 | f0 = _mm::cvtph_ps(_mm::load_si256((__m256i *)aweights[0][i][_V])); 42 | f1 = _mm::cvtph_ps(_mm::load_si256((__m256i *)aweights[1][i][_V])); 43 | f2 = _mm::cvtph_ps(_mm::load_si256((__m256i *)aweights[2][i][_V])); 44 | } 45 | auto t0 = z0 * f2; 46 | auto t1 = z1 * f0 - t0; 47 | auto t2 = t0 + z2 * f0; 48 | 49 | *(__m *)M[0][i] = z3 * f0; 50 | *(__m *)M[1][i] = t1 - z4 * f1; 51 | *(__m *)M[2][i] = t1 + z4 * f1; 52 | *(__m *)M[3][i] = t2 + z5 * f1; 53 | *(__m *)M[4][i] = t2 - z5 * f1; 54 | *(__m *)M[5][i] = f2; 55 | } 56 | #pragma unroll 57 | for (int i = 0; i < 6; i++) { 58 | auto f0 = _mm::load_ps(M[i][0]); 59 | auto f1 = _mm::load_ps(M[i][1]); 60 | auto f2 = _mm::load_ps(M[i][2]); 61 | auto t0 = z0 * f2; 62 | auto t1 = z1 * f0 - t0; 63 | auto t2 = t0 + z2 * f0; 64 | 65 | *(__m *)atweights[i][0][_V] = z3 * f0; 66 | *(__m *)atweights[i][1][_V] = t1 - z4 * f1; 67 | *(__m *)atweights[i][2][_V] = t1 + z4 * f1; 68 | *(__m *)atweights[i][3][_V] = t2 + z5 * f1; 69 | *(__m *)atweights[i][4][_V] = t2 - z5 * f1; 70 | *(__m *)atweights[i][5][_V] = f2; 71 | } 72 | } 73 | } 74 | 75 | }; 
// elk_conv_wino_trans_weights 76 | } // namespace euler 77 | -------------------------------------------------------------------------------- /scripts/best_configs/transformer.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | source ./scripts/best_configs/common.sh $@ 4 | COMMON="$COMMON --f16c-opt=1" 5 | 6 | echo "Plain: nhwc" 7 | # nhwc 8 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o1024 -h42 -w256 -H42 -W256 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=16 --pat-i=4 --blk-o=4 --flt-o=8 --flt-t=7 --execution-mode=0xa060 --pat-o=1 $COMMON --input-format=nhwc --output-format=nhwc 9 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o1024 -h32 -w32 -H32 -W32 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=16 --pat-i=4 --blk-o=1 --flt-o=2 --flt-t=7 --execution-mode=0xa060 --pat-o=1 $COMMON --input-format=nhwc --output-format=nhwc 10 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o3072 -h42 -w128 -H42 -W128 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=16 --pat-i=4 --blk-o=1 --flt-o=2 --flt-t=7 --execution-mode=0xa060 --pat-o=1 $COMMON --input-format=nhwc --output-format=nhwc 11 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o1024 -h1 -w42 -H1 -W42 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=32 --pat-i=2 --blk-o=1 --flt-o=2 --flt-t=14 --execution-mode=0xa060 --pat-o=1 $COMMON --input-format=nhwc --output-format=nhwc 12 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o4096 -h1 -w42 -H1 -W42 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=16 --pat-i=4 --blk-o=1 --flt-o=2 --flt-t=7 --execution-mode=0xa060 --pat-o=1 $COMMON --input-format=nhwc --output-format=nhwc 13 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i4096 -o1024 -h1 -w42 -H1 -W42 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=16 --pat-i=16 --blk-o=1 --flt-o=1 --flt-t=7 --execution-mode=0xa060 --pat-o=1 $COMMON --input-format=nhwc --output-format=nhwc 14 | 15 | echo "Blocked: nChw16c" 16 | # blocked 17 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o1024 -h21 -w256 -H21 -W256 -k1 
-K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=32 --pat-i=2 --blk-o=8 --flt-o=2 --flt-t=14 --execution-mode=0xa060 --pat-o=1 $COMMON 18 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o1024 -h4 -w256 -H4 -W256 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=32 --pat-i=2 --blk-o=1 --flt-o=2 --flt-t=14 --execution-mode=0xa060 --pat-o=1 $COMMON 19 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o3072 -h1 -w42 -H1 -W42 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=32 --pat-i=2 --blk-o=1 --flt-o=1 --flt-t=28 --execution-mode=0xa060 --pat-o=1 $COMMON 20 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o1024 -h1 -w42 -H1 -W42 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=32 --pat-i=2 --blk-o=1 --flt-o=1 --flt-t=28 --execution-mode=0xa060 --pat-o=1 $COMMON 21 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i1024 -o4096 -h1 -w42 -H1 -W42 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=32 --pat-i=2 --blk-o=1 --flt-o=1 --flt-t=28 --execution-mode=0xa060 --pat-o=1 $COMMON 22 | NSOCKETS=1 ./scripts/run.sh -c -n1 -i4096 -o1024 -h1 -w42 -H1 -W42 -k1 -K1 -p0 -P0 -s1 -S1 -b1 -adirect --blk-i=16 --pat-i=16 --blk-o=1 --flt-o=1 --flt-t=28 --execution-mode=0xa060 --pat-o=1 $COMMON 23 | -------------------------------------------------------------------------------- /src/elx_deconv_direct.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __ELX_DECONV_DIRECT_HPP__ 2 | #define __ELX_DECONV_DIRECT_HPP__ 3 | 4 | #include "euler.hpp" 5 | #include "el_def.hpp" 6 | #include "el_utils.hpp" 7 | #include "el_allocator.hpp" 8 | #include "elx_conv.hpp" 9 | #include "kernel/elk_conv_binder.hxx" 10 | 11 | namespace euler { 12 | 13 | #define Template_elx_deconv_direct_t \ 14 | template 15 | 16 | #define Instance_elx_deconv_direct_t \ 17 | elx_deconv_direct_t 18 | 19 | Template_elx_deconv_direct_t class elx_deconv_direct_t : public elx_conv_t { 20 | using InputType = typename UserTypes::InputType; 21 | using WeightsType = typename UserTypes::WeightsType; 22 | using OutputType = typename 
UserTypes::OutputType; 23 | using BiasType = typename UserTypes::BiasType; 24 | 25 | // t-buffer type 26 | using TinputType = typename TarrayTypes::InputType; 27 | using TweightsType = typename TarrayTypes::WeightsType; 28 | using ToutputType = typename TarrayTypes::OutputType; 29 | 30 | public: 31 | elx_deconv_direct_t(eld_conv_t &dc); 32 | virtual ~elx_deconv_direct_t(); 33 | 34 | virtual void execute(void *output, void *input, void *weights, void *bias); 35 | 36 | private: 37 | void __execute_a060(OutputType *output, InputType *input, 38 | WeightsType *weights, BiasType *bias); 39 | 40 | void trans_weights_to_compact(TweightsType *tweights, WeightsType *weights); 41 | inline void __trans_weights_post(WeightsType *aweights, TweightsType *tweights, 42 | int _g, int _O4, int _I4, int _O3, int _I3, int _kh, int _kw, int _O1, 43 | int _I2, int _iV, int _O); 44 | inline void __trans_weights_Or_post(WeightsType *aweights, TweightsType *tweights, 45 | int _g, int _O4, int _I4, int _O3, int _I3, int _kh, int _kw, int _O1, 46 | int _I2, int _iV, int _O); 47 | 48 | void conv_a060(OutputType *output, InputType *input, TweightsType *weights, 49 | BiasType *bias, int _I4, int _O4, int _ht, int _wt); 50 | 51 | void set_workspace_buffers(void *base); 52 | void set_scratch_buffers(void *base); 53 | int prepare_execute_opt(); 54 | void bind_execute_functions(); 55 | 56 | // TODO: optimize it 57 | conv_kernel_binder::kconv *ker_conv_; 58 | conv_kernel_binder::kconv *ker_conv_Tr_; 59 | 60 | void (elx_deconv_direct_t::*execute_opt_)( 61 | OutputType *, InputType *, WeightsType *, BiasType *); 62 | 63 | bool is_first_run_; 64 | bool inference_acc_; 65 | 66 | int tp_, bp_, lp_, rp_; 67 | size_t tweights_size_; 68 | TweightsType *tweights_; 69 | size_t toutput_size_; 70 | ToutputType *toutput_; 71 | unsigned int xopt_; 72 | int attr_; 73 | int mthr_; 74 | }; 75 | 76 | } // namespace euler 77 | #endif // __ELX_DECONV_DIRECT_HPP__ 78 | 
-------------------------------------------------------------------------------- /src/elx_int8_conv_direct_depthwise.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __ELX_CONV_DIRECT_DEPTHWISE_LP_HPP__ 2 | #define __ELX_CONV_DIRECT_DEPTHWISE_LP_HPP__ 3 | 4 | #include "euler.hpp" 5 | #include "el_def.hpp" 6 | #include "el_utils.hpp" 7 | #include "el_allocator.hpp" 8 | #include "elx_conv.hpp" 9 | #include "kernel/elk_u8s8_depthwise_conv_binder.hxx" 10 | 11 | namespace euler { 12 | 13 | #define Template_elx_int8_conv_direct_depthwise_t \ 14 | template 15 | 16 | #define Instance_elx_int8_conv_direct_depthwise_t \ 17 | elx_int8_conv_direct_depthwise_t 18 | 19 | Template_elx_int8_conv_direct_depthwise_t class elx_int8_conv_direct_depthwise_t : public elx_conv_t { 20 | using InputType = typename UserTypes::InputType; 21 | using WeightsType = typename UserTypes::WeightsType; 22 | using OutputType = typename UserTypes::OutputType; 23 | using BiasType = typename UserTypes::BiasType; 24 | 25 | // t-buffer type 26 | using TinputType = typename TarrayTypes::InputType; 27 | using TweightsType = typename TarrayTypes::WeightsType; 28 | using ToutputType = typename TarrayTypes::OutputType; 29 | static constexpr int KW = 4; 30 | 31 | public: 32 | elx_int8_conv_direct_depthwise_t(eld_conv_t &dc); 33 | virtual ~elx_int8_conv_direct_depthwise_t(); 34 | 35 | virtual void execute(void *output, void *input, void *weights, void *bias); 36 | 37 | private: 38 | void __execute_c160(OutputType *output, InputType *input, 39 | WeightsType *weights, BiasType *bias); 40 | 41 | void trans_weights_3x3(float *weights_scale, float * weights_shift, 42 | int8_t *weights_s8, WeightsType *weights, BiasType *bias); 43 | 44 | void conv_c160(OutputType *output, ToutputType *toutput, InputType *input, 45 | int8_t *tweights, BiasType *bias, float *src_scale, 46 | float *weights_scale, float *weights_shift, int _ht, int _wt); 47 | 48 | void 
set_scratch_buffers(void *base); 49 | void set_workspace_buffers(void *base); 50 | int prepare_execute_opt(); 51 | void bind_execute_functions(); 52 | void prepare_quant_calibration(eld_conv_t &); 53 | 54 | // TODO: optimize it 55 | u8s8_depthwise_conv_kernel_binder::kconv *ker_conv_; 56 | u8s8_depthwise_conv_kernel_binder::kconv *ker_conv_Tr_; 57 | 58 | void (elx_int8_conv_direct_depthwise_t::*execute_opt_)( 59 | OutputType *, InputType *, WeightsType *, BiasType *); 60 | 61 | bool is_first_run_; 62 | bool inference_acc_; 63 | 64 | size_t tweights_size_; 65 | TweightsType *tweights_; 66 | size_t tweights_s8_size_; 67 | size_t toutput_size_; 68 | size_t input_scale_size_; 69 | size_t weights_scale_size_; 70 | size_t weights_shift_size_; 71 | ToutputType *toutput_; 72 | float *input_scale_; 73 | float *weights_scale_; 74 | float *weights_shift_; 75 | int8_t *tweights_s8_; 76 | unsigned int xopt_; 77 | int attr_; 78 | int mthr_; 79 | }; 80 | 81 | } // namespace euler 82 | #endif // __ELX_CONV_DIRECT_DEPTHWISE_LP_HPP__ 83 | -------------------------------------------------------------------------------- /src/elx_int8_conv_direct_depthwise_xopt.hpp: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "elx_int8_conv_direct_depthwise.hpp" 4 | #include "el_parallel.hpp" 5 | 6 | // XOPT 7 | // 8 | // fusion: same as winograd 9 | // dup: same as winograd 10 | // ------+-----+--------+-----+------------------------------------------------ 11 | // | ker | fusion | dup | notes 12 | // ------+-----+--------+-----+------------------------------------------------ 13 | // c160 |conv | t+o | - | blocked/nhwc, Tr, K=3 S=1,2 14 | // ------+-----+--------+-----+------------------------------------------------ 15 | // 16 | namespace euler { 17 | 18 | Template_elx_int8_conv_direct_depthwise_t 19 | void Instance_elx_int8_conv_direct_depthwise_t::__execute_c160( 20 | OutputType *output, InputType *input, WeightsType *weights, 
BiasType *bias) 21 | { 22 | if (is_first_run_) { 23 | setup_workspace([&]() { 24 | trans_weights_3x3( 25 | weights_scale_, weights_shift_, tweights_s8_, weights, bias); 26 | if (ep.sampling_kind == CALIBRATED) { 27 | MD2(float, atinput_scale, input_scale_, 2, ep.T); 28 | iter_each(_T, ep.T) { 29 | md2(atinput_scale, 0, _T) = ep.input_quant_S; 30 | md2(atinput_scale, 1, _T) = ep.input_quant_z; 31 | } 32 | } 33 | }); 34 | } 35 | 36 | estl::parallel_for<4>([&](int _n, int _G3, int _ht, int _wt) { 37 | MD3(int8_t, atweights_s8, tweights_s8_, ep.G3, ep.G2, ep.kh * KW * V); 38 | MD3(BiasType, abias, bias, ep.G3, ep.G2, V); 39 | MD3(float, atweights_scale, weights_scale_, ep.G3, ep.G2, V); 40 | MD3(float, aweights_shift, weights_shift_, ep.G3, ep.G2, V); 41 | // blocked input 42 | MD4(InputType, ainput_blocked, input, ep.n, ep.G3, ep.G2, ep.ih * ep.iw * V); 43 | // blocked output 44 | MD5(OutputType, aoutput0_blocked, output, 45 | ep.n, ep.G3, ep.G2, ep.ht, ep.ow * V); 46 | MD3(OutputType, aoutput1_blocked, 47 | &md5(aoutput0_blocked, _n, _G3, 0, _ht, 0), ep.wt, ep.T, V); 48 | // blocked toutput 49 | MD5(ToutputType, atoutput0_blocked, toutput_, 50 | ep.n, ep.G3, ep.G2, ep.ht, ep.ow * V); 51 | MD3(ToutputType, atoutput1_blocked, 52 | &md5(atoutput0_blocked, _n, _G3, 0, _ht, 0), ep.wt, ep.T, V); 53 | 54 | auto ainput = &md4(ainput_blocked, _n, _G3, 0, 0); 55 | auto aoutput = &md3(aoutput1_blocked, _wt, 0, 0); 56 | auto atoutput = &md3(atoutput1_blocked, _wt, 0, 0); 57 | conv_c160(aoutput, atoutput, ainput, 58 | &md3(atweights_s8, _G3, 0, 0), 59 | &md3(abias, _G3, 0, 0), input_scale_, 60 | &md3(atweights_scale, _G3, 0, 0), 61 | &md3(aweights_shift, _G3, 0, 0), _ht, _wt); 62 | }, ep.n, ep.G3, ep.ht, ep.wt); 63 | 64 | if (is_first_run_ && inference_acc_) 65 | is_first_run_ = false; 66 | } 67 | 68 | Template_elx_int8_conv_direct_depthwise_t 69 | void Instance_elx_int8_conv_direct_depthwise_t::execute( 70 | void *output, void *input, void *weights, void *bias) 71 | { 72 | 
(this->*execute_opt_)((OutputType *)output, 73 | (InputType *)input, (WeightsType *)weights, (BiasType *)bias); 74 | } 75 | 76 | } // namespace euler 77 | -------------------------------------------------------------------------------- /cmake/flags.cmake: -------------------------------------------------------------------------------- 1 | if (__flags_included) 2 | return () 3 | endif() 4 | 5 | set(__basic_flags "-Wall -Wextra -Wshadow") 6 | list(APPEND __basic_flags "-fopenmp") 7 | list(APPEND __basic_flags "-Wno-sign-compare") 8 | list(APPEND __basic_flags "-Wno-uninitialized") 9 | list(APPEND __basic_flags "-Wno-unused-variable") 10 | list(APPEND __basic_flags "-Wno-unused-parameter") 11 | list(APPEND __basic_flags "-DEULER_VERSION=${EULER_VERSION}") 12 | 13 | if (CMAKE_CXX_COMPILER MATCHES "icpx") 14 | # for aligned new 15 | set(__cxx_flags "-std=c++17") 16 | else() 17 | set(__cxx_flags "-std=c++11") 18 | endif() 19 | 20 | if (__debug) 21 | set(__opt_flags "-O0 -g -DDEBUG") 22 | elseif (__profiling) 23 | set(__opt_flags "-O2 -g -DNDEBUG") 24 | else () 25 | set(__opt_flags "-O2 -DNDEBUG") 26 | endif () 27 | 28 | if (ENABLE_USER_FP16) 29 | list(APPEND __opt_flags "-DENABLE_USER_FP16") 30 | endif() 31 | 32 | if (WITH_GK) 33 | list(APPEND __opt_flags "-DWITH_GK") 34 | endif() 35 | 36 | if (WITH_VNNI) 37 | list(APPEND __opt_flags "-DWITH_VNNI") 38 | endif() 39 | if (WITH_DPBF16) 40 | list(APPEND __opt_flags "-DWITH_DPBF16") 41 | endif() 42 | 43 | if(MT_RUNTIME STREQUAL "omp") 44 | list(APPEND __opt_flags "-DMT_RUNTIME=MT_RUNTIME_OMP") 45 | elseif(MT_RUNTIME STREQUAL "tbb") 46 | list(APPEND __opt_flags "-DMT_RUNTIME=MT_RUNTIME_TBB") 47 | else() 48 | MESSAGE(FATAL_ERROR "MT_RUNTIME=" ${MT_RUNTIME} " is not supported. 
omp|tbb") 49 | endif() 50 | 51 | MESSAGE("-- MT_RUNTIME: " ${MT_RUNTIME}) 52 | 53 | if (CMAKE_CXX_COMPILER MATCHES "icpc") 54 | list(APPEND __opt_flags "-xHost") 55 | # list(APPEND __opt_flags "-qopt-report=5") 56 | list(APPEND __opt_flags "-qopt-zmm-usage=high") 57 | list(APPEND __opt_flags "-no-inline-max-size") 58 | list(APPEND __opt_flags "-no-inline-max-total-size") 59 | list(APPEND __opt_flags "-wd15335") # disable remark #15335: was not vectorized 60 | elseif(CMAKE_CXX_COMPILER MATCHES "icpx") 61 | list(APPEND __opt_flags "-xHost") 62 | list(APPEND __basic_flags "-Wno-pragmas") 63 | list(APPEND __basic_flags "-Wno-pass-failed") 64 | list(APPEND __basic_flags "-Wno-missing-braces") 65 | if (WITH_VNNI) 66 | list(APPEND __opt_flags "-mavx512vnni") 67 | endif() 68 | list(APPEND __opt_flags "-mavx512f") 69 | list(APPEND __opt_flags "-mavx512dq") 70 | list(APPEND __opt_flags "-mavx512bw") 71 | list(APPEND __opt_flags "-mfma -mavx512vl") 72 | elseif(CMAKE_CXX_COMPILER MATCHES "clang") 73 | list(APPEND __opt_flags "-mavx512f") 74 | list(APPEND __opt_flags "-mavx512dq") 75 | list(APPEND __opt_flags "-mavx512bw") 76 | list(APPEND __opt_flags "-mfma -mavx512vl") 77 | else() 78 | list(APPEND __basic_flags "-Wno-unused-result") 79 | list(APPEND __basic_flags "-Wno-unused-but-set-variable") 80 | list(APPEND __basic_flags "-Wno-misleading-indentation") 81 | list(APPEND __basic_flags "-Wno-unknown-pragmas") 82 | list(APPEND __basic_flags "-Wno-implicit-fallthrough") 83 | list(APPEND __opt_flags "-mavx512f") 84 | list(APPEND __opt_flags "-mavx512dq") 85 | list(APPEND __opt_flags "-mavx512bw") 86 | list(APPEND __opt_flags "-mfma -mavx512vl") 87 | endif() 88 | 89 | add_definitions(${__basic_flags} ${__cxx_flags} ${__opt_flags}) 90 | -------------------------------------------------------------------------------- /src/kernel/elk_u8s8_depthwise_conv_binder.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if 
!defined(BUILD_OTJ_TBL) 4 | #define DECL_U8S8_DEPTHWISE_KCONV_TBL(type, otype, V, Vx, I, S, F) \ 5 | static kconv \ 6 | *kconv_##type##_##otype##_##V##_##Vx##_##I##_##S##_##F[1][32][1] 7 | #else 8 | #define DECL_U8S8_DEPTHWISE_KCONV_TBL(type, otype, V, Vx, I, S, F) \ 9 | __u8s8_depthwise_kconv_generate_inst__ u8s8_depthwise_conv type otype V Vx I S F 10 | #endif 11 | 12 | #define LOOKUP_U8S8_DEPTHWISE_KCONV_TBL(type, otype, V, Vx, I, S, F, O, T, K) \ 13 | kconv_##type##_##otype##_##V##_##Vx##_##I##_##S##_##F[O - 1][T - 1][0] 14 | 15 | #if !defined(BUILD_OTJ_TBL) 16 | #include "el_def.hpp" 17 | #include "src/kernel/elk_def.hpp" 18 | #include "src/kernel/elk_u8s8_depthwise_conv.hxx" 19 | 20 | namespace euler { 21 | 22 | struct u8s8_depthwise_conv_kernel_binder { 23 | template 25 | using conv_ker_cls = typename euler::u8s8_depthwise_conv_kernel< 26 | GarrayTypes, RoutputType, V, Vx, I, estl::integer_sequence>; 27 | 28 | template 29 | using kconv = decltype( 30 | conv_ker_cls::conv); 31 | 32 | #endif // BUILD_OTJ_TBL 33 | 34 | DECL_U8S8_DEPTHWISE_KCONV_TBL(INT8_F32, int8_t, 16, 4, ISA_AVX512, 1, GKF_DCD); 35 | DECL_U8S8_DEPTHWISE_KCONV_TBL(INT8_F32, int8_t, 16, 4, ISA_AVX512, 2, GKF_DCD); 36 | DECL_U8S8_DEPTHWISE_KCONV_TBL(INT8_F32, uint8_t, 16, 4, ISA_AVX512, 1, GKF_DCD); 37 | DECL_U8S8_DEPTHWISE_KCONV_TBL(INT8_F32, uint8_t, 16, 4, ISA_AVX512, 2, GKF_DCD); 38 | 39 | #if !defined(BUILD_OTJ_TBL) 40 | 41 | # define DEF_DEPTHWISE_CONV_BIND_INT8_F32(otype) \ 42 | template \ 43 | static inline void bind(int O, int T, \ 44 | kconv **func) { \ 45 | switch (F) { \ 46 | case GKF_DCD: \ 47 | if (S == 1) \ 48 | *func = LOOKUP_U8S8_DEPTHWISE_KCONV_TBL( \ 49 | INT8_F32, otype, 16, 4, ISA_AVX512, 1, GKF_DCD, O, T, K); \ 50 | else if (S == 2) \ 51 | *func = LOOKUP_U8S8_DEPTHWISE_KCONV_TBL( \ 52 | INT8_F32, otype, 16, 4, ISA_AVX512, 2, GKF_DCD, O, T, K); \ 53 | break; \ 54 | default: \ 55 | el_error("Unimplemented conv kernel format"); \ 56 | break; \ 57 | } \ 58 | } 59 | 60 | 
DEF_DEPTHWISE_CONV_BIND_INT8_F32(int8_t) 61 | DEF_DEPTHWISE_CONV_BIND_INT8_F32(uint8_t) 62 | #endif // BUILD_OTJ_TBL 63 | }; 64 | 65 | } // namespace euler 66 | -------------------------------------------------------------------------------- /src/kernel/elk_gemm_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build gemm kernel instantiation 4 | # 5 | 6 | src_file=$1; dst_dir=$2; cc=$3; enable_user_fp16=$4 7 | 8 | if [ ! -f "$src_file" ] || [ ! -d "$dst_dir" ]; then 9 | echo "Invalid src_file=$src_file or dst_dir=$dst_dir" >&2 10 | exit 1 # report the bad arguments on stderr and fail with a portable status 11 | fi 12 | 13 | __kgemm_generate_inst__() { 14 | ktype=$1; dtype=$2; V=$3; Vx=$4; I=$5; S=$6; F=$7; 15 | 16 | cat <<@ > $dst_dir/elk_${ktype}_${dtype}_${V}_${Vx}_${I}_${S}_${F}.cpp 17 | // _generated_kgemm_file_ 18 | // 19 | #include "$src_file" 20 | 21 | using namespace euler; 22 | 23 | namespace euler { 24 | 25 | #undef E 26 | #define E(O, T) \\ 27 | ${ktype}_kernel_binder::gemm_ker_cls::gemm 29 | ${ktype}_kernel_binder::kgemm 30 | *${ktype}_kernel_binder::kgemm_${dtype}_${V}_${Vx}_${I}_${S}_${F}[8][32] = 31 | { // 8 32 | { // 32 33 | E(1, 1), 34 | E(1, 2), 35 | E(1, 3), 36 | E(1, 4), 37 | E(1, 5), 38 | E(1, 6), 39 | E(1, 7), 40 | E(1, 8), 41 | E(1, 9), 42 | E(1, 10), 43 | E(1, 11), 44 | E(1, 12), 45 | E(1, 13), 46 | E(1, 14), 47 | E(1, 15), 48 | E(1, 16), 49 | E(1, 17), 50 | E(1, 18), 51 | E(1, 19), 52 | E(1, 20), 53 | E(1, 21), 54 | E(1, 22), 55 | E(1, 23), 56 | E(1, 24), 57 | E(1, 25), 58 | E(1, 26), 59 | E(1, 27), 60 | E(1, 28), 61 | E(1, 29), 62 | E(1, 30), 63 | E(1, 31), 64 | }, 65 | { // 32 66 | E(2, 1), 67 | E(2, 2), 68 | E(2, 3), 69 | E(2, 4), 70 | E(2, 5), 71 | E(2, 6), 72 | E(2, 7), 73 | E(2, 8), 74 | E(2, 9), 75 | E(2, 10), 76 | E(2, 11), 77 | E(2, 12), 78 | E(2, 13), 79 | E(2, 14), 80 | }, 81 | { // 32 82 | E(3, 1), 83 | E(3, 2), 84 | E(3, 3), 85 | E(3, 4), 86 | E(3, 5), 87 | E(3, 6), 88 | E(3, 7), 89 | E(3, 8), 90 | E(3, 9), 91 | E(3, 10), 92 | E(3, 11), 93 | 
E(3, 12), 94 | E(3, 13), 95 | E(3, 14), 96 | }, 97 | { // 32 98 | E(4, 1), 99 | E(4, 2), 100 | E(4, 3), 101 | E(4, 4), 102 | E(4, 5), 103 | E(4, 6), 104 | E(4, 7), 105 | E(4, 8), 106 | E(4, 9), 107 | E(4, 10), 108 | E(4, 11), 109 | E(4, 12), 110 | E(4, 13), 111 | E(4, 14), 112 | }, 113 | { // 32 114 | E(5, 1), 115 | E(5, 2), 116 | E(5, 3), 117 | E(5, 4), 118 | E(5, 5), 119 | }, 120 | { // 32 121 | E(6, 1), 122 | E(6, 2), 123 | E(6, 3), 124 | E(6, 4), 125 | }, 126 | { // 32 127 | E(7, 1), 128 | E(7, 2), 129 | E(7, 3), 130 | }, 131 | { // 32 132 | E(8, 1), 133 | E(8, 2), 134 | E(8, 3), 135 | E(8, 4), 136 | E(8, 5), 137 | E(8, 6), 138 | E(8, 7), 139 | E(8, 8), 140 | }, 141 | }; 142 | 143 | } // namespace 144 | @ 145 | } 146 | 147 | if [ $enable_user_fp16 == "ON" ]; then 148 | eval $($cc -DENABLE_USER_FP16 -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 149 | else 150 | eval $($cc -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 151 | fi 152 | -------------------------------------------------------------------------------- /src/kernel/elk_u8s8_gemm_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build gemm kernel instantiation 4 | # 5 | 6 | src_file=$1; dst_dir=$2; cc=$3; enable_user_fp16=$4 7 | 8 | if [ ! -f $src_file ] || [ ! 
-d "$dst_dir" ]; then 9 | echo "Invalid src_file=$src_file or dst_dir=$dst_dir" >&2 10 | exit 1 # report the bad arguments on stderr and fail with a portable status 11 | fi 12 | 13 | __u8s8_kgemm_generate_inst__() { 14 | ktype=$1; dtype=$2; otype=$3; V=$4; Vx=$5; I=$6; S=$7; F=$8; 15 | 16 | cat <<@ > $dst_dir/elk_${ktype}_${dtype}_${otype}_${V}_${Vx}_${I}_${S}_${F}.cpp 17 | // _generated_kgemm_file_ 18 | // 19 | #include "$src_file" 20 | 21 | using namespace euler; 22 | 23 | namespace euler { 24 | 25 | #undef E 26 | #define E(O, T) \\ 27 | ${ktype}_kernel_binder::gemm_ker_cls::gemm 29 | ${ktype}_kernel_binder::kgemm 30 | *${ktype}_kernel_binder::kgemm_${dtype}_${otype}_${V}_${Vx}_${I}_${S}_${F}[8][32] = 31 | { // 8 32 | { // 32 33 | E(1, 1), 34 | E(1, 2), 35 | E(1, 3), 36 | E(1, 4), 37 | E(1, 5), 38 | E(1, 6), 39 | E(1, 7), 40 | E(1, 8), 41 | E(1, 9), 42 | E(1, 10), 43 | E(1, 11), 44 | E(1, 12), 45 | E(1, 13), 46 | E(1, 14), 47 | E(1, 15), 48 | E(1, 16), 49 | E(1, 17), 50 | E(1, 18), 51 | E(1, 19), 52 | E(1, 20), 53 | E(1, 21), 54 | E(1, 22), 55 | E(1, 23), 56 | E(1, 24), 57 | E(1, 25), 58 | E(1, 26), 59 | E(1, 27), 60 | E(1, 28), 61 | E(1, 29), 62 | E(1, 30), 63 | E(1, 31), 64 | }, 65 | { // 32 66 | E(2, 1), 67 | E(2, 2), 68 | E(2, 3), 69 | E(2, 4), 70 | E(2, 5), 71 | E(2, 6), 72 | E(2, 7), 73 | E(2, 8), 74 | E(2, 9), 75 | E(2, 10), 76 | E(2, 11), 77 | E(2, 12), 78 | E(2, 13), 79 | E(2, 14), 80 | }, 81 | { // 32 82 | E(3, 1), 83 | E(3, 2), 84 | E(3, 3), 85 | E(3, 4), 86 | E(3, 5), 87 | E(3, 6), 88 | E(3, 7), 89 | E(3, 8), 90 | E(3, 9), 91 | E(3, 10), 92 | E(3, 11), 93 | E(3, 12), 94 | E(3, 13), 95 | E(3, 14), 96 | }, 97 | { // 32 98 | E(4, 1), 99 | E(4, 2), 100 | E(4, 3), 101 | E(4, 4), 102 | E(4, 5), 103 | E(4, 6), 104 | E(4, 7), 105 | E(4, 8), 106 | E(4, 9), 107 | E(4, 10), 108 | E(4, 11), 109 | E(4, 12), 110 | E(4, 13), 111 | E(4, 14), 112 | }, 113 | { // 32 114 | E(5, 1), 115 | E(5, 2), 116 | E(5, 3), 117 | E(5, 4), 118 | E(5, 5), 119 | }, 120 | { // 32 121 | E(6, 1), 122 | E(6, 2), 123 | E(6, 3), 124 | E(6, 4), 125 | }, 126 | { // 32 
127 | E(7, 1), 128 | E(7, 2), 129 | E(7, 3), 130 | }, 131 | { // 32 132 | E(8, 1), 133 | E(8, 2), 134 | E(8, 3), 135 | E(8, 4), 136 | E(8, 5), 137 | E(8, 6), 138 | E(8, 7), 139 | E(8, 8), 140 | }, 141 | }; 142 | 143 | } // namespace 144 | @ 145 | } 146 | 147 | if [ $enable_user_fp16 == "ON" ]; then 148 | eval $($cc -DENABLE_USER_FP16 -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 149 | else 150 | eval $($cc -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 151 | fi 152 | -------------------------------------------------------------------------------- /src/elx_conv_direct.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __ELX_CONV_DIRECT_HPP__ 2 | #define __ELX_CONV_DIRECT_HPP__ 3 | 4 | #include "euler.hpp" 5 | #include "el_def.hpp" 6 | #include "el_utils.hpp" 7 | #include "el_allocator.hpp" 8 | #include "elx_conv.hpp" 9 | #include "kernel/elk_gemm_binder.hxx" 10 | #include "kernel/elk_conv_binder.hxx" 11 | 12 | namespace euler { 13 | 14 | #define Template_elx_conv_direct_t \ 15 | template 16 | 17 | #define Instance_elx_conv_direct_t \ 18 | elx_conv_direct_t 19 | 20 | Template_elx_conv_direct_t class elx_conv_direct_t : public elx_conv_t { 21 | using InputType = typename UserTypes::InputType; 22 | using WeightsType = typename UserTypes::WeightsType; 23 | using OutputType = typename UserTypes::OutputType; 24 | using BiasType = typename UserTypes::BiasType; 25 | 26 | // t-buffer type 27 | using TinputType = typename TarrayTypes::InputType; 28 | using TweightsType = typename TarrayTypes::WeightsType; 29 | using ToutputType = typename TarrayTypes::OutputType; 30 | 31 | public: 32 | elx_conv_direct_t(eld_conv_t &dc); 33 | virtual ~elx_conv_direct_t(); 34 | 35 | virtual void execute(void *output, void *input, void *weights, void *bias); 36 | 37 | private: 38 | void __execute_c060(OutputType *output, InputType *input, 39 | WeightsType *weights, BiasType *bias); 40 | void __execute_c070(OutputType *output, 
InputType *input, 41 | WeightsType *weights, BiasType *bias); 42 | void __execute_a060(OutputType *output, InputType *input, 43 | WeightsType *weights, BiasType *bias); 44 | 45 | void trans_weights_to_compact(TweightsType *tweights, WeightsType *weights); 46 | inline void __trans_weights_post(WeightsType *aweights, TweightsType *tweights, 47 | int _g, int _O4, int _I4, int _O3, int _I3, int _kh, int _kw, int _O1, 48 | int _I2, int _iV, int _O); 49 | inline void __trans_weights_Or_post(WeightsType *aweights, TweightsType *tweights, 50 | int _g, int _O4, int _I4, int _O3, int _I3, int _kh, int _kw, int _O1, 51 | int _I2, int _iV, int _O); 52 | 53 | void conv_c060(OutputType *output, InputType *input, TweightsType *weights, 54 | BiasType *bias, int _I4, int _O4, int _ht, int _wt); 55 | void conv_c070(OutputType *output, InputType *input, TweightsType *weights, 56 | BiasType *bias, int _I4, int _I3, int _O4, int _ht, int _wt); 57 | void gemm_a060(OutputType *toutput, InputType *tinput, TweightsType *tweights, 58 | BiasType *bias, int _I4, int _O4, int _ht, int _wt); 59 | 60 | void set_workspace_buffers(void *base); 61 | void set_scratch_buffers(void *base); 62 | int prepare_execute_opt(); 63 | void bind_execute_functions(); 64 | 65 | // TODO: optimize it 66 | gemm_kernel_binder::kgemm *ker_gemm_[128][8]; 67 | conv_kernel_binder::kconv *ker_conv_; 68 | conv_kernel_binder::kconv *ker_conv_Tr_; 69 | 70 | void (elx_conv_direct_t::*execute_opt_)( 71 | OutputType *, InputType *, WeightsType *, BiasType *); 72 | 73 | bool is_first_run_; 74 | bool inference_acc_; 75 | 76 | size_t tweights_size_; 77 | TweightsType *tweights_; 78 | size_t toutput_size_; 79 | ToutputType *toutput_; 80 | unsigned int xopt_; 81 | int attr_; 82 | int mthr_; 83 | }; 84 | 85 | } // namespace euler 86 | #endif // __ELX_CONV_DIRECT_HPP__ 87 | -------------------------------------------------------------------------------- /src/kernel/elk_vmg_conv_gen.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Build conv kernel instantiation 4 | # 5 | 6 | src_file=$1; dst_dir=$2; cc=$3; enable_user_fp16=$4 7 | 8 | if [ ! -f "$src_file" ] || [ ! -d "$dst_dir" ]; then 9 | echo "Invalid src_file=$src_file or dst_dir=$dst_dir" >&2 10 | exit 1 # report the bad arguments on stderr and fail with a portable status 11 | fi 12 | 13 | __vmg_kconv_generate_inst__() { 14 | ktype=$1; dtype=$2; V=$3; Vx=$4; I=$5; S=$6; F=$7 15 | 16 | cat <<@ > $dst_dir/elk_${ktype}_${dtype}_${V}_${Vx}_${I}_${S}_${F}.cpp 17 | // _generated_kernel_file_ 18 | // 19 | #include "$src_file" 20 | 21 | using namespace euler; 22 | 23 | namespace euler { 24 | 25 | #undef E 26 | #define E(O, T, K) \\ 27 | { \\ 28 | ${ktype}_kernel_binder::conv_ker_cls::conv, \\ 29 | ${ktype}_kernel_binder::conv_ker_cls::conv, \\ 30 | ${ktype}_kernel_binder::conv_ker_cls::conv, \\ 31 | ${ktype}_kernel_binder::conv_ker_cls::conv, \\ 32 | ${ktype}_kernel_binder::conv_ker_cls::conv \\ 33 | } 34 | ${ktype}_kernel_binder::kconv 35 | *${ktype}_kernel_binder::kconv_${dtype}_${V}_${Vx}_${I}_${S}_${F}[1][32][3][5] = 36 | { // 1 37 | { // 32 38 | { E(1, 1, 3), /* E(1, 1, 5), E(1, 1, 7) */ }, 39 | { E(1, 2, 3), /* E(1, 2, 5), E(1, 2, 7) */ }, 40 | { E(1, 3, 3), /* E(1, 3, 5), E(1, 3, 7) */ }, 41 | { E(1, 4, 3), /* E(1, 4, 5), E(1, 4, 7) */ }, 42 | { E(1, 5, 3), /* E(1, 5, 5), E(1, 5, 7) */ }, 43 | { E(1, 6, 3), /* E(1, 6, 5), E(1, 6, 7) */ }, 44 | { E(1, 7, 3), /* E(1, 7, 5), E(1, 7, 7) */ }, 45 | { E(1, 8, 3), /* E(1, 8, 5), E(1, 8, 7) */ }, 46 | { E(1, 9, 3), /* E(1, 9, 5), E(1, 9, 7) */ }, 47 | { E(1, 10, 3), /* E(1, 10, 5), E(1, 10, 7) */ }, 48 | { E(1, 11, 3), /* E(1, 11, 5), E(1, 11, 7) */ }, 49 | { E(1, 12, 3), /* E(1, 12, 5), E(1, 12, 7) */ }, 50 | { E(1, 13, 3), /* E(1, 13, 5), E(1, 13, 7) */ }, 51 | { E(1, 14, 3), /* E(1, 14, 5), E(1, 14, 7) */ }, 52 | #if 0 53 | { E(1, 15, 3), /* E(1, 15, 5), E(1, 15, 7) */ }, 54 | { E(1, 16, 3), /* E(1, 16, 5), E(1, 16, 7) */ }, 55 | { E(1, 17, 3), /* E(1, 17, 5), E(1, 
17, 7) */ }, 56 | { E(1, 18, 3), /* E(1, 18, 5), E(1, 18, 7) */ }, 57 | { E(1, 19, 3), /* E(1, 19, 5), E(1, 19, 7) */ }, 58 | { E(1, 20, 3), /* E(1, 20, 5), E(1, 20, 7) */ }, 59 | { E(1, 21, 3), /* E(1, 21, 5), E(1, 21, 7) */ }, 60 | { E(1, 22, 3), /* E(1, 22, 5), E(1, 22, 7) */ }, 61 | { E(1, 23, 3), /* E(1, 23, 5), E(1, 23, 7) */ }, 62 | { E(1, 24, 3), /* E(1, 24, 5), E(1, 24, 7) */ }, 63 | { E(1, 25, 3), /* E(1, 25, 5), E(1, 25, 7) */ }, 64 | { E(1, 26, 3), /* E(1, 26, 5), E(1, 26, 7) */ }, 65 | { E(1, 27, 3), /* E(1, 27, 5), E(1, 27, 7) */ }, 66 | { E(1, 28, 3), /* E(1, 28, 5), E(1, 28, 7) */ }, 67 | { E(1, 29, 3), /* E(1, 29, 5), E(1, 29, 7) */ }, 68 | { E(1, 30, 3), /* E(1, 30, 5), E(1, 30, 7) */ }, 69 | { E(1, 31, 3), /* E(1, 31, 5), E(1, 31, 7) */ }, 70 | #endif 71 | }, 72 | }; 73 | 74 | } // namespace 75 | @ 76 | } 77 | 78 | if [ $enable_user_fp16 == "ON" ]; then 79 | eval $($cc -DENABLE_USER_FP16 -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 80 | else 81 | eval $($cc -DBUILD_OTJ_TBL -E $src_file 2>&1 | grep _generate_inst_) 82 | fi 83 | -------------------------------------------------------------------------------- /src/kernel/elk_vmg_conv_binder.hxx: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #if !defined(BUILD_OTJ_TBL) 4 | #define DECL_VMG_KCONV_TBL(type, V, Vx, I, S, F) \ 5 | static kconv \ 6 | *kconv_##type##_##V##_##Vx##_##I##_##S##_##F[1][32][3][5] 7 | #else 8 | #define DECL_VMG_KCONV_TBL(type, V, Vx, I, S, F) \ 9 | __vmg_kconv_generate_inst__ vmg_conv type V Vx I S F 10 | #endif 11 | 12 | #define _LOG2(x) (x) == 16 ? 4 : (x) == 8 ? 3 : (x) == 4 ? 2 : (x) == 2 ? 
1 : 0 13 | #define LOOKUP_VMG_KCONV_TBL(type, V, Vx, I, S, F, O, T, K, G) \ 14 | kconv_##type##_##V##_##Vx##_##I##_##S##_##F[O - 1][T - 1][K/2-1][_LOG2(G)] 15 | 16 | #if !defined(BUILD_OTJ_TBL) 17 | #include "src/kernel/elk_def.hpp" 18 | #include "src/kernel/elk_vmg_conv.hxx" 19 | 20 | namespace euler { 21 | 22 | struct vmg_conv_kernel_binder { 23 | template 24 | using conv_ker_cls = typename euler::vmg_conv_kernel>; 26 | 27 | template 28 | using kconv 29 | = decltype(conv_ker_cls::conv); 30 | 31 | #endif // BUILD_OTJ_TBL 32 | 33 | DECL_VMG_KCONV_TBL(FP32, 16, 1, ISA_AVX512, 1, GKF_DCD); // direct, blocked 34 | DECL_VMG_KCONV_TBL(FP32, 16, 1, ISA_AVX512, 1, GKF_FCF); // direct, nhwc 35 | DECL_VMG_KCONV_TBL(FP32_F16w, 16, 1, ISA_AVX512, 1, GKF_DCD); // direct, blocked, f16c 36 | DECL_VMG_KCONV_TBL(FP32_F16w, 16, 1, ISA_AVX512, 1, GKF_FCF); // direct, nhwc input, f16c 37 | //DECL_VMG_KCONV_TBL(FP32_F16o, 16, 1, ISA_AVX512, 1, GKF_DCD); // direct, f16c 38 | 39 | #ifdef ENABLE_USER_FP16 40 | DECL_VMG_KCONV_TBL(FP32_F16o, 16, 1, ISA_AVX512, 1, GKF_EBD); // direct, nchw input, f16c 41 | #endif 42 | 43 | #if !defined(BUILD_OTJ_TBL) 44 | 45 | #ifdef ENABLE_USER_FP16 46 | template 47 | static inline void bind(int O, int T, kconv **func) 48 | { 49 | switch (F) { 50 | //case GKF_DCD: 51 | // if (S == 1) 52 | // *func = LOOKUP_VMG_KCONV_TBL(FP32_F16o, 16, 1, ISA_AVX512, 1, GKF_DCD, O, T, K, G); 53 | // break; 54 | case GKF_EBD: 55 | if (S == 1) 56 | *func = LOOKUP_VMG_KCONV_TBL(FP32_F16o, 16, 1, ISA_AVX512, 1, GKF_EBD, O, T, K, G); 57 | break; 58 | default: 59 | break; 60 | } 61 | } 62 | #endif 63 | 64 | template 65 | static inline void bind(int O, int T, kconv **func) 66 | { 67 | switch (F) { 68 | case GKF_DCD: 69 | if (S == 1) 70 | *func = LOOKUP_VMG_KCONV_TBL(FP32, 16, 1, ISA_AVX512, 1, GKF_DCD, O, T, K, G); 71 | break; 72 | case GKF_FCF: 73 | if (S == 1) 74 | *func = LOOKUP_VMG_KCONV_TBL(FP32, 16, 1, ISA_AVX512, 1, GKF_FCF, O, T, K, G); 75 | break; 76 | default: 77 | 
break; 78 | } 79 | } 80 | 81 | template 82 | static inline void bind(int O, int T, kconv **func) 83 | { 84 | switch (F) { 85 | case GKF_DCD: 86 | if (S == 1) 87 | *func = LOOKUP_VMG_KCONV_TBL(FP32_F16w, 16, 1, ISA_AVX512, 1, GKF_DCD, O, T, K, G); 88 | break; 89 | case GKF_FCF: 90 | if (S == 1) 91 | *func = LOOKUP_VMG_KCONV_TBL(FP32_F16w, 16, 1, ISA_AVX512, 1, GKF_FCF, O, T, K, G); 92 | break; 93 | default: 94 | break; 95 | } 96 | } 97 | 98 | #endif // BUILD_OTJ_TBL 99 | }; 100 | 101 | } // namespace euler 102 | -------------------------------------------------------------------------------- /src/elx_int8_conv_direct_1x1.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __ELX_CONV_DIRECT_1X1_LP_HPP__ 2 | #define __ELX_CONV_DIRECT_1X1_LP_HPP__ 3 | 4 | #include "euler.hpp" 5 | #include "el_def.hpp" 6 | #include "el_utils.hpp" 7 | #include "el_allocator.hpp" 8 | #include "elx_conv.hpp" 9 | #include "kernel/elk_u8s8_gemm_binder.hxx" 10 | 11 | namespace euler { 12 | 13 | #define Template_elx_int8_conv_direct_1x1_t \ 14 | template 15 | 16 | #define Instance_elx_int8_conv_direct_1x1_t \ 17 | elx_int8_conv_direct_1x1_t 18 | 19 | Template_elx_int8_conv_direct_1x1_t 20 | class elx_int8_conv_direct_1x1_t : public elx_conv_t { 21 | public: 22 | // Configurable parameters 23 | using InputType = typename UserTypes::InputType; 24 | using WeightsType = typename UserTypes::WeightsType; 25 | using OutputType = typename UserTypes::OutputType; 26 | using BiasType = typename UserTypes::BiasType; 27 | 28 | // t-buffer type 29 | using TinputType = typename TarrayTypes::InputType; 30 | using TweightsType = typename TarrayTypes::WeightsType; 31 | using ToutputType = typename TarrayTypes::OutputType; 32 | 33 | public: 34 | elx_int8_conv_direct_1x1_t(eld_conv_t &dc); 35 | virtual ~elx_int8_conv_direct_1x1_t(); 36 | 37 | virtual void execute(void *, void *, void *, void *); 38 | 39 | private: 40 | void __execute_a160(OutputType *, InputType *, 
WeightsType *, BiasType *); 41 | void __execute_a160_s1(OutputType *, InputType *, WeightsType *, BiasType *); 42 | void __execute_a160_s2(OutputType *, InputType *, WeightsType *, BiasType *); 43 | 44 | inline void trans_weights_s8_blocked_oc(float *, int8_t *, WeightsType *, BiasType *); 45 | 46 | void gemm_a160_s1(ToutputType *, OutputType *, uint8_t *, int8_t *, 47 | float *, float *, BiasType *, int, int, int); 48 | void gemm_a160_s2(ToutputType *, OutputType *, uint8_t *, int8_t *, 49 | float *, float *, BiasType *, int); 50 | 51 | void prepare_quant_calibration(eld_conv_t &); 52 | void set_scratch_buffers(void *base); 53 | void set_workspace_buffers(void *base); 54 | int prepare_execute_opt(); 55 | void bind_execute_functions(); 56 | 57 | u8s8_gemm_kernel_binder::kgemm *ker_u8s8_gemm_I_O_T_; 58 | u8s8_gemm_kernel_binder::kgemm *ker_u8s8_gemm_I_O_Tr_; 59 | 60 | void (elx_int8_conv_direct_1x1_t::*execute_opt_)( 61 | OutputType *, InputType *, WeightsType *, BiasType *); 62 | 63 | bool no_pad_; 64 | bool is_first_run_; 65 | bool inference_acc_; 66 | bool toutput_opt_; 67 | 68 | bool stream_in_; 69 | bool stream_out_; 70 | 71 | bool is_bfmt_; 72 | bool input_is_bfmt_; 73 | bool weights_is_bfmt_; 74 | bool output_is_bfmt_; 75 | bool input_as_bfmt_; 76 | bool weights_as_bfmt_; 77 | bool output_as_bfmt_; 78 | 79 | TweightsType *tweights_; 80 | TinputType *tinput_; 81 | ToutputType *toutput_; 82 | InputType *binput_; // blocked input 83 | WeightsType *bweights_; 84 | OutputType *boutput_; 85 | float *input_scale_; 86 | int8_t *tweights_s8_; 87 | float *weights_scale_; 88 | 89 | unsigned int xopt_; 90 | int attr_; 91 | int mthr_; 92 | size_t tweights_size_; 93 | size_t tinput_size_; 94 | size_t toutput_size_; 95 | size_t binput_size_; 96 | size_t bweights_size_; 97 | size_t boutput_size_; 98 | size_t input_scale_size_; 99 | size_t tweights_s8_size_; 100 | size_t weights_scale_size_; 101 | }; 102 | 103 | } // namespace euler 104 | #endif // __ELX_CONV_DIRECT_1X1_LP_HPP__ 
105 | -------------------------------------------------------------------------------- /tests/elt_gflag.cpp: -------------------------------------------------------------------------------- 1 | #include "elt_gflag.hpp" 2 | 3 | DEFINE_int32(mb, 0, "Batch size"); 4 | DEFINE_int32(g, 1, "Groups size"); 5 | DEFINE_int32(ic, 0, "Input channel size"); 6 | DEFINE_int32(oc, 0, "Output channel size"); 7 | DEFINE_int32(ih, 0, "Input height"); 8 | DEFINE_int32(iw, 0, "Input width"); 9 | DEFINE_int32(oh, 0, "Output height"); 10 | DEFINE_int32(ow, 0, "Output width"); 11 | DEFINE_int32(kh, 3, "Kernel height. Default: 3"); 12 | DEFINE_int32(kw, 3, "Kernel width: Default: 3"); 13 | DEFINE_int32(ph, 1, "Padding along height. Default: 1"); 14 | DEFINE_int32(pw, 1, "Padding along width. Default: 1"); 15 | DEFINE_int32(sh, 1, "Stride along height. Default: 1"); 16 | DEFINE_int32(sw, 1, "Stride along width. Default: 1"); 17 | DEFINE_int32(dh, 1, "Dilation along height. Default: 1"); 18 | DEFINE_int32(dw, 1, "Dilation along width. Default: 1"); 19 | DEFINE_bool(validate_results, false, 20 | "on|off. Validate correctness. Default: off"); 21 | DEFINE_bool(with_bias, true, "on|off. With bias. Default: on"); 22 | DEFINE_bool(with_relu, false, "on|off. With relu. Default: off"); 23 | DEFINE_bool(with_argmax, false, "on|off. With argmax. Default: off"); 24 | DEFINE_int32(repeated_layer, 1, "Number of repeated layers. Default: 1"); 25 | DEFINE_bool(dbuffering, false, "Double buffering. Default: off"); 26 | DEFINE_bool(output_as_input, false, 27 | "Output of layer n used as input of layer n+1. Default: off"); 28 | DEFINE_string(alg, "wino", 29 | "deconv|auto|wino|direct|direct_1x1. Algorithm. 
Default: wino"); 30 | DEFINE_int32(tile_size, 0, "Winograd tile size. Default: 0"); 31 | DEFINE_int32(nthreads, 0, "Number of threads per team"); 32 | DEFINE_string(execution_mode, "0x0", "Execution mode"); 33 | DEFINE_int32(flt_o, 1, "OC flatting"); 34 | DEFINE_int32(flt_t, 1, "Tile flatting"); 35 | DEFINE_int32(blk_i, 1, "IC blocking"); 36 | DEFINE_int32(blk_o, 1, "OC blocking"); 37 | DEFINE_int32(pat_i, 1, "Partition on ic"); 38 | DEFINE_int32(pat_o, 1, "Partition on oc"); 39 | DEFINE_int32(pat_g, 1, "Partition on g"); 40 | DEFINE_int32(streaming_input, 0, 41 | "Streaming hint for winograd transformed input"); 42 | DEFINE_int32(streaming_output, 0, 43 | "Streaming hint for winograd transformed output"); 44 | DEFINE_string(input_format, "nChw16c", 45 | "nchw|nhwc|nChw16c. Input data format. Default: nChw16c"); 46 | DEFINE_string(weights_format, "OIhw16i16o", 47 | "oihw|hwio|OIhw16i16o|goihw|ghwio|gOIhw16i16o. Weights data format. Default: OIhw16i16o"); 48 | DEFINE_string(output_format, "nChw16c", 49 | "nchw|nhwc|nChw16c. Output data format. Default: nChw16c"); 50 | DEFINE_bool(input_as_blocked, false, 51 | "on|off. Format input as blocked. Default: off"); 52 | DEFINE_bool(weights_as_blocked, false, 53 | "on|off. Format weights as blocked. Default: off"); 54 | DEFINE_bool(output_as_blocked, false, 55 | "on|off. Format output as blocked. Default: off"); 56 | DEFINE_bool(f16c_opt, false, "on|off. With half-precision opt, Default: off"); 57 | DEFINE_string(data_type_cfg, "FP32", "UserTypes, Default: FP32"); 58 | DEFINE_bool(with_ip_sum, false, "on|off. 
With inplace sum, Default: off"); 59 | DEFINE_int32(sampling_kind, 2, 60 | "sampling kind 0: FINE, 1: COARSE, 2: CALIBRATED, Default: 2"); 61 | DEFINE_double(tinput_cali_s, 0.0, 62 | "calibration scale for tinput quantization, Default: 0"); 63 | DEFINE_double(tinput_cali_z, 0.0, 64 | "calibration zero for tinput quantization, Default: 0"); 65 | DEFINE_string(input_data_file, "", "Input data file(nchw)"); 66 | DEFINE_string(weights_data_file, "", "Weights data file(oihw)"); 67 | DEFINE_string(bias_data_file, "", "Bias data file"); 68 | DEFINE_string(name, "ioi", "Name of layer"); 69 | DEFINE_bool(disable_autoparam, true, "Disable autoparam"); 70 | 71 | --------------------------------------------------------------------------------