├── .gitignore ├── AutoSearch ├── CMakeLists.txt ├── README.md ├── generator │ └── batch_matmul.cpp ├── include │ └── utils.h ├── src │ ├── CMakeLists.txt │ ├── adams2019 │ │ ├── ASLog.cpp │ │ ├── ASLog.h │ │ ├── AutoSchedule.cpp │ │ ├── AutoSchedule.h │ │ ├── CMakeLists.txt │ │ ├── CostModel.h │ │ ├── DefaultCostModel.cpp │ │ ├── DefaultCostModel.h │ │ ├── Featurization.h │ │ ├── FunctionDAG.cpp │ │ ├── FunctionDAG.h │ │ ├── LoopNest.cpp │ │ ├── LoopNest.h │ │ ├── Makefile │ │ ├── NetworkSize.h │ │ ├── PerfectHashMap.h │ │ ├── Weights.cpp │ │ ├── Weights.h │ │ ├── autotune_loop.sh │ │ ├── baseline.weights │ │ ├── cost_model_generator.cpp │ │ ├── cost_model_schedule.h │ │ ├── demo_generator.cpp │ │ ├── featurization_to_sample.cpp │ │ ├── get_host_target.cpp │ │ ├── included_schedule_file.schedule.h │ │ ├── included_schedule_file_generator.cpp │ │ ├── retrain_cost_model.cpp │ │ ├── test.cpp │ │ ├── test_function_dag.cpp │ │ ├── test_perfect_hash_map.cpp │ │ ├── updated.weights │ │ └── weightsdir_to_weightsfile.cpp │ ├── common │ │ ├── BoundEstimate.h │ │ ├── CMakeLists.txt │ │ ├── DataOP.h │ │ ├── DataTransform.h │ │ ├── Errors.h │ │ ├── HalidePlugin.h │ │ ├── binary2cpp.cpp │ │ └── cmdline.h │ ├── li2018 │ │ ├── CMakeLists.txt │ │ ├── GradientAutoscheduler.cpp │ │ ├── Makefile │ │ ├── README.md │ │ ├── demo_generator.cpp │ │ ├── test.cpp │ │ └── test.py │ ├── mullapudi2016 │ │ ├── AutoSchedule.cpp │ │ ├── CMakeLists.txt │ │ └── Makefile │ └── sioutas2020 │ │ ├── AutoSchedule.cpp │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ └── test.cpp └── toolkit │ ├── RunGen.h │ ├── RunGenMain.cpp │ ├── shape_config.py │ ├── template │ ├── demo_eval.cpp │ ├── demo_run.cpp │ └── gen.cpp │ ├── tools.py │ └── utils.py ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.cuda └── Dockerfile.opencl ├── LICENSE ├── README.md ├── README_CN.md ├── auto_deploy ├── README.md ├── c_source │ └── main.cpp ├── data │ ├── 0.jpg │ ├── 3.jpg │ ├── 6.jpg │ ├── auto-deploy.png │ ├── input_6.bin │ ├── main_head │ ├── mnist-8.onnx │ ├── mnist.weights │ └── reg_str ├── generated_op.py ├── graph.py ├── graph_tutorial.ipynb ├── mnist.py ├── op_build.sh ├── op_codegen.py ├── op_gen.cpp ├── op_generator.py └── pass_manager.py ├── autokernel_plugin ├── .gitignore ├── CMakeLists.txt ├── common │ └── GenGen.cpp ├── images │ └── cat.jpg ├── include │ ├── Halide.h │ ├── HalideBuffer.h │ └── HalideRuntime.h ├── models │ ├── squeezenet.tmfile │ ├── synset2015.txt │ └── synset_words.txt ├── scripts │ ├── clean.sh │ ├── generate.sh │ └── register_op.sh ├── src │ ├── CMakeLists.txt │ ├── depthwise │ │ ├── build.sh │ │ ├── depthwise.cpp │ │ ├── depthwise.h │ │ └── depthwise_gen.cc │ ├── direct_conv │ │ ├── build.sh │ │ ├── direct_conv.cpp │ │ ├── direct_conv.h │ │ └── direct_conv_gen.cc │ ├── fc │ │ ├── build.sh │ │ ├── fc.cpp │ │ ├── fc.h │ │ └── fc_gen.cc │ ├── im2col_conv │ │ ├── build.sh │ │ ├── im2col_conv.cpp │ │ ├── im2col_conv.h │ │ └── im2col_conv_gen.cc │ ├── normalize │ │ ├── build.sh │ │ ├── normalize.cpp │ │ ├── normalize.h │ │ └── normalize_gen.cc │ ├── plugin_init.cpp │ ├── pool │ │ ├── avepool_gen.cc │ │ ├── build.sh │ │ ├── maxpool_gen.cc │ │ ├── pool.cpp │ │ └── pool.h │ └── softmax │ │ ├── build.sh │ │ ├── softmax.cpp │ │ ├── softmax.h │ │ └── softmax_gen.cc ├── template │ ├── build.sh │ ├── generator.cc │ ├── template.cpp │ └── template.h └── tests │ ├── CMakeLists.txt │ ├── common │ ├── alphabeta.hpp │ ├── common.hpp │ ├── stb_image.h │ ├── stb_image_write.h │ ├── tengine_operations.cpp │ ├── tengine_operations.h │ └── 
utils.hpp │ ├── test_conv.cpp │ ├── test_depthwise.cpp │ ├── test_fc.cpp │ ├── test_normalize.cpp │ ├── test_pool.cpp │ ├── test_softmax.cpp │ └── tm_classification.cpp └── doc ├── add_op.png ├── architecture-en.png ├── architecture.png ├── how_to_add_op.md ├── logo.png ├── readme.md └── tutorials ├── 01_AutoKernel开发环境快速入门.md ├── 02_Tengine快速入门.md ├── 03_Halide初体验.md ├── 04_AutoKernel插件指南.md ├── 05_Halide调度策略Schedule.md ├── 06_GEMM调度策略优化指南.md ├── data ├── 02_tengine_tutorial.cpp ├── 03_halide_basic.py ├── 03_halide_feed_data.py ├── 03_halide_magic.py ├── 04_test_relu.cpp ├── 05_loop_schedule.py ├── 06_build.sh ├── 06_gemm_optimization.cpp ├── CMakeLists.txt ├── gemm.jpg ├── inference.png ├── interleave.png ├── memory.png ├── plugin.png └── step6.png └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.s 2 | .vscode 3 | build 4 | -------------------------------------------------------------------------------- /AutoSearch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 3.18) 3 | project(AutoSearch) 4 | add_subdirectory(src) -------------------------------------------------------------------------------- /AutoSearch/README.md: -------------------------------------------------------------------------------- 1 | For more documentation, see: 2 | 3 | - [AutoSearch docs (Chinese)](https://autokernel-docs-en.readthedocs.io/zh_CN/latest/tutorials/autosearch.html) 4 | - [AutoSearch docs (English)](https://autokernel-docs-en.readthedocs.io/en/latest/tutorials/autosearch.html) 5 | -------------------------------------------------------------------------------- /AutoSearch/generator/batch_matmul.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include <iostream> 3 | #include "utils.h" 4 | 5 | /* 6 | C(x,y) += A(k,y) * B(x,k) 7 | 8 | ____N____ 9 | K | B | 10 | |_______| 11 | 12 | __K___ ___N___ 13 | | | | | 14 | M| A | M| C | 15 | |_____| |_______| 16 | 17 | */ 18 | namespace { 19 | 20 | class BatchMatmul : public Halide::Generator<BatchMatmul> { 21 | public: 22 | std::vector<int> args = GetArgsFromEnv(); 23 | int i = 0; 24 | const int B = args[i++]; 25 | const int M = args[i++]; 26 | const int N = args[i++]; 27 | const int K = args[i++]; 28 | 29 | // const int B = 1; 30 | // const int M = 1024; 31 | // const int N = 1024; 32 | // const int K = 1024; 33 | 34 | Input<Buffer<float>> input_a{"input_a", 3}; // (dim0,dim1,dim2) = (width,height,batch) = (K,M,B) 35 | Input<Buffer<float>> input_b{"input_b", 3}; // (dim0,dim1,dim2) = (width,height,batch) = (N,K,B) 36 | Output<Buffer<float>> output{"output", 3}; // (dim0,dim1,dim2) = (width,height,batch) = (N,M,B) 37 | 38 | void generate() { 39 | Var x("x"), y("y"), b("b"); 40 | RDom k(0, K); 41 | Func prod("prod"); 42 | 43 | // Algorithm 44 | prod(x, y, b) = 0.0f; 45 | prod(x, y, b) += input_a(k, y, b) * input_b(x, k, b); 46 | output(x, y, b) = prod(x, y, b); 47 | 48 | if (!auto_schedule) { 49 | Var xi("xi"), yi("yi"), xii("xii"), yii("yii"), xt("xt"), yt("yt"), xy("xy"); 50 | 51 | if (get_target().has_gpu_feature()) 52 | { 53 | // manual GPU schedule 54 | output.tile(x, y, xi, yi, 8, 8) 55 | .unroll(xi) 56 | .unroll(yi) 57 | .gpu_tile(x, y, xt, yt, 2, 2); 58 | 59 | prod.compute_at(output, x) 60 | .gpu_threads(x, y) 61 | .update() 62 | .gpu_threads(x, y); 63 | } 64 | else 65 | { 66 | // manual CPU schedule 67 | output.tile(x, y, xi, yi, 16, 32) 68 | .fuse(x, y, xy).parallel(xy) 69 | .split(yi, yi, yii, 4) 70 | .vectorize(xi, 8) 71 | .unroll(xi) 72 | 
.unroll(yii); 73 | 74 | prod.compute_at(output, yi) 75 | .vectorize(x, 8).unroll(y); 76 | 77 | prod.update() 78 | .reorder(x, y, k) 79 | .vectorize(x, 8) 80 | .unroll(x) 81 | .unroll(y) 82 | .unroll(k, 2); 83 | } 84 | } 85 | 86 | output.bound(x, 0, N) 87 | .bound(y, 0, M) 88 | .bound(b, 0, B); 89 | 90 | input_a.dim(0).set_bounds(0, K).set_stride(1) 91 | .dim(1).set_bounds(0, M).set_stride(K) 92 | .dim(2).set_bounds(0, B).set_stride(K * M); 93 | 94 | input_b.dim(0).set_bounds(0, N).set_stride(1) 95 | .dim(1).set_bounds(0, K).set_stride(N) 96 | .dim(2).set_bounds(0, B).set_stride(N * K); 97 | 98 | output.dim(0).set_bounds(0, N).set_stride(1) 99 | .dim(1).set_bounds(0, M).set_stride(N) 100 | .dim(2).set_bounds(0, B).set_stride(M * N); 101 | 102 | } 103 | }; 104 | 105 | } // namespace 106 | 107 | HALIDE_REGISTER_GENERATOR(BatchMatmul, matmul) -------------------------------------------------------------------------------- /AutoSearch/include/utils.h: -------------------------------------------------------------------------------- 1 | #include <cstddef> 2 | #include <cstdlib> 3 | #include <iostream> 4 | #include <string> 5 | #include <vector> 6 | 7 | inline int GetArg(const std::vector<int> &args, size_t index, int default_value = 0) { 8 | return index < args.size() ? args[index] : default_value; 9 | } 10 | 11 | inline std::vector<int> GetArgsFromEnv() { 12 | std::vector<int> ret; 13 | if (const char* env_p = std::getenv("HL_APP_ARGS")) { 14 | std::string val(env_p); 15 | size_t offset = 0; 16 | auto pos = val.find(',', offset); 17 | while (pos != std::string::npos) { 18 | ret.push_back(std::stoi(val.substr(offset, pos - offset))); 19 | offset = pos + 1; 20 | pos = val.find(',', offset); 21 | } 22 | ret.push_back(std::stoi(val.substr(offset, val.size() - offset))); 23 | } else { 24 | std::cerr << "Cannot load arguments from environment variable HL_APP_ARGS" << std::endl; 25 | exit(-1); 26 | } 27 | return ret; 28 | } 29 | 30 | inline double benchmark(); 31 | -------------------------------------------------------------------------------- /AutoSearch/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Ensure that plugins export only what is needed to load them. 2 | # Everything else should be omitted to keep binary size low. 
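# (A note on how the generators above receive their shapes: utils.h parses the
# HL_APP_ARGS environment variable as a comma-separated list of integers.
# A hypothetical invocation of the batch_matmul generator - the binary name and
# output directory are assumptions, the flags are standard Halide generator
# flags - would look like:
#   HL_APP_ARGS="1,512,512,512" ./matmul.generator -g matmul -o out target=host
# with the four values mapped, in order, to B, M, N, K.)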
3 | 4 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) 5 | set(CMAKE_CXX_VISIBILITY_PRESET hidden) 6 | set(CMAKE_VISIBILITY_INLINES_HIDDEN YES) 7 | find_package(Halide REQUIRED) 8 | add_executable(binary2cpp common/binary2cpp.cpp) 9 | function(add_autoscheduler) 10 | set(options) 11 | set(oneValueArgs NAME) 12 | set(multiValueArgs SOURCES) 13 | cmake_parse_arguments("arg" "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 14 | #message(STATUS "arg name:${arg_NAME} arg:source:${arg_SOURCES}") 15 | add_library(Halide_${arg_NAME} MODULE ${arg_SOURCES}) 16 | add_library(AutoSchedule::${arg_NAME} ALIAS Halide_${arg_NAME}) 17 | 18 | target_compile_definitions(Halide_${arg_NAME} PRIVATE Halide_EXPORTS) 19 | target_link_libraries(Halide_${arg_NAME} PRIVATE Halide::Plugin) 20 | 21 | string(TOLOWER "${arg_NAME}" name_lower) 22 | set_target_properties(Halide_${arg_NAME} PROPERTIES 23 | EXPORT_NAME ${arg_NAME} 24 | OUTPUT_NAME autoschedule_${name_lower}) 25 | endfunction() 26 | 27 | add_subdirectory(common) 28 | 29 | add_subdirectory(adams2019) 30 | add_subdirectory(li2018) 31 | add_subdirectory(mullapudi2016) 32 | add_subdirectory(sioutas2020) 33 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/ASLog.cpp: -------------------------------------------------------------------------------- 1 | #include "ASLog.h" 2 | 3 | namespace Halide { 4 | namespace Internal { 5 | 6 | namespace { 7 | 8 | std::string get_env_variable(char const *env_var_name) { 9 | if (!env_var_name) { 10 | return ""; 11 | } 12 | 13 | #ifdef _MSC_VER 14 | // call getenv_s without a buffer to determine the correct string length: 15 | size_t length = 0; 16 | if ((getenv_s(&length, NULL, 0, env_var_name) != 0) || (length == 0)) { 17 | return ""; 18 | } 19 | // call it again to retrieve the value of the environment variable; 20 | // note that 'length' already accounts for the null-terminator 21 | std::string lvl(length - 1, '@'); 22 | size_t read = 0; 23 | if ((getenv_s(&read, &lvl[0], length, env_var_name) != 0) || (read != length)) { 24 | return ""; 25 | } 26 | return lvl; 27 | #else 28 | char *lvl = getenv(env_var_name); 29 | if (lvl) return std::string(lvl); 30 | #endif 31 | 32 | return ""; 33 | } 34 | 35 | } // namespace 36 | 37 | int aslog::aslog_level() { 38 | static int cached_aslog_level = ([]() -> int { 39 | // If HL_DEBUG_AUTOSCHEDULE is defined, use that value. 40 | std::string lvl = get_env_variable("HL_DEBUG_AUTOSCHEDULE"); 41 | if (!lvl.empty()) { 42 | return atoi(lvl.c_str()); 43 | } 44 | // Otherwise, use HL_DEBUG_CODEGEN. 45 | lvl = get_env_variable("HL_DEBUG_CODEGEN"); 46 | return !lvl.empty() ? 
atoi(lvl.c_str()) : 0; 47 | })(); 48 | return cached_aslog_level; 49 | } 50 | 51 | } // namespace Internal 52 | } // namespace Halide 53 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/ASLog.h: -------------------------------------------------------------------------------- 1 | #ifndef ASLOG_H 2 | #define ASLOG_H 3 | 4 | // This class is used by train_cost_model, which doesn't link to 5 | // libHalide, so (despite the namespace) we are better off not 6 | // including Halide.h, lest we reference something we won't have available 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace Halide { 13 | namespace Internal { 14 | 15 | class aslog { 16 | const bool logging; 17 | 18 | public: 19 | aslog(int verbosity) 20 | : logging(verbosity <= aslog_level()) { 21 | } 22 | 23 | template 24 | aslog &operator<<(T &&x) { 25 | if (logging) { 26 | std::cerr << std::forward(x); 27 | } 28 | return *this; 29 | } 30 | 31 | static int aslog_level(); 32 | }; 33 | 34 | } // namespace Internal 35 | } // namespace Halide 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/AutoSchedule.h: -------------------------------------------------------------------------------- 1 | #include "CostModel.h" 2 | #include "Featurization.h" 3 | #include "FunctionDAG.h" 4 | #include "Halide.h" 5 | #include "PerfectHashMap.h" 6 | #include 7 | 8 | namespace Halide { 9 | namespace Internal { 10 | namespace Autoscheduler { 11 | 12 | typedef PerfectHashMap StageMapOfScheduleFeatures; 13 | 14 | void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const MachineParams ¶ms, 15 | CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); 16 | 17 | } // namespace Autoscheduler 18 | } // namespace Internal 19 | } // namespace Halide 20 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## 2 | # Resources for the autoscheduler library 3 | ## 4 | 5 | # weights 6 | set(WF_CPP baseline.cpp) 7 | configure_file(baseline.weights baseline.weights COPYONLY) 8 | add_custom_command(OUTPUT ${WF_CPP} 9 | COMMAND binary2cpp baseline_weights < baseline.weights > ${WF_CPP} 10 | DEPENDS baseline.weights binary2cpp 11 | VERBATIM) 12 | 13 | # cost_model, train_cost_model 14 | add_executable(cost_model.generator cost_model_generator.cpp) 15 | target_link_libraries(cost_model.generator PRIVATE Halide::Generator) 16 | 17 | add_halide_library(cost_model FROM cost_model.generator 18 | TARGETS cmake) 19 | add_halide_library(train_cost_model FROM cost_model.generator 20 | TARGETS cmake 21 | USE_RUNTIME cost_model.runtime) 22 | 23 | # retrain_cost_model 24 | add_executable(retrain_cost_model 25 | ASLog.cpp 26 | DefaultCostModel.cpp 27 | Weights.cpp 28 | retrain_cost_model.cpp 29 | ${WF_CPP}) 30 | target_link_libraries(retrain_cost_model PRIVATE cost_model train_cost_model Halide::Halide Halide::Plugin) 31 | 32 | ## 33 | # Main autoscheduler library 34 | ## 35 | 36 | add_autoscheduler(NAME Adams2019 37 | SOURCES 38 | ASLog.cpp 39 | AutoSchedule.cpp 40 | DefaultCostModel.cpp 41 | FunctionDAG.cpp 42 | LoopNest.cpp 43 | Weights.cpp 44 | ${WF_CPP}) 45 | 46 | target_link_libraries(Halide_Adams2019 PRIVATE cost_model train_cost_model) 47 | 48 | ## 49 | # Tests and demos 50 | # TODO(#4053): move these to a separate folder since 
they're tests. 51 | ## 52 | 53 | # ================================================================= 54 | 55 | add_executable(demo.generator demo_generator.cpp) 56 | target_link_libraries(demo.generator PRIVATE Halide::Generator) 57 | 58 | add_halide_library(demo FROM demo.generator 59 | TARGETS cmake 60 | AUTOSCHEDULER AutoSchedule::Adams2019 61 | REGISTRATION DEMO_REGISTRATION_FILE) 62 | 63 | add_executable(demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) 64 | target_link_libraries(demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) 65 | 66 | add_test(NAME demo_apps_autoscheduler 67 | COMMAND demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) 68 | 69 | set_tests_properties(demo_apps_autoscheduler 70 | PROPERTIES 71 | LABELS Adams2019 72 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 73 | 74 | # ================================================================= 75 | 76 | add_executable(included_schedule_file.generator included_schedule_file_generator.cpp) 77 | target_link_libraries(included_schedule_file.generator PRIVATE Halide::Generator) 78 | 79 | add_halide_library(included_schedule_file FROM included_schedule_file.generator 80 | TARGETS cmake 81 | AUTOSCHEDULER AutoSchedule::Adams2019 82 | REGISTRATION included_schedule_reg) 83 | 84 | add_executable(demo_included_schedule_file ${included_schedule_reg}) 85 | target_link_libraries(demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) 86 | 87 | add_test(NAME demo_included_schedule_file 88 | COMMAND demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) 89 | 90 | set_tests_properties(demo_included_schedule_file 91 | PROPERTIES 92 | LABELS Adams2019 93 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 94 | 95 | # ==================================================== 96 | # Auto-tuning support utilities. 97 | # TODO(#4053): implement auto-tuning support in CMake? 
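# (Until then, autotuning is driven by autotune_loop.sh in this directory.
# A sketch of that flow, not a CMake integration: it repeatedly builds the
# generator with randomized schedules, benchmarks each result, fuses the
# emitted .featurization with its measured runtime via featurization_to_sample,
# and periodically retrains the weights with retrain_cost_model, feeding the
# improved cost model back into the search.)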
98 | 99 | add_executable(featurization_to_sample featurization_to_sample.cpp) 100 | 101 | add_executable(get_host_target get_host_target.cpp) 102 | target_link_libraries(get_host_target PRIVATE Halide::Halide) 103 | 104 | add_executable(weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) 105 | target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) 106 | 107 | # ================================================================= 108 | # Smaller tests 109 | 110 | if (BUILD_SHARED_LIBS) 111 | add_executable(test_apps_autoscheduler test.cpp) 112 | target_link_libraries(test_apps_autoscheduler PRIVATE Halide::Halide Halide::Tools ${CMAKE_DL_LIBS}) 113 | 114 | add_test(NAME test_apps_autoscheduler 115 | COMMAND test_apps_autoscheduler $) 116 | 117 | set_tests_properties(test_apps_autoscheduler PROPERTIES 118 | LABELS Adams2019 119 | ENVIRONMENT "LD_LIBRARY_PATH=$;HL_TARGET=${Halide_TARGET}") 120 | endif () 121 | 122 | ## 123 | 124 | add_executable(test_perfect_hash_map test_perfect_hash_map.cpp) 125 | 126 | add_test(NAME test_perfect_hash_map COMMAND test_perfect_hash_map) 127 | set_tests_properties(test_perfect_hash_map 128 | PROPERTIES 129 | LABELS Adams2019 130 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 131 | 132 | ## 133 | 134 | add_executable(test_function_dag test_function_dag.cpp FunctionDAG.cpp ASLog.cpp) 135 | target_link_libraries(test_function_dag PRIVATE Halide::Halide Halide::Tools Halide::Plugin) 136 | 137 | add_test(NAME test_function_dag COMMAND test_function_dag) 138 | set_tests_properties(test_function_dag 139 | PROPERTIES 140 | LABELS Adams2019 141 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 142 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/CostModel.h: -------------------------------------------------------------------------------- 1 | #ifndef COST_MODEL_H 2 | #define COST_MODEL_H 3 | 4 | #include 5 | 6 | #include "FunctionDAG.h" 7 | #include "HalideBuffer.h" 8 | #include "PerfectHashMap.h" 9 | 10 | // An abstract base class for a cost model. 11 | namespace Halide { 12 | 13 | namespace Internal { 14 | namespace Autoscheduler { 15 | typedef PerfectHashMap StageMapOfScheduleFeatures; 16 | } // namespace Autoscheduler 17 | } // namespace Internal 18 | 19 | class CostModel { 20 | public: 21 | virtual ~CostModel() = default; 22 | 23 | // Configure the cost model for the algorithm to be scheduled. 24 | virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, 25 | const MachineParams ¶ms) = 0; 26 | 27 | // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. 28 | // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features. 29 | virtual void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, 30 | const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, 31 | double *cost_ptr) = 0; 32 | 33 | // Evaluate all schedules in the queue. 34 | virtual void evaluate_costs() = 0; 35 | 36 | // Discard all schedules in the queue. 
37 | virtual void reset() = 0; 38 | }; 39 | 40 | } // namespace Halide 41 | 42 | #endif // COST_MODEL_H 43 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/DefaultCostModel.h: -------------------------------------------------------------------------------- 1 | #ifndef DEFAULT_COST_MODEL_H 2 | #define DEFAULT_COST_MODEL_H 3 | 4 | #include "CostModel.h" 5 | #include "Weights.h" 6 | #include <memory> 7 | 8 | namespace Halide { 9 | 10 | class DefaultCostModel : public CostModel { 11 | private: 12 | Internal::Weights weights; 13 | Runtime::Buffer<float> schedule_feat_queue, pipeline_feat_queue, costs; 14 | Runtime::Buffer<double *> cost_ptrs; 15 | int cursor, num_stages, num_cores; 16 | 17 | const std::string weights_in_path, weights_out_path; 18 | const bool randomize_weights; 19 | 20 | Runtime::Buffer<float> 21 | head1_filter_update, head1_bias_update, 22 | head2_filter_update, head2_bias_update, 23 | conv1_filter_update, conv1_bias_update; 24 | int timestep = 0; 25 | 26 | public: 27 | DefaultCostModel(const std::string &weights_in_path, 28 | const std::string &weights_out_path, 29 | bool randomize_weights) 30 | : weights_in_path(weights_in_path), 31 | weights_out_path(weights_out_path), 32 | randomize_weights(randomize_weights) { 33 | 34 | load_weights(); 35 | } 36 | ~DefaultCostModel() override = default; 37 | 38 | // Configure the cost model for the algorithm to be scheduled. 39 | void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, 40 | const MachineParams &params) override; 41 | void set_pipeline_features(const Runtime::Buffer<float> &, int n); 42 | 43 | // Enqueue a schedule to be evaluated. The second version of this method returns a buffer of 44 | // schedule_features that should be filled in by the caller. 45 | void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, 46 | const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, 47 | double *cost_ptr) override; 48 | void enqueue(int ns, Runtime::Buffer<float> *schedule_feats, double *cost_ptr); 49 | 50 | // Evaluate all schedules in the queue. 51 | void evaluate_costs() override; 52 | 53 | // Discard all schedules in the queue. 54 | void reset() override; 55 | 56 | // Update model weights using true measured runtimes. 57 | float backprop(const Runtime::Buffer<const float> &true_runtimes, float learning_rate); 58 | 59 | // Save/Load the model weights to/from disk. 60 | void save_weights(); 61 | void load_weights(); 62 | }; 63 | 64 | std::unique_ptr<DefaultCostModel> make_default_cost_model(const std::string &weights_in_dir = "", 65 | const std::string &weights_out_dir = "", 66 | bool randomize_weights = false); 67 | } // namespace Halide 68 | 69 | #endif // DEFAULT_COST_MODEL_H 70 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/NetworkSize.h: -------------------------------------------------------------------------------- 1 | #ifndef HALIDE_NETWORK_SIZE_H 2 | #define HALIDE_NETWORK_SIZE_H 3 | 4 | namespace Halide { 5 | // The size of the best cost model network found. Needed by the cost 6 | // model and also the cost model training script. 
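// (These constants pin the sizes of the two feature-embedding heads and the
// shared conv trunk of the cost model; Weights.h uses them to shape its
// filter/bias buffers, so any serialized .weights file must have been
// produced with the same values.)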
7 | const int head1_channels = 8, head1_w = 40, head1_h = 7; 8 | const int head2_channels = 24, head2_w = 39; 9 | const int conv1_channels = 32; 10 | } // namespace Halide 11 | 12 | #endif // HALIDE_NETWORK_SIZE_H 13 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/Weights.h: -------------------------------------------------------------------------------- 1 | #ifndef _WEIGHTS 2 | #define _WEIGHTS 3 | 4 | #include <cstdint> 5 | #include <iostream> 6 | #include <string> 7 | 8 | #include "Featurization.h" 9 | #include "HalideBuffer.h" 10 | #include "NetworkSize.h" 11 | 12 | namespace Halide { 13 | namespace Internal { 14 | 15 | struct Weights { 16 | uint32_t pipeline_features_version = PipelineFeatures::version(); 17 | uint32_t schedule_features_version = ScheduleFeatures::version(); 18 | 19 | Halide::Runtime::Buffer<float> head1_filter{head1_channels, head1_w, head1_h}; 20 | Halide::Runtime::Buffer<float> head1_bias{head1_channels}; 21 | 22 | Halide::Runtime::Buffer<float> head2_filter{head2_channels, head2_w}; 23 | Halide::Runtime::Buffer<float> head2_bias{head2_channels}; 24 | 25 | Halide::Runtime::Buffer<float> conv1_filter{conv1_channels, head1_channels + head2_channels}; 26 | Halide::Runtime::Buffer<float> conv1_bias{conv1_channels}; 27 | 28 | template<typename F> 29 | void for_each_buffer(F f) { 30 | f(head1_filter); 31 | f(head1_bias); 32 | f(head2_filter); 33 | f(head2_bias); 34 | f(conv1_filter); 35 | f(conv1_bias); 36 | } 37 | 38 | void randomize(uint32_t seed); 39 | 40 | bool load(std::istream &i); 41 | bool save(std::ostream &o) const; 42 | 43 | bool load_from_file(const std::string &filename); 44 | bool save_to_file(const std::string &filename) const; 45 | 46 | // Load/save from the 'classic' form of six raw data files 47 | bool load_from_dir(const std::string &dir); 48 | bool save_to_dir(const std::string &dir) const; 49 | }; 50 | 51 | } // namespace Internal 52 | } // namespace Halide 53 | 54 | #endif // _WEIGHTS 55 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/baseline.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/AutoSearch/src/adams2019/baseline.weights -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/demo_generator.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include <iostream> 3 | //#include "utils.h" 4 | 5 | namespace { 6 | 7 | class BatchMatmul : public Halide::Generator<BatchMatmul> { 8 | public: 9 | //std::vector<int> args = GetArgsFromEnv(); 10 | int i = 0; 11 | //const int B = args[i++]; 12 | //const int N = args[i++]; 13 | //const int M = args[i++]; 14 | //const int K = args[i++]; 15 | const int B = 1; 16 | const int N = 1024; 17 | const int M = 1024; 18 | const int K = 1024; 19 | 20 | Input<Buffer<float>> input_a{"input_a", 3}; 21 | Input<Buffer<float>> input_b{"input_b", 3}; 22 | 23 | Output<Buffer<float>> output{"output", 3}; 24 | 25 | void generate() { 26 | Var x("x"), y("y"), b("b"), bo("bo"), xo("xo"), yo("yo"), xoa("xoa"), yoa("yoa"), yoai("yoai"), xi("xi"); 27 | Var xii("xii"); 28 | Var yi("yi"); 29 | 30 | // Algorithm 31 | RDom k(0, K); 32 | 33 | 34 | Func func("func"), Bs("Bs");//,As("input_b_im#interleave"); 35 | //As(x,y,xo,b) = input_b(xo*16+x,y,b); 36 | func(xi, y, b) = 0.0f; 37 | func(xi, y, b) += input_a(k, y, b) * input_b(xi, k, b); 38 | output(xi, y, b) = func(xi, y, b); 39 | //func.trace_stores(); 40 | 
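// The bound/set_bounds/set_stride calls below promise the compiler a dense,
// fully specified layout: bound() fixes each output Var's extent, while
// set_bounds()/set_stride() fix the extent and stride of every raw buffer
// dimension, letting the generated code drop runtime bounds checks.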
output.bound(xi, 0, M) 42 | .bound(y, 0, N) 43 | .bound(b, 0, B); 44 | input_a.dim(0).set_bounds(0, K).set_stride(1) 45 | .dim(1).set_bounds(0, N).set_stride(K) 46 | .dim(2).set_bounds(0, B).set_stride(K * N); 47 | 48 | input_b.dim(0).set_bounds(0, M).set_stride(1) 49 | .dim(1).set_bounds(0, K).set_stride(M) 50 | .dim(2).set_bounds(0, B).set_stride(M * K); 51 | 52 | output.dim(0).set_bounds(0, M).set_stride(1) 53 | .dim(1).set_bounds(0, N).set_stride(M) 54 | .dim(2).set_bounds(0, B).set_stride(M * N); 55 | //Br.print_loop_nest(); 56 | //func.print_loop_nest(); 57 | } 58 | }; 59 | 60 | } // namespace 61 | 62 | HALIDE_REGISTER_GENERATOR(BatchMatmul, demo) 63 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/featurization_to_sample.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | // A sample is a featurization + a runtime + some ids, all together in one file. 7 | // This utility concats the runtime and ids onto a featurization to produce a sample. 8 | int main(int argc, char **argv) { 9 | if (argc != 6) { 10 | std::cout << "Usage: featurization_to_sample in.featurization runtime pipeline_id schedule_id out.sample\n"; 11 | return -1; 12 | } 13 | 14 | std::ifstream src(argv[1], std::ios::binary); 15 | if (!src) { 16 | std::cerr << "Unable to open input file: " << argv[1] << "\n"; 17 | return -1; 18 | } 19 | 20 | std::ofstream dst(argv[5], std::ios::binary); 21 | if (!dst) { 22 | std::cerr << "Unable to open output file: " << argv[5] << "\n"; 23 | return -1; 24 | } 25 | 26 | dst << src.rdbuf(); 27 | 28 | // Input runtime value is presumed to be in seconds, 29 | // but sample file stores times in milliseconds. 30 | float r = atof(argv[2]) * 1000.f; 31 | int32_t pid = atoi(argv[3]); 32 | int32_t sid = atoi(argv[4]); 33 | 34 | dst.write((const char *)&r, 4); 35 | dst.write((const char *)&pid, 4); 36 | dst.write((const char *)&sid, 4); 37 | 38 | src.close(); 39 | dst.close(); 40 | 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/get_host_target.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | using namespace Halide; 4 | 5 | // Print the host target to stdout. 6 | // Any extra arguments are assumed to be features that should be stripped from 7 | // the target (as a convenience for use in Makefiles, where string manipulation 8 | // can be painful). 
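// Example (hypothetical invocation, assuming an x86-64 Linux host):
//   ./get_host_target avx512
// would print something like "x86-64-linux-avx-avx2-f16c-fma-sse41",
// i.e. the host target string with the named feature stripped.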
9 | int main(int argc, char **argv) { 10 | Target t = get_host_target(); 11 | for (int i = 1; i < argc; ++i) { 12 | auto f = Target::feature_from_name(argv[i]); 13 | if (f == Target::FeatureEnd) { 14 | fprintf(stderr, "Unknown feature: %s\n", argv[i]); 15 | exit(1); 16 | } 17 | t = t.without_feature(f); 18 | } 19 | printf("%s", t.to_string().c_str()); 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/included_schedule_file.schedule.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef included_schedule_file_SCHEDULE_H 3 | #define included_schedule_file_SCHEDULE_H 4 | 5 | // MACHINE GENERATED -- DO NOT EDIT 6 | // This schedule was automatically generated by apps/autoscheduler/AutoSchedule 7 | // for target=x86-64-osx-avx-avx2-f16c-fma-sse41 8 | // with machine_params=16,16777216,40 9 | 10 | #include "Halide.h" 11 | 12 | inline void apply_schedule_included_schedule_file( 13 | ::Halide::Pipeline pipeline, 14 | ::Halide::Target target) { 15 | using ::Halide::Func; 16 | using ::Halide::MemoryType; 17 | using ::Halide::RVar; 18 | using ::Halide::TailStrategy; 19 | using ::Halide::Var; 20 | 21 | Func relu = pipeline.get_func(4); 22 | Func conv = pipeline.get_func(3); 23 | Var c(relu.get_schedule().dims()[0].var); 24 | Var ci("ci"); 25 | Var n(relu.get_schedule().dims()[3].var); 26 | Var x(relu.get_schedule().dims()[1].var); 27 | Var xi("xi"); 28 | Var y(relu.get_schedule().dims()[2].var); 29 | Var yi("yi"); 30 | RVar r4_x(conv.update(0).get_schedule().dims()[0].var); 31 | RVar r4_y(conv.update(0).get_schedule().dims()[1].var); 32 | RVar r4_z(conv.update(0).get_schedule().dims()[2].var); 33 | relu 34 | .split(x, x, xi, 2, TailStrategy::ShiftInwards) 35 | .split(c, c, ci, 8, TailStrategy::ShiftInwards) 36 | .split(y, y, yi, 4, TailStrategy::ShiftInwards) 37 | .unroll(xi) 38 | .unroll(yi) 39 | .vectorize(ci) 40 | .compute_root() 41 | .reorder(ci, xi, yi, c, y, x, n) 42 | .fuse(x, n, x) 43 | .parallel(x); 44 | conv.update(0) 45 | .split(c, c, ci, 8, TailStrategy::GuardWithIf) 46 | .unroll(x) 47 | .unroll(y) 48 | .vectorize(ci) 49 | .reorder(ci, c, x, y, n, r4_x, r4_y, r4_z); 50 | conv 51 | .store_in(MemoryType::Stack) 52 | .split(c, c, ci, 8, TailStrategy::ShiftInwards) 53 | .unroll(x) 54 | .unroll(y) 55 | .vectorize(ci) 56 | .compute_at(relu, c) 57 | .reorder(ci, c, x, y, n); 58 | } 59 | 60 | #endif // included_schedule_file_SCHEDULE_H 61 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/included_schedule_file_generator.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | #if defined(GENERATING_SCHEDULE) 4 | // nothing 5 | #else 6 | #include "included_schedule_file.schedule.h" 7 | #endif 8 | 9 | namespace { 10 | 11 | // Trivial Generator for testing (and demonstrating) use of .schedule.h 12 | // files produced by the autoschedulers; this is very similar to 13 | // demo_generator.cpp, but packaged separately to avoid confusion for 14 | // newcomers. 
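// (Assumed two-pass workflow: first compile this generator with
// -DGENERATING_SCHEDULE and run it with auto_schedule=true plus the
// Adams2019 plugin so the autoscheduler writes
// included_schedule_file.schedule.h; then rebuild without that define, and
// apply_schedule_included_schedule_file() replays the saved schedule
// verbatim.)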
15 | struct IncludedScheduleFile : public Halide::Generator { 16 | Input> input{"input", 4}; 17 | Input> filter{"filter", 4}; 18 | Input> bias{"bias", 1}; 19 | Output> relu{"relu", 4}; 20 | 21 | void generate() { 22 | const int N = 5, CI = 120, CO = 24, W = 100, H = 80; 23 | 24 | Var x("x"), y("y"), c("c"), n("n"); 25 | 26 | // Algorithm 27 | Func conv("conv"); 28 | RDom r(0, CI, 0, 3, 0, 3); 29 | conv(c, x, y, n) = bias(c); 30 | conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); 31 | relu(c, x, y, n) = max(0, conv(c, x, y, n)); 32 | 33 | // Estimates (for autoscheduler and/or RunGen) 34 | input.set_estimates({{0, CI}, {0, W + 2}, {0, H + 2}, {0, N}}); 35 | filter.set_estimates({{0, CO}, {0, 3}, {0, 3}, {0, CI}}); 36 | bias.set_estimates({{0, CO}}); 37 | relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); 38 | 39 | // Schedule 40 | if (auto_schedule) { 41 | // nothing 42 | } else { 43 | #if defined(GENERATING_SCHEDULE) 44 | abort(); 45 | #else 46 | apply_schedule_included_schedule_file(get_pipeline(), get_target()); 47 | #endif 48 | } 49 | } 50 | }; 51 | 52 | } // namespace 53 | 54 | HALIDE_REGISTER_GENERATOR(IncludedScheduleFile, included_schedule_file) 55 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/test_perfect_hash_map.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "PerfectHashMap.h" 9 | 10 | using std::map; 11 | using std::vector; 12 | 13 | struct Key { 14 | int id, max_id; 15 | Key(int i, int m) 16 | : id(i), max_id(m) { 17 | } 18 | }; 19 | 20 | int main(int argc, char **argv) { 21 | std::mt19937 rng(0); 22 | int seed = argc > 1 ? atoi(argv[1]) : time(nullptr); 23 | rng.seed(seed); 24 | printf("seed: %d\n", seed); 25 | 26 | PerfectHashMap h; 27 | std::map ref; 28 | 29 | std::vector keys; 30 | const int N = 100; 31 | 32 | for (int i = 0; i < N; i++) { 33 | keys.emplace_back(i, N); 34 | } 35 | std::shuffle(keys.begin(), keys.end(), rng); 36 | 37 | for (int i = 0; i < 10000; i++) { 38 | // Insert. Possibly a duplicate of an existing item. 
39 | int next = rng() % N; 40 | h.insert(&keys[next], next); 41 | ref.insert({&keys[next], next}); 42 | 43 | // Check the map and hash map contain the same stuff in the same order 44 | if (h.size() != ref.size()) { 45 | fprintf(stderr, "Size mismatch: %d vs %d\n", (int)h.size(), (int)ref.size()); 46 | return -1; 47 | } 48 | // Use iterators to convert PerfectHashMap to map and compare to reference map 49 | decltype(ref) h_map; 50 | for (auto it = h.begin(); it != h.end(); it++) { 51 | h_map.insert({it.key(), it.value()}); 52 | } 53 | 54 | auto it = h_map.begin(); 55 | auto ref_it = ref.begin(); 56 | while (it != h_map.end()) { 57 | if (it->first != ref_it->first) { 58 | fprintf(stderr, "Key mismatch: %p vs %p\n", (const void *)it->first, (const void *)ref_it->first); 59 | return -1; 60 | } 61 | if (it->second != ref_it->second) { 62 | fprintf(stderr, "Value mismatch: %d vs %d\n", it->second, ref_it->second); 63 | return -1; 64 | } 65 | it++; 66 | ref_it++; 67 | } 68 | } 69 | printf("Perfect hash map test passed\n"); 70 | return 0; 71 | } 72 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/updated.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/AutoSearch/src/adams2019/updated.weights -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/weightsdir_to_weightsfile.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "Weights.h" 7 | 8 | // Utility to convert from the old dir-of-raw-data into a new .weights file. 9 | // Should live only long enough for downstream users to convert existing data files 10 | // to the new format. 
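// Example (paths are illustrative):
//   weightsdir_to_weightsfile samples/weights_dir updated.weights
// where the input directory holds the six raw filter/bias files that
// Weights::load_from_dir expects (see Weights.h).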
11 | int main(int argc, char **argv) { 12 | if (argc != 3) { 13 | std::cout << "Usage: weights_dir weights_file.weights\n"; 14 | return -1; 15 | } 16 | 17 | Halide::Internal::Weights w; 18 | if (!w.load_from_dir(argv[1])) { 19 | std::cerr << "Unable to read input dir: " << argv[1] << "\n"; 20 | return -1; 21 | } 22 | 23 | if (!w.save_to_file(argv[2])) { 24 | std::cerr << "Unable to save output file: " << argv[2] << "\n"; 25 | return -1; 26 | } 27 | 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /AutoSearch/src/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(Halide_Plugin INTERFACE) 2 | add_library(Halide::Plugin ALIAS Halide_Plugin) 3 | target_include_directories(Halide_Plugin INTERFACE $) 4 | target_link_libraries(Halide_Plugin INTERFACE Halide::Halide) 5 | -------------------------------------------------------------------------------- /AutoSearch/src/common/Errors.h: -------------------------------------------------------------------------------- 1 | #ifndef ERRORS_H 2 | #define ERRORS_H 3 | 4 | #include "Halide.h" 5 | 6 | #ifndef user_error 7 | #define user_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User) 8 | #endif 9 | 10 | #ifndef user_warning 11 | #define user_warning Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Warning) 12 | #endif 13 | 14 | #ifndef user_assert 15 | #define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) 16 | #endif 17 | 18 | #ifndef internal_assert 19 | #define internal_assert(c) _halide_internal_assertion(c, 0) 20 | #endif 21 | 22 | #ifndef internal_error 23 | #define internal_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, 0) 24 | #endif 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /AutoSearch/src/common/HalidePlugin.h: -------------------------------------------------------------------------------- 1 | #ifndef HALIDE_HALIDEPLUGIN_H 2 | #define HALIDE_HALIDEPLUGIN_H 3 | 4 | #include "Errors.h" 5 | #include "DataTransform.h" 6 | #define REGISTER_AUTOSCHEDULER(NAME) \ 7 | struct HALIDE_EXPORT Register##NAME { \ 8 | Register##NAME() { \ 9 | debug(1) << "Registering autoscheduler '" #NAME "'...\n"; \ 10 | Pipeline::add_autoscheduler(#NAME, NAME()); \ 11 | } \ 12 | } register_##NAME; 13 | 14 | #endif //HALIDE_HALIDEPLUGIN_H 15 | -------------------------------------------------------------------------------- /AutoSearch/src/common/binary2cpp.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef _WIN32 8 | #include // O_BINARY 9 | #include // setmode 10 | #endif 11 | 12 | // Embeds a binary blob (from stdin) in a C++ source array of unsigned 13 | // chars. Similar to the xxd utility. 
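// Example (this mirrors how src/adams2019/CMakeLists.txt embeds the weights):
//   binary2cpp baseline_weights < baseline.weights > baseline.cpp
// and with -header it emits the matching extern declarations instead:
//   binary2cpp baseline_weights -header > baseline.h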
14 | 15 | static int usage() { 16 | fprintf(stderr, "Usage: binary2cpp identifier [-header]\n"); 17 | return -1; 18 | } 19 | 20 | int main(int argc, const char **argv) { 21 | const char *target = argv[1]; 22 | if (argc == 3) { 23 | if (!strcmp(argv[2], "-header")) { 24 | printf("#ifndef _H_%s_binary2cpp\n", target); 25 | printf("#define _H_%s_binary2cpp\n", target); 26 | printf("extern \"C\" {\n"); 27 | printf("extern unsigned char %s[];\n", target); 28 | printf("extern int %s_length;\n", target); 29 | printf("} // extern \"C\"\n"); 30 | printf("#endif // _H_%s_binary2cpp\n", target); 31 | return 0; 32 | } else { 33 | return usage(); 34 | } 35 | } else if (argc > 3) { 36 | return usage(); 37 | } 38 | 39 | #ifdef _WIN32 40 | setmode(fileno(stdin), O_BINARY); // On windows bad things will happen unless we read stdin in binary mode 41 | #endif 42 | printf("extern \"C\" {\n"); 43 | printf("unsigned char %s[] = {\n", target); 44 | int count = 0; 45 | int line_break = 0; 46 | while (1) { 47 | int c = getchar(); 48 | if (c == EOF) break; 49 | printf("0x%02x, ", c); 50 | // Not necessary, but makes a bit easier to read 51 | if (++line_break > 12) { 52 | printf("\n"); 53 | line_break = 0; 54 | } 55 | count++; 56 | } 57 | printf("0};\n"); 58 | printf("int %s_length = %d;\n", target, count); 59 | printf("} // extern \"C\"\n"); 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_autoscheduler(NAME Li2018 SOURCES GradientAutoscheduler.cpp) 2 | 3 | # ========================================================== 4 | # TODO(#4053): move these to a separate folder since they're tests. 5 | 6 | add_executable(demo_gradient.generator demo_generator.cpp) 7 | target_link_libraries(demo_gradient.generator PRIVATE Halide::Generator) 8 | 9 | add_halide_library(demo_gradient FROM demo_gradient.generator 10 | TARGETS cmake 11 | GENERATOR demo 12 | FUNCTION_NAME demo 13 | AUTOSCHEDULER AutoSchedule::Li2018 14 | REGISTRATION DEMO_REGISTRATION_FILE) 15 | 16 | add_executable(demo_gradient_autoscheduler ${DEMO_REGISTRATION_FILE}) 17 | target_link_libraries(demo_gradient_autoscheduler PRIVATE demo_gradient Halide::RunGenMain) 18 | 19 | add_test(NAME demo_gradient_autoscheduler 20 | COMMAND demo_gradient_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) 21 | 22 | set_tests_properties(demo_gradient_autoscheduler PROPERTIES LABELS Li2018) 23 | 24 | ## 25 | 26 | if (BUILD_SHARED_LIBS) 27 | add_executable(gradient_autoscheduler_test_cpp test.cpp) 28 | target_link_libraries(gradient_autoscheduler_test_cpp PRIVATE Halide::Halide) 29 | 30 | add_test(NAME gradient_autoscheduler_test_cpp 31 | COMMAND gradient_autoscheduler_test_cpp $) 32 | 33 | set_tests_properties(gradient_autoscheduler_test_cpp PROPERTIES LABELS Li2018) 34 | endif () 35 | 36 | ## 37 | 38 | if (WITH_PYTHON_BINDINGS) 39 | # TODO(#4053): rework this as an app under python_bindings. 
40 | # TODO(#4876): Disabled due to issue #4876 41 | if (FALSE) 42 | find_package(Python3 REQUIRED COMPONENTS Interpreter Development) 43 | 44 | add_test(NAME gradient_autoscheduler_test_py 45 | COMMAND Python3::Interpreter "${CMAKE_CURRENT_SOURCE_DIR}/test.py") 46 | 47 | set(PYTHONPATH "$>") 48 | 49 | if (WIN32) 50 | set(SEP "\\$") 51 | else () 52 | set(SEP ":") 53 | endif () 54 | 55 | set(_PATH "$>;$>;$ENV{PATH}") 56 | string(REPLACE ";" "${SEP}" _PATH "${_PATH}") 57 | set_tests_properties(gradient_autoscheduler_test_py PROPERTIES 58 | LABELS Li2018 59 | ENVIRONMENT "PYTHONPATH=${PYTHONPATH};PATH=${_PATH}") 60 | endif () 61 | endif () 62 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/Makefile: -------------------------------------------------------------------------------- 1 | THIS_MAKEFILE = $(realpath $(filter %Makefile, $(MAKEFILE_LIST))) 2 | SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) 3 | HALIDE_SRC_ROOT = $(realpath $(SRC)/../../../) 4 | COMMON_DIR = $(realpath $(SRC)/../common/) 5 | 6 | # Assume an in-tree build of a halide distro exists. Most uses of this 7 | # Makefile should probably set this variable explicitly. 8 | HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib 9 | 10 | # The example uses a generator, though the autoscheduler itself does not require one 11 | include $(HALIDE_SRC_ROOT)/apps/support/Makefile.inc 12 | 13 | CXXFLAGS += -I$(COMMON_DIR) 14 | 15 | ifeq ($(UNAME), Darwin) 16 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,@loader_path' 17 | else 18 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,$$ORIGIN' 19 | endif 20 | 21 | $(BIN)/libautoschedule_li2018.$(SHARED_EXT): $(SRC)/GradientAutoscheduler.cpp $(LIB_HALIDE) 22 | @mkdir -p $(@D) 23 | $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) $^ -o $@ $(HALIDE_SYSTEM_LIBS) $(HALIDE_RPATH_FOR_LIB) 24 | 25 | # Demonstrate a JIT-based use of gradient autoscheuler 26 | $(BIN)/test: $(SRC)/test.cpp $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 27 | @mkdir -p $(@D) 28 | $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(SRC)/test.cpp -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) 29 | 30 | # Demonstrate a generator-based use of gradient autoscheuler 31 | $(GENERATOR_BIN)/demo.generator: $(SRC)/demo_generator.cpp $(GENERATOR_DEPS) 32 | @mkdir -p $(@D) 33 | $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) 34 | 35 | # Use the -p flag to the generator to load the autoscheduler as a plugin 36 | $(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 37 | @mkdir -p $(@D) 38 | $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_li2018.$(SHARED_EXT) -s Li2018 39 | 40 | $(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a 41 | @mkdir -p $(@D) 42 | $(CXX) $(CXXFLAGS) -I$(BIN)/$* $^ -o $@ $(HALIDE_SYSTEM_LIBS) $(IMAGE_IO_FLAGS) 43 | 44 | .PHONY: build test clean run_test_cpp run_test_py test_generator 45 | 46 | # demonstrates single-shot use of the autoscheduler 47 | test_generator: $(BIN)/$(HL_TARGET)/demo.rungen $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 48 | $< --benchmarks=all --benchmark_min_time=1 --estimate_all 49 | 50 | run_test_cpp: $(BIN)/test 51 | LD_LIBRARY_PATH=$(BIN) $< $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 52 | 53 | run_test_py: $(SRC)/test.py $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 54 | 
PYTHONPATH=$(BIN):$(HALIDE_PYTHON_BINDINGS_PATH):$(HALIDE_DISTRIB_PATH)/bin:$$PYTHONPATH \ 55 | LD_LIBRARY_PATH=$(BIN):$(HALIDE_PYTHON_BINDINGS_PATH):$(HALIDE_DISTRIB_PATH)/bin \ 56 | $(PYTHON) $(SRC)/test.py 57 | 58 | build: $(BIN)/test $(BIN)/$(HL_TARGET)/demo.rungen $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 59 | 60 | test: run_test_cpp run_test_py test_generator 61 | 62 | clean: 63 | rm -rf $(BIN) 64 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/README.md: -------------------------------------------------------------------------------- 1 | This is a conservative autoscheduler that applies `compute_root` to most Funcs except for 2 | the trivial ones (think of it as a -O1 optimizer for Halide). It recognizes 3 | large reduction patterns and uses `rfactor` or `atomic` to parallelize 4 | associative reductions when there's not enough parallelism in the pure variable 5 | domain. This strategy works reasonably well for gradient pipelines, and is 6 | suitable as a default option for decent but not optimal performance. This is 7 | also currently the only autoscheduler that generates GPU schedules. 8 | 9 | Running some benchmarks in the app directory gives the following statistics (all 10 | use `halide_reuse_device_allocations(nullptr, true)` for GPU): 11 | 12 | | app | manual (CPU) | gradient-autoscheduler (CPU) | manual (GPU) | gradient-autoscheduler (GPU) | 13 | | ---------------- | ------------ | ---------------------------- | ------------ | ---------------------------- | 14 | | bilateral filter | 7.93 ms | 12.92 ms | 0.29 ms | 1.05 ms | 15 | | camera_pipe | 8823.33 us | 25126 us | 605.03 us | 3347.44 us | 16 | | lens_blur | 7.77 ms | 22.41 ms | 0.73 ms | 5.60 ms | 17 | | local_laplacian | 42.29 ms | 128.31 ms | 0.81 ms | 14.30 ms | 18 | | nl_means | 145.003 ms | out-of-memory | N/A | 82.93 ms | 19 | | conv_layer | 15.46 ms | 6.89 ms | N/A | 1.90 ms | 20 | | stencil_chain | 18.86 ms | 21.46 ms | N/A | 6.35 ms | 21 | 22 | Tested on an 8-core Intel CPU (16 threads with HT) and a TITAN Xp. 23 | 24 | See `test.cpp` and `demo_generator.cpp` for how to use this autoscheduler. It 25 | can also be used with Python bindings. Compile with 26 | 27 | ``` 28 | WITH_PYTHON=1 make 29 | ``` 30 | and see `test.py` for usage. 
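A minimal JIT sketch, condensed from `test.cpp` (the plugin filename and path
are build-dependent assumptions):

```
#include "Halide.h"
#include <iostream>
using namespace Halide;

int main() {
    // Register the Li2018 autoscheduler with libHalide (path assumed).
    load_plugin("libautoschedule_li2018.so");

    MachineParams params(32, 16000000, 40);  // cores, cache size, balance
    Target target;

    Var x("x");
    Func in("in"), f("f");
    in(x) = cast<float>(x);
    f(x) = sin(2.f * in(x));
    f.set_estimate(x, 0, 10000);  // estimates drive the scheduling decisions

    AutoSchedulerResults result = Pipeline(f).auto_schedule(target, params);
    std::cout << result.schedule_source << "\n";
    return 0;
}
```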
32 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/demo_generator.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | namespace { 4 | 5 | using namespace Halide; 6 | 7 | class ConvRelu : public Halide::Generator { 8 | public: 9 | Input> input{"input", 4}; 10 | Input> filter{"filter", 4}; 11 | Input> bias{"bias", 1}; 12 | Output> relu{"relu", 4}; 13 | 14 | void generate() { 15 | const int N = 5, CI = 120, CO = 24, W = 100, H = 80; 16 | 17 | Var x("x"), y("y"), c("c"), n("n"); 18 | 19 | Func conv("conv"); 20 | RDom r(0, CI, 0, 3, 0, 3); 21 | conv(c, x, y, n) = bias(c); 22 | conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); 23 | relu(c, x, y, n) = max(0, conv(c, x, y, n)); 24 | 25 | relu.bound(c, 0, CO) 26 | .bound(x, 0, W) 27 | .bound(y, 0, H) 28 | .bound(n, 0, N); 29 | 30 | relu.dim(0).set_bounds(0, CO).set_stride(1); 31 | relu.dim(1).set_bounds(0, W).set_stride(CO); 32 | relu.dim(2).set_bounds(0, H).set_stride(CO * W); 33 | relu.dim(3).set_bounds(0, N).set_stride(CO * H * W); 34 | 35 | input.dim(0).set_bounds(0, CI).set_stride(1); 36 | input.dim(1).set_bounds(0, W + 2).set_stride(CI); 37 | input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2)); 38 | input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2)); 39 | 40 | filter.dim(0).set_bounds(0, CO).set_stride(1); 41 | filter.dim(1).set_bounds(0, 3).set_stride(CO); 42 | filter.dim(2).set_bounds(0, 3).set_stride(CO * 3); 43 | filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3); 44 | 45 | bias.dim(0).set_bounds(0, CO).set_stride(1); 46 | } 47 | }; 48 | 49 | } // namespace 50 | 51 | HALIDE_REGISTER_GENERATOR(ConvRelu, demo) 52 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/test.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | using namespace Halide; 4 | 5 | int main(int argc, char **argv) { 6 | if (argc != 2) { 7 | fprintf(stderr, "Usage: %s \n", argv[0]); 8 | return 1; 9 | } 10 | 11 | load_plugin(argv[1]); 12 | 13 | MachineParams params(32, 16000000, 40); 14 | Target target; 15 | 16 | Var x("x"), y("y"); 17 | 18 | { // Simple 1D pointwise operations. Should inline. 19 | Func in("in"); 20 | in(x) = cast(x); 21 | Func f0("f0"); 22 | f0(x) = 2.f * in(x); 23 | Func f1("f1"); 24 | f1(x) = sin(f0(x)); 25 | Func f2("f2"); 26 | f2(x) = f1(x) * f1(x); 27 | 28 | f2.set_estimate(x, 0, 10000); 29 | 30 | AutoSchedulerResults result = 31 | Pipeline(f2).auto_schedule(target, params); 32 | std::cout << "Schedule for 1D pointwise operations:\n" 33 | << result.schedule_source << "\n\n"; 34 | } 35 | 36 | { // Simple 2D pointwise operations. Should inline. 37 | Func in("in"); 38 | in(x, y) = cast(x + y); 39 | Func f0("f0"); 40 | f0(x, y) = 2.f * in(x, y); 41 | Func f1("f1"); 42 | f1(x, y) = sin(f0(x, y)); 43 | Func f2("f2"); 44 | f2(x, y) = f1(x, y) * f1(x, y); 45 | 46 | f2.set_estimate(x, 0, 1000) 47 | .set_estimate(y, 0, 1000); 48 | 49 | AutoSchedulerResults result = 50 | Pipeline(f2).auto_schedule(target, params); 51 | std::cout << "Schedule for 2D pointwise operations:\n" 52 | << result.schedule_source << "\n\n"; 53 | } 54 | 55 | { // 1D Convolution. 
56 | Func in("in"); 57 | in(x) = cast(x); 58 | RDom r(0, 5); 59 | Func f0("f0"); 60 | f0(x) += in(x + r) / 5.f; 61 | 62 | f0.set_estimate(x, 0, 1000); 63 | 64 | AutoSchedulerResults result = 65 | Pipeline(f0).auto_schedule(target, params); 66 | std::cout << "Schedule for 1D convolution:\n" 67 | << result.schedule_source << "\n\n"; 68 | } 69 | 70 | { // 2D Convolution. 71 | Func in("in"); 72 | in(x, y) = cast(x + y); 73 | RDom r(0, 5, 0, 5); 74 | Func f0("f0"); 75 | f0(x, y) += in(x + r.x, y + r.y) / 25.f; 76 | 77 | f0.set_estimate(x, 0, 1000) 78 | .set_estimate(y, 0, 1000); 79 | 80 | AutoSchedulerResults result = 81 | Pipeline(f0).auto_schedule(target, params); 82 | std::cout << "Schedule for 2D convolution:\n" 83 | << result.schedule_source << "\n\n"; 84 | } 85 | 86 | { // 1D Histogram. 87 | Func in("in"); 88 | in(x) = x % 10; 89 | RDom r(0, 1000); 90 | Func hist("hist"); 91 | hist(x) = 0; 92 | hist(clamp(in(r), 0, 10)) += 1; 93 | 94 | hist.set_estimate(x, 0, 10); 95 | 96 | AutoSchedulerResults result = 97 | Pipeline(hist).auto_schedule(target, params); 98 | std::cout << "Schedule for 1D histogram:\n" 99 | << result.schedule_source << "\n\n"; 100 | } 101 | 102 | { // 2D Histogram. 103 | Func in("in"); 104 | in(x, y) = (x + y) % 10; 105 | RDom r(0, 1000, 0, 1000); 106 | Func hist("hist"); 107 | hist(x) = 0; 108 | hist(clamp(in(r.x, r.y), 0, 10)) += 1; 109 | 110 | hist.set_estimate(x, 0, 10); 111 | 112 | AutoSchedulerResults result = 113 | Pipeline(hist).auto_schedule(target, params); 114 | std::cout << "Schedule for 2D histogram:\n" 115 | << result.schedule_source << "\n\n"; 116 | } 117 | 118 | { // 2D Histogram, but the domain is much larger. 119 | Func in("in"); 120 | in(x, y) = (x + y) % 10000; 121 | RDom r(0, 1000, 0, 1000); 122 | Func hist("hist"); 123 | hist(x) = 0; 124 | hist(clamp(in(r.x, r.y), 0, 10000)) += 1; 125 | 126 | hist.set_estimate(x, 0, 10000); 127 | 128 | AutoSchedulerResults result = 129 | Pipeline(hist).auto_schedule(target, params); 130 | std::cout << "Schedule for 2D histogram with larger domain:\n" 131 | << result.schedule_source << "\n\n"; 132 | } 133 | 134 | { // Test for conjunction use of bound and estimates. 
135 | Func in("in"); 136 | in(x, y) = cast(x + y); 137 | Func f0("f0"); 138 | f0(x, y) = 2.f * in(x, y); 139 | Func f1("f1"); 140 | f1(x, y) = sin(f0(x, y)); 141 | Func f2("f2"); 142 | f2(x, y) = f1(x, y) * f1(x, y); 143 | 144 | f2.bound(x, 0, 4); 145 | // make sure it also works if we reverse the estimate order 146 | f2.set_estimate(y, 0, 1024) 147 | .set_estimate(x, 0, 4); 148 | 149 | AutoSchedulerResults result = 150 | Pipeline(f2).auto_schedule(target, params); 151 | std::cout << "Schedule for 2D pointwise operations with small x dimension:\n" 152 | << result.schedule_source << "\n\n"; 153 | } 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/test.py: -------------------------------------------------------------------------------- 1 | import halide as hl 2 | 3 | def main(): 4 | hl.load_plugin("autoschedule_li2018") 5 | 6 | x = hl.Var('x') 7 | f_in = hl.Func('in') 8 | f_in[x] = hl.f32(x) # Cast to float 32 9 | f_0 = hl.Func('f_0') 10 | f_0[x] = 2 * f_in[x] 11 | f_1 = hl.Func('f_1') 12 | f_1[x] = hl.sin(f_0[x]) 13 | f_2 = hl.Func('f_2') 14 | f_2[x] = f_1[x] * f_1[x] 15 | 16 | # Setup 17 | f_2.set_estimate(x, 0, 1000) 18 | p = hl.Pipeline(f_2) 19 | target = hl.Target() 20 | # Only first parameter is used (number of cores on CPU) 21 | params = hl.MachineParams(32, 0, 0); 22 | result = p.auto_schedule('Li2018', target, params) 23 | print('Schedule:') 24 | print(result.schedule_source) 25 | 26 | p.compile_jit() # compile 27 | buf = p.realize(1000) # compute and get the buffer 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /AutoSearch/src/mullapudi2016/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_autoscheduler(NAME Mullapudi2016 SOURCES AutoSchedule.cpp) 2 | -------------------------------------------------------------------------------- /AutoSearch/src/mullapudi2016/Makefile: -------------------------------------------------------------------------------- 1 | THIS_MAKEFILE = $(realpath $(filter %Makefile, $(MAKEFILE_LIST))) 2 | SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) 3 | HALIDE_ROOT = $(realpath $(SRC)/../../../) 4 | COMMON_DIR = $(realpath $(SRC)/../common/) 5 | 6 | HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib 7 | include $(HALIDE_ROOT)/apps/support/Makefile.inc 8 | 9 | # Add the relative location of libHalide.so in the rpath in a distro so that the autoscheduler library can find libHalide 10 | ifeq ($(UNAME), Darwin) 11 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,@loader_path' 12 | else 13 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,$$ORIGIN' 14 | endif 15 | 16 | CXXFLAGS += -I$(COMMON_DIR) 17 | 18 | $(BIN)/libautoschedule_mullapudi2016.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp $(LIB_HALIDE) 19 | @mkdir -p $(@D) 20 | $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) $^ -o $@ $(HALIDE_RPATH_FOR_LIB) 21 | -------------------------------------------------------------------------------- /AutoSearch/src/sioutas2020/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_autoscheduler(NAME Sioutas20 SOURCES AutoSchedule.cpp) 2 | -------------------------------------------------------------------------------- /AutoSearch/src/sioutas2020/Makefile: -------------------------------------------------------------------------------- 1 | THIS_MAKEFILE = $(realpath 
$(filter %Makefile, $(MAKEFILE_LIST))) 2 | SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) 3 | HALIDE_ROOT = $(realpath $(SRC)/../../../) 4 | COMMON_DIR = $(realpath $(SRC)/../common/) 5 | 6 | HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib 7 | include $(HALIDE_ROOT)/apps/support/Makefile.inc 8 | 9 | # Add the relative location of libHalide.so in the rpath in a distro so that the autoscheduler library can find libHalide 10 | ifeq ($(UNAME), Darwin) 11 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,@loader_path' 12 | else 13 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,$$ORIGIN' 14 | endif 15 | 16 | CXXFLAGS += -I$(COMMON_DIR) 17 | 18 | $(BIN)/libautoschedule_sioutas2020.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp $(LIB_HALIDE) 19 | @mkdir -p $(@D) 20 | $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) $^ -o $@ $(HALIDE_RPATH_FOR_LIB) 21 | -------------------------------------------------------------------------------- /AutoSearch/src/sioutas2020/test.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | using namespace Halide; 4 | 5 | int main(int argc, char **argv) { 6 | if (argc != 2) { 7 | fprintf(stderr, "Usage: %s \n", argv[0]); 8 | return 1; 9 | } 10 | 11 | load_plugin(argv[1]); 12 | 13 | MachineParams params(32, 16000000, 40); 14 | Target target; 15 | 16 | Var x("x"), y("y"); 17 | 18 | { // Simple 1D pointwise operations. Should inline. 19 | Func in("in"); 20 | in(x) = cast(x); 21 | Func f0("f0"); 22 | f0(x) = 2.f * in(x); 23 | Func f1("f1"); 24 | f1(x) = sin(f0(x)); 25 | Func f2("f2"); 26 | f2(x) = f1(x) * f1(x); 27 | 28 | f2.set_estimate(x, 0, 10000); 29 | 30 | AutoSchedulerResults result = 31 | Pipeline(f2).auto_schedule(target, params); 32 | std::cout << "Schedule for 1D pointwise operations:\n" 33 | << result.schedule_source << "\n\n"; 34 | } 35 | 36 | { // Simple 2D pointwise operations. Should inline. 37 | Func in("in"); 38 | in(x, y) = cast(x + y); 39 | Func f0("f0"); 40 | f0(x, y) = 2.f * in(x, y); 41 | Func f1("f1"); 42 | f1(x, y) = sin(f0(x, y)); 43 | Func f2("f2"); 44 | f2(x, y) = f1(x, y) * f1(x, y); 45 | 46 | f2.set_estimate(x, 0, 1000) 47 | .set_estimate(y, 0, 1000); 48 | 49 | AutoSchedulerResults result = 50 | Pipeline(f2).auto_schedule(target, params); 51 | std::cout << "Schedule for 2D pointwise operations:\n" 52 | << result.schedule_source << "\n\n"; 53 | } 54 | 55 | { // 1D Convolution. 56 | Func in("in"); 57 | in(x) = cast(x); 58 | RDom r(0, 5); 59 | Func f0("f0"); 60 | f0(x) += in(x + r) / 5.f; 61 | 62 | f0.set_estimate(x, 0, 1000); 63 | 64 | AutoSchedulerResults result = 65 | Pipeline(f0).auto_schedule(target, params); 66 | std::cout << "Schedule for 1D convolution:\n" 67 | << result.schedule_source << "\n\n"; 68 | } 69 | 70 | { // 2D Convolution. 71 | Func in("in"); 72 | in(x, y) = cast(x + y); 73 | RDom r(0, 5, 0, 5); 74 | Func f0("f0"); 75 | f0(x, y) += in(x + r.x, y + r.y) / 25.f; 76 | 77 | f0.set_estimate(x, 0, 1000) 78 | .set_estimate(y, 0, 1000); 79 | 80 | AutoSchedulerResults result = 81 | Pipeline(f0).auto_schedule(target, params); 82 | std::cout << "Schedule for 2D convolution:\n" 83 | << result.schedule_source << "\n\n"; 84 | } 85 | 86 | { // 1D Histogram. 
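// Unlike the stencils above, a histogram scatters into a data-dependent
        // location (hist(clamp(in(r), 0, 10)) += 1), so the update cannot be
        // naively vectorized or parallelized over r without extra machinery such
        // as atomics or rfactor; this case probes how the autoscheduler copes
        // with reductions whose store index is not affine in the loop variables.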
87 | Func in("in"); 88 | in(x) = x % 10; 89 | RDom r(0, 1000); 90 | Func hist("hist"); 91 | hist(x) = 0; 92 | hist(clamp(in(r), 0, 10)) += 1; 93 | 94 | hist.set_estimate(x, 0, 10); 95 | 96 | AutoSchedulerResults result = 97 | Pipeline(hist).auto_schedule(target, params); 98 | std::cout << "Schedule for 1D histogram:\n" 99 | << result.schedule_source << "\n\n"; 100 | } 101 | 102 | { // 2D Histogram. 103 | Func in("in"); 104 | in(x, y) = (x + y) % 10; 105 | RDom r(0, 1000, 0, 1000); 106 | Func hist("hist"); 107 | hist(x) = 0; 108 | hist(clamp(in(r.x, r.y), 0, 10)) += 1; 109 | 110 | hist.set_estimate(x, 0, 10); 111 | 112 | AutoSchedulerResults result = 113 | Pipeline(hist).auto_schedule(target, params); 114 | std::cout << "Schedule for 2D histogram:\n" 115 | << result.schedule_source << "\n\n"; 116 | } 117 | 118 | { // 2D Histogram, but the domain is much larger. 119 | Func in("in"); 120 | in(x, y) = (x + y) % 10000; 121 | RDom r(0, 1000, 0, 1000); 122 | Func hist("hist"); 123 | hist(x) = 0; 124 | hist(clamp(in(r.x, r.y), 0, 10000)) += 1; 125 | 126 | hist.set_estimate(x, 0, 10000); 127 | 128 | AutoSchedulerResults result = 129 | Pipeline(hist).auto_schedule(target, params); 130 | std::cout << "Schedule for 2D histogram with larger domain:\n" 131 | << result.schedule_source << "\n\n"; 132 | } 133 | 134 | { // Test for conjunction use of bound and estimates. 135 | Func in("in"); 136 | in(x, y) = cast(x + y); 137 | Func f0("f0"); 138 | f0(x, y) = 2.f * in(x, y); 139 | Func f1("f1"); 140 | f1(x, y) = sin(f0(x, y)); 141 | Func f2("f2"); 142 | f2(x, y) = f1(x, y) * f1(x, y); 143 | 144 | f2.bound(x, 0, 4); 145 | // make sure it also works if we reverse the estimate order 146 | f2.set_estimate(y, 0, 1024) 147 | .set_estimate(x, 0, 4); 148 | 149 | AutoSchedulerResults result = 150 | Pipeline(f2).auto_schedule(target, params); 151 | std::cout << "Schedule for 2D pointwise operations with small x dimension:\n" 152 | << result.schedule_source << "\n\n"; 153 | } 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /AutoSearch/toolkit/shape_config.py: -------------------------------------------------------------------------------- 1 | 2 | """ Shape configurations for single operator and subgraph evaluation """ 3 | 4 | matmul_args = [ 5 | [1, 512, 512, 512], 6 | # [1, 256, 256, 256], 7 | ] 8 | 9 | matmul_shapes = [ 10 | [[512, 512, 1],[512, 512, 1],[512, 512, 1]], 11 | # [[256, 256, 1],[256, 256, 1],[256, 256, 1]], 12 | ] 13 | 14 | shape_dict = { 15 | 'matmul': matmul_shapes, 16 | } 17 | 18 | args_dict = { 19 | 'matmul': matmul_args, 20 | } -------------------------------------------------------------------------------- /AutoSearch/toolkit/template/demo_eval.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "HalideBuffer.h" 5 | 6 | #include "demo_1_512_512_512.h" 7 | using namespace std; 8 | 9 | #define ZERO 0 10 | #define ONE 1 11 | #define RAND 2 12 | 13 | #ifndef MAX 14 | #define MAX(a, b) (((a) > (b)) ? (a) : (b)) 15 | #endif 16 | 17 | #ifndef FABS 18 | #define FABS(a) (((a) > 0) ? 
(a) : (-a)) 19 | #endif 20 | 21 | // For x86-64-linux: 22 | //g++ demo_eval.cpp demo.s -I $HALIDE_HOME/include -ldl -lpthread -o demo_eval 23 | 24 | // For arm-64-linux: 25 | //aarch64-linux-gnu-g++ demo_eval.cpp demo.s -I $HALIDE_HOME/include -ldl -lpthread -o demo_eval 26 | 27 | const int B=1; 28 | const int N=512; 29 | const int M=512; 30 | const int K=512; 31 | 32 | void ref_func(float*data_a,float*data_b,float*data_c) 33 | { 34 | for(int b=0;b Halide_A((float*)a, K,M,B); 89 | Halide::Runtime::Buffer Halide_B((float*)b, N,K,B); 90 | Halide::Runtime::Buffer Halide_C((float*)halide_c, N,M,B); 91 | 92 | matmul(Halide_A,Halide_B,Halide_C); 93 | ref_func(a,b,ref_c); 94 | 95 | if (maxerr(ref_c,halide_c,M*N*B)<0.001) 96 | { 97 | cout<<"Correctness check passed!"< 5 | #include "vector" 6 | #include "iostream" 7 | using Halide::Runtime::Buffer; 8 | using namespace std::chrono; 9 | using namespace std; 10 | void init(Buffer &B) 11 | { 12 | for (auto iter=B.begin();iter!=B.end();iter++) 13 | { 14 | (*iter) = rand()*1.0/RAND_MAX; 15 | } 16 | } 17 | int main(int argc, char **argv) { 18 | { 19 | INPUT_TEMPLATE; 20 | INIT_INPUT; 21 | const auto benchmark_inner = [&]() { 22 | FUNC(DEMO_ARGS); 23 | OUTPUT.device_sync(); 24 | }; 25 | double benchmark_min_time=0.1f; 26 | Halide::Tools::BenchmarkConfig config; 27 | config.min_time = benchmark_min_time; 28 | config.max_time = benchmark_min_time * 4; 29 | double total_time = 0.0f; 30 | steady_clock::time_point start = steady_clock::now(); 31 | for (int i=0;i time_span = duration_cast>(end - start); 44 | //double t = Halide::Tools::benchmark(SAMPLES,ITERATORS,benchmark_inner); 45 | std::cout<<"autokernel time:\t"< /etc/timezone 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | g++ \ 8 | git \ 9 | wget \ 10 | cmake \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | clang-tools-10 \ 16 | lld-10 \ 17 | llvm-10-dev \ 18 | libclang-10-dev \ 19 | liblld-10-dev \ 20 | libpng-dev \ 21 | libjpeg-dev \ 22 | libgl-dev \ 23 | python3-numpy \ 24 | python3-scipy \ 25 | python3-imageio \ 26 | python3-pybind11 \ 27 | libopenblas-dev \ 28 | libeigen3-dev \ 29 | libatlas-base-dev \ 30 | doxygen \ 31 | ninja-build \ 32 | ca-certificates && \ 33 | rm -rf /var/lib/apt/lists/* && \ 34 | ln -s /usr/bin/python3 /usr/bin/python 35 | 36 | RUN pip3 install --upgrade cmake pip jupyter 37 | WORKDIR /workspace 38 | 39 | RUN git clone --branch v10.0.0 https://github.com/halide/Halide.git && \ 40 | cd Halide && \ 41 | mkdir halide-build && \ 42 | cd halide-build && \ 43 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/usr/lib/llvm-10/cmake .. && \ 44 | make -j$(nproc) install 45 | 46 | RUN git clone --branch tengine-lite https://github.com/OAID/Tengine.git && \ 47 | cd Tengine && \ 48 | mkdir build && \ 49 | cd build && \ 50 | cmake .. 
&& \ 51 | make install -j$(nproc) 52 | 53 | ENV PYTHONPATH "${PYTHONPATH}:/workspace/Halide/halide-build/python_bindings/src" 54 | ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/usr/local/lib:/workspace/Tengine/build/install/lib" 55 | -------------------------------------------------------------------------------- /Dockerfile/Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-devel-ubuntu18.04 2 | 3 | ENV TZ=Asia/Shanghai 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | g++ \ 8 | git \ 9 | wget \ 10 | cmake \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | clang-tools-10 \ 16 | lld-10 \ 17 | llvm-10-dev \ 18 | libclang-10-dev \ 19 | liblld-10-dev \ 20 | libpng-dev \ 21 | libjpeg-dev \ 22 | libgl-dev \ 23 | python3-numpy \ 24 | python3-scipy \ 25 | python3-imageio \ 26 | python3-pybind11 \ 27 | libopenblas-dev \ 28 | libeigen3-dev \ 29 | libatlas-base-dev \ 30 | doxygen \ 31 | ninja-build \ 32 | ca-certificates && \ 33 | rm -rf /var/lib/apt/lists/* && \ 34 | ln -s /usr/bin/python3 /usr/bin/python 35 | 36 | RUN pip3 install --upgrade cmake pip jupyter 37 | WORKDIR /workspace 38 | 39 | RUN git clone --branch v10.0.0 https://github.com/halide/Halide.git && \ 40 | cd Halide && \ 41 | mkdir halide-build && \ 42 | cd halide-build && \ 43 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/usr/lib/llvm-10/cmake .. && \ 44 | make -j$(nproc) install 45 | 46 | RUN git clone --branch tengine-lite https://github.com/OAID/Tengine.git && \ 47 | cd Tengine && \ 48 | mkdir build && \ 49 | cd build && \ 50 | cmake .. && \ 51 | make install -j$(nproc) 52 | 53 | ENV PYTHONPATH "${PYTHONPATH}:/workspace/Halide/halide-build/python_bindings/src" 54 | ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/usr/local/lib:/workspace/Tengine/build/install/lib" 55 | -------------------------------------------------------------------------------- /Dockerfile/Dockerfile.opencl: -------------------------------------------------------------------------------- 1 | FROM nvidia/opencl:devel-ubuntu18.04 2 | 3 | ENV TZ=Asia/Shanghai 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | g++ \ 8 | git \ 9 | wget \ 10 | cmake \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | clang-tools-10 \ 16 | lld-10 \ 17 | llvm-10-dev \ 18 | libclang-10-dev \ 19 | liblld-10-dev \ 20 | libpng-dev \ 21 | libjpeg-dev \ 22 | libgl-dev \ 23 | python3-numpy \ 24 | python3-scipy \ 25 | python3-imageio \ 26 | python3-pybind11 \ 27 | libopenblas-dev \ 28 | libeigen3-dev \ 29 | libatlas-base-dev \ 30 | doxygen \ 31 | ninja-build \ 32 | ca-certificates && \ 33 | rm -rf /var/lib/apt/lists/* && \ 34 | ln -s /usr/bin/python3 /usr/bin/python 35 | 36 | RUN pip3 install --upgrade cmake pip jupyter 37 | WORKDIR /workspace 38 | 39 | RUN git clone --branch v10.0.0 https://github.com/halide/Halide.git && \ 40 | cd Halide && \ 41 | mkdir halide-build && \ 42 | cd halide-build && \ 43 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/usr/lib/llvm-10/cmake .. && \ 44 | make -j$(nproc) install 45 | 46 | RUN git clone --branch tengine-lite https://github.com/OAID/Tengine.git && \ 47 | cd Tengine && \ 48 | mkdir build && \ 49 | cd build && \ 50 | cmake .. 
&& \ 51 | make install -j$(nproc) 52 | 53 | ENV PYTHONPATH "${PYTHONPATH}:/workspace/Halide/halide-build/python_bindings/src" 54 | ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/usr/local/lib:/workspace/Tengine/build/install/lib" 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | English | [简体中文](./README_CN.md) 7 | 8 | # AutoKernel 9 | 10 | ## Introduction 11 | Neural networks are now used in a wide variety of applications. Efficient execution of neural networks on various devices plays a critical role in these applications. Given the rapid evolution of deep learning algorithms, there are only a limited number of qualified programmers who can write hand-optimized low-level kernels for different hardware platforms, so using automatic optimization tools to generate high-performance implementations becomes a promising solution. 12 | 13 | AutoKernel began as a research project at OPEN AI LAB. The project is now open source. AutoKernel is an operator optimization tool for automatically generating high-performance low-level code for diverse hardware backends. It aims to accelerate the development of high-performance operators on various hardware, including specialized accelerators. 14 | 15 | ## AutoKernel Architecture 16 | 17 | ![AutoKernel arch](doc/architecture-en.png) 18 | 19 | AutoKernel consists of three modules: 20 | * Operator Generator: 21 | 22 | This module uses the open source project [Halide](https://github.com/halide/Halide). Halide is a domain-specific language (DSL), embedded in C++, designed to make it easier to write high-performance image processing code on modern machines. Halide separates the algorithm description from its schedule. The input of this module is the algorithm description of an operator, and the output is compiled, optimized assembly code or an object file for the corresponding back end. 23 | 24 | 25 | * AutoSearch 26 | 27 | AutoSearch is an automatic module that searches for optimized schedules for Halide operators, using multiple optimization algorithms (greedy search, reinforcement learning, machine learning, ...). It supports searching for optimized schedules on both CPU and GPU, and generates code for different target platforms (x86 or arm). This module is still under development. 28 | 29 | * AutoKernel Plugin: 30 | 31 | AutoKernel Plugin realizes one-click integration of the auto-generated optimized operator code into [Tengine](https://github.com/OAID/Tengine), without modifying the core code base of Tengine. 32 | This enables one-click deployment of the automatically generated operator implementations. 33 | 34 | ## Features 35 | 36 | - Automated 37 | - Efficient 38 | - User-friendly 39 | 40 | 41 | ## Docker 42 | We provide the following docker images with Halide and Tengine installed: 43 | - cpu: `openailab/autokernel` 44 | - cuda: `openailab/autokernel:cuda` 45 | - opencl: `openailab/autokernel:opencl` 46 | 47 | For the detailed Dockerfiles, see [Dockerfiles](https://github.com/OAID/AutoKernel/tree/main/Dockerfile) 48 | 49 | [NOTE]: 50 | if using the cuda image, you need to use `nvidia-docker` instead of `docker`; here's the [nvidia-docker install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian).
51 | ``` 52 | nvidia-docker pull openailab/autokernel:cuda 53 | nvidia-docker run -it openailab/autokernel:cuda /bin/bash 54 | ``` 55 | 56 | ## License 57 | 58 | - [Apache 2.0](LICENSE) 59 | 60 | 61 | ## Discussion 62 | - Github issues 63 | - QQ group: 829565581 64 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | 简体中文 | [English](./README.md) 7 | # AutoKernel 8 | 9 | ## 简介 10 | 11 | 随着人工智能的普及,深度学习网络的不断涌现,为了让各硬件(CPU, GPU, NPU,...)能够支持深度学习应用,各硬件芯片需要软件库去支持高性能的深度学习张量运算。目前,这些高性能计算库主要由资深HPC工程师(高性能计算优化工程师)进行开发,为了加快开发进程,缩短深度学习应用落地周期,自动化算子优化是一个趋势。 12 | 13 | AutoKernel是由OPEN AI LAB提出的高性能算子自动优化工具,可以自动优化调度策略、生成底层优化代码,大幅减少各硬件芯片算子开发成本,提升算子优化效率,让工程师更快实现深度学习算法在各硬件芯片上的高性能部署。 14 | 15 | ## AutoKernel特色 16 | 17 | - 自动化 18 | - 高效率 19 | - 低门槛 20 | 21 | 22 | ## AutoKernel架构 23 | 24 | ![AutoKernel 架构](doc/architecture.png) 25 | 26 | AutoKernel分为三个模块: 27 | * 算子生成器: 28 | 29 | 该模块使用了开源项目[Halide](https://github.com/halide/Halide);Halide是业界广泛使用的自动代码生成项目,它首次提出将计算和调度分离。该模块的输入是和硬件无关的算子计算描述,输出是相应后端的优化汇编代码/目标文件; 30 | 31 | * 自动搜索模块AutoSearch: 32 | 33 | AutoSearch 可以通过最优化算法/搜索算法/机器学习/强化学习搜索出不同后端的最优算子的调度策略参数,支持x86-cpu, cuda-gpu, arm-cpu, arm-mali-gpu等后端的调度策略自动生成。AutoSearch 集成了学术界自动调优近年来的最新研究成果。(该模块在持续开发中); 34 | 35 | * 算子部署插件( AutoKernel Plugin): 36 | 37 | [Tengine](https://github.com/OAID/Tengine)是OPEN AILAB开源的深度学习推理框架,实现了AI算法在不同硬件的快速高效部署。该模块实现了将自动生成的优化算子代码以plugin的形式一键集成到[Tengine](https://github.com/OAID/Tengine)中,实现自动优化算子的一键部署; 38 | 39 | 40 | ## Docker 41 | 我们提供了以下三个docker镜像,镜像内安装了Halide和Tengine, 方便开发者直接使用: 42 | - cpu: `openailab/autokernel` 43 | - cuda: `openailab/autokernel:cuda` 44 | - opencl: `openailab/autokernel:opencl` 45 | 46 | 具体的Dockerfile见 [Dockerfiles目录](https://github.com/OAID/AutoKernel/tree/main/Dockerfile) 47 | 48 | [NOTE]: 49 | 使用cuda镜像需要用`nvidia-docker`, 安装指南见 [nvidia-docker install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian). 50 | ``` 51 | nvidia-docker pull openailab/autokernel:cuda 52 | nvidia-docker run -it openailab/autokernel:cuda /bin/bash 53 | ``` 54 | 55 | ## License 56 | 57 | - [Apache 2.0](LICENSE) 58 | 59 | ## 技术讨论 60 | - Github issues 61 | - QQ 群: 829565581 62 | -------------------------------------------------------------------------------- /auto_deploy/README.md: -------------------------------------------------------------------------------- 1 | # auto-deploy 2 | Auto-deploy is a light-weighted NN auto-deployment tools. It involves auto op-fusion and auto op codegen, generating neural network deployment codes for specific targets. 3 | 4 | ![](data/auto-deploy.png) 5 | Auto-deploy involves the following process: 6 | 1. read onnx model, parse into Graph IR 7 | 2. run graph optimization passes 8 | 3. dump graph to deployment codes 9 | 4. 
compile the generated code into an executable file 10 | 11 | 12 | 13 | ## Quick Start 14 | Environment requirements: 15 | - python3 with onnx installed 16 | - autokernel docker with halide installed 17 | 18 | ``` 19 | python3 mnist.py 20 | python3 op_generator.py 21 | 22 | cd c_source 23 | g++ *.cpp -o mnist 24 | ./mnist ../data/mnist.weights ../data/input_6.bin 25 | ``` 26 | - running `python3 mnist.py` generates the `main.cpp` file in the `c_source` directory 27 | - running `python3 op_generator.py` compiles `op_gen.cpp` and generates the following files: 28 | ```bash 29 | |-- generated.h 30 | |-- halide_conv.cpp 31 | |-- halide_conv.h 32 | |-- halide_matmul.cpp 33 | |-- halide_matmul.h 34 | |-- halide_maxpool.cpp 35 | |-- halide_maxpool.h 36 | |-- halide_relu.cpp 37 | |-- halide_relu.h 38 | ``` 39 | - finally, compile the source code and run it; this prints the output data, which is consistent with the result obtained in `graph_tutorial.ipynb`. With this output data, postprocessing yields `predicted number is 6`. 40 | ``` 41 | 2.797004 -12.441699 0.206829 -3.550967 0.014401 5.138205 17.518187 -16.953455 2.517180 -5.376605 42 | ``` 43 | ## Graph Tutorials 44 | see `graph_tutorial.ipynb` 45 | ## Pass Manager 46 | Passes perform transformations and optimizations on the graph; a graph may go through more than one pass. Intuitively, passes can be called one by one: 47 | ``` 48 | graph = pass1(graph) 49 | graph = pass2(graph) 50 | graph = pass3(graph) 51 | ``` 52 | The pass manager provides better pass management: it can reuse the same pattern to generate different passes. The pass manager 53 | - registers fusion patterns 54 | - adds pass functions by reusing patterns 55 | - performs automatic pass dependency analysis and generates a seq_pass_list 56 | - automatically runs all passes according to the generated seq_pass_list 57 | 58 | ## Generated main.cpp 59 | - malloc all used tensors 60 | ```cpp 61 | //data 62 | float* _0= (float*)malloc(sizeof(float)*784); //Input3 63 | float* _1= (float*)malloc(sizeof(float)*200); //Parameter5 64 | float* _2= (float*)malloc(sizeof(float)*8); //Parameter6 65 | float* _3= (float*)malloc(sizeof(float)*6272); //Plus30_Output_0 66 | float* _4= (float*)malloc(sizeof(float)*6272); //ReLU32_Output_0 67 | float* _5= (float*)malloc(sizeof(float)*1568); //Pooling66_Output_0 68 | float* _6= (float*)malloc(sizeof(float)*3200); //Parameter87 69 | float* _7= (float*)malloc(sizeof(float)*16); //Parameter88 70 | float* _8= (float*)malloc(sizeof(float)*3136); //Plus112_Output_0 71 | float* _9= (float*)malloc(sizeof(float)*3136); //ReLU114_Output_0 72 | float* _10= (float*)malloc(sizeof(float)*256); //Pooling160_Output_0 73 | float* _11= (float*)malloc(sizeof(float)*2560); //Parameter193_reshape1 74 | float* _12= (float*)malloc(sizeof(float)*10); //Parameter194 75 | float* _13= (float*)malloc(sizeof(float)*10); //Plus214_Output_0 76 | ``` 77 | - load weights 78 | ```cpp 79 | //load_weight 80 | FILE* fp = fopen(weight_name, "rb"); 81 | if (!fp) printf("data can not be open"); 82 | fread(_1, sizeof(float), 200, fp); 83 | fread(_2, sizeof(float), 8, fp); 84 | fread(_6, sizeof(float), 3200, fp); 85 | fread(_7, sizeof(float), 16, fp); 86 | fread(_11, sizeof(float), 2560, fp); 87 | fread(_12, sizeof(float), 10, fp); 88 | fclose(fp); 89 | ``` 90 | - inference code (a sketch of one generated wrapper follows this section) 91 | ```cpp 92 | //code_inference 93 | Conv_Add_fused(_3,_0,_1,_2,&param_0); 94 | Relu(_4,_3,&param_1); 95 | MaxPool(_5,_4,&param_2); 96 | Conv_Add_fused(_8,_5,_6,_7,&param_3); 97 | Relu(_9,_8,&param_4); 98 | MaxPool(_10,_9,&param_5); 99 | MatMul_Add_fused(_13,_10,_11,_12,&param_6); 100 | ``` 101 | 
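The fused wrappers invoked above are produced by the op codegen step together with the AOT-compiled Halide kernels. As a minimal sketch of their shape (assuming the `buf_init` helper and `Param` struct from `data/main_head`, and a `halide_conv` signature matching the generator's declaration order; the actual generated code may differ), one wrapper might look like:

```cpp
// Hypothetical sketch, not the generated source: wrap raw float* tensors in
// halide_buffer_t descriptors and forward to the AOT-compiled fused kernel.
void Conv_Add_fused(float* out, float* in, float* w, float* b, Param* p)
{
    halide_buffer_t inp, wgt, bias, outp;
    buf_init(&inp,  in,  4, p->inp0_dims);  // NCHW activation
    buf_init(&wgt,  w,   4, p->inp1_dims);  // convolution weights
    buf_init(&bias, b,   1, p->inp2_dims);  // bias, folded into the fused conv
    buf_init(&outp, out, 4, p->out0_dims);
    halide_conv(&inp, &wgt, &bias, p->stride, p->pad, &outp);
}
```

Because the wrapper only builds buffer descriptors around memory the caller already owns, the generated `main.cpp` can chain these calls directly on the pre-allocated tensors shown above.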
102 | ## Auto-deploy vs. inference framework 103 | | | inference framework | auto-deploy | 104 | | -------- | ---------- | ---------- | 105 | | op fusion implements | manual implementation of op fusion; hard to reuse fusion patterns 
| good for reuse op fusion patterns| 106 | | op fusion space | limited numbers of op fusions |can extend to automatic op-fusion with model, bigger search space| 107 | |op implementations|manual fused_op implementations for multi backends| auto codegen with autokernel for multi backends| 108 | |deployment codes| light-weighted, only generated op needed in assigned neural networks| provide op library with all common op implementations| 109 | 110 | ## Release Note 111 | ### 2021/09 auto-deploy v1.0 112 | - graph core codes: tensor, node, graph ir 113 | - pass manager: op_fusion, remove reshape 114 | - nn demo: mnist [onnx models/mnist](https://github.com/onnx/models/blob/master/vision/classification/mnist/model/mnist-8.onnx) 115 | - op: conv, add, relu, matmul, reshape 116 | - deployment main.cpp codegen 117 | 118 | ### Future work 119 | - auto tensor memory scheduling 120 | - tmfile supports 121 | - more nn demo 122 | - ... -------------------------------------------------------------------------------- /auto_deploy/data/0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/0.jpg -------------------------------------------------------------------------------- /auto_deploy/data/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/3.jpg -------------------------------------------------------------------------------- /auto_deploy/data/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/6.jpg -------------------------------------------------------------------------------- /auto_deploy/data/auto-deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/auto-deploy.png -------------------------------------------------------------------------------- /auto_deploy/data/input_6.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/input_6.bin -------------------------------------------------------------------------------- /auto_deploy/data/main_head: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "halide_conv.h" 4 | #include "halide_relu.h" 5 | #include "halide_maxpool.h" 6 | #include "halide_matmul.h" 7 | #include "HalideRuntime.h" 8 | 9 | typedef struct Param { 10 | int* inp0_dims; 11 | int* inp1_dims; 12 | int* inp2_dims; 13 | int* inp3_dims; 14 | int* out0_dims; 15 | 16 | int ksize; 17 | int stride; 18 | int pad; 19 | }Param; 20 | 21 | void read_float_data(float* data, int size, char* fname) 22 | { 23 | FILE* fp = fopen(fname, "rb"); 24 | if (!fp) printf("data can not be open"); 25 | fread(data, sizeof(float), size, fp); 26 | fclose(fp); 27 | } 28 | void p(float* data,int size) 29 | { 30 | for(int i=0;idimensions=n; 39 | int step[4]={1,1,1,1}; 40 | for(int i=1;idim=(halide_dimension_t*)malloc(sizeof(halide_dimension_t)*n); 45 | 46 | for(int i=0;idim[i].min=0; 49 | buf->dim[i].extent=shape[i]; 50 | buf->dim[i].stride=step[i]; 51 
| } 52 | 53 | //type 54 | buf->type.bits=32; 55 | buf->type.lanes=1; 56 | buf->type.code=halide_type_float; 57 | 58 | 59 | buf->host=(uint8_t*)data; 60 | buf->flags=0; 61 | buf->device=0; 62 | } 63 | -------------------------------------------------------------------------------- /auto_deploy/data/mnist-8.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/mnist-8.onnx -------------------------------------------------------------------------------- /auto_deploy/data/mnist.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/mnist.weights -------------------------------------------------------------------------------- /auto_deploy/data/reg_str: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | 6 | class Register: 7 | def __init__(self, registry_name): 8 | self._dict = {} 9 | self._name = registry_name 10 | 11 | def __setitem__(self, key, value): 12 | if not callable(value): 13 | raise Exception("Value of a Registry must be a callable") 14 | if key is None: 15 | key = value.__name__ 16 | if key in self._dict: 17 | logging.warning("Key %s already in registry %s." % (key, self._name)) 18 | self._dict[key] = value 19 | 20 | def register(self, key_name): 21 | """Decorator to register a function or class.""" 22 | def add(key, value): 23 | self[key] = value 24 | return value 25 | # @reg.register('alias') 26 | return lambda func: add(key_name, func) 27 | 28 | def __getitem__(self, key): 29 | return self._dict[key] 30 | 31 | def __contains__(self, key): 32 | return key in self._dict 33 | 34 | def keys(self): 35 | """key""" 36 | return self._dict.keys() 37 | 38 | op_reg = Register("op_register") 39 | -------------------------------------------------------------------------------- /auto_deploy/generated_op.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | 6 | class Register: 7 | def __init__(self, registry_name): 8 | self._dict = {} 9 | self._name = registry_name 10 | 11 | def __setitem__(self, key, value): 12 | if not callable(value): 13 | raise Exception("Value of a Registry must be a callable") 14 | if key is None: 15 | key = value.__name__ 16 | if key in self._dict: 17 | logging.warning("Key %s already in registry %s." 
% (key, self._name)) 18 | self._dict[key] = value 19 | 20 | def register(self, key_name): 21 | """Decorator to register a function or class.""" 22 | def add(key, value): 23 | self[key] = value 24 | return value 25 | # @reg.register('alias') 26 | return lambda func: add(key_name, func) 27 | 28 | def __getitem__(self, key): 29 | return self._dict[key] 30 | 31 | def __contains__(self, key): 32 | return key in self._dict 33 | 34 | def keys(self): 35 | """key""" 36 | return self._dict.keys() 37 | 38 | op_reg = Register("op_register") 39 | @op_reg.register("Conv_Add_fused") 40 | def run_Conv_Add_fused(node): 41 | inp_0 = node.input[0] 42 | inp_1 = node.input[1] 43 | inp_2 = node.input[2] 44 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 45 | inp_1_tensor = torch.tensor(np.array(inp_1.value,dtype=np.float32).reshape(inp_1.dims)) 46 | inp_2_tensor = torch.tensor(np.array(inp_2.value,dtype=np.float32).reshape(inp_2.dims)) 47 | param_0 = node.attr 48 | conv_0 = nn.Conv2d(param_0.c_in, param_0.c_out, param_0.ksize, param_0.stride, param_0.pad,1,1,True) 49 | conv_0.weight.data = inp_1_tensor 50 | conv_0.bias.data = inp_2_tensor 51 | tmp_0 = conv_0(inp_0_tensor) 52 | out = tmp_0.detach().numpy() 53 | out_0 = node.output[0] 54 | out_0.value=out 55 | if out_0.reshaped==0: 56 | out_0.dims=out.shape 57 | @op_reg.register("Relu") 58 | def run_Relu(node): 59 | inp_0 = node.input[0] 60 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 61 | relu_0 = nn.ReLU() 62 | tmp_0 = relu_0(inp_0_tensor) 63 | out = np.array(tmp_0) 64 | out_0 = node.output[0] 65 | out_0.value=out 66 | if out_0.reshaped==0: 67 | out_0.dims=out.shape 68 | @op_reg.register("MaxPool") 69 | def run_MaxPool(node): 70 | inp_0 = node.input[0] 71 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 72 | param_0 = node.attr 73 | maxpool_0 = nn.MaxPool2d(param_0.ksize, param_0.stride, param_0.pad) 74 | tmp_0 = maxpool_0(inp_0_tensor) 75 | out = np.array(tmp_0) 76 | out_0 = node.output[0] 77 | out_0.value=out 78 | if out_0.reshaped==0: 79 | out_0.dims=out.shape 80 | @op_reg.register("MatMul_Add_fused") 81 | def run_MatMul_Add_fused(node): 82 | inp_0 = node.input[0] 83 | inp_1 = node.input[1] 84 | inp_2 = node.input[2] 85 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 86 | inp_1_tensor = torch.tensor(np.array(inp_1.value,dtype=np.float32).reshape(inp_1.dims)) 87 | inp_2_tensor = torch.tensor(np.array(inp_2.value,dtype=np.float32).reshape(inp_2.dims)) 88 | tmp_0 = torch.matmul(inp_0_tensor,inp_1_tensor) 89 | tmp_1 = torch.add(tmp_0,inp_2_tensor) 90 | out = np.array(tmp_1) 91 | out_0 = node.output[0] 92 | out_0.value=out 93 | if out_0.reshaped==0: 94 | out_0.dims=out.shape 95 | -------------------------------------------------------------------------------- /auto_deploy/mnist.py: -------------------------------------------------------------------------------- 1 | from graph import Graph 2 | from pass_manager import pass_m 3 | import cv2 4 | import numpy as np 5 | import array 6 | 7 | inp_dim = [1, 1, 28, 28] 8 | 9 | def gen_main_cpp(): 10 | graph = Graph('mnist', './data/mnist-8.onnx', inp_dim) 11 | graph = pass_m.run_all_pass(graph) 12 | graph.infershape() 13 | graph.gen_main_cpp('c_source/main.cpp') 14 | 15 | gen_main_cpp() -------------------------------------------------------------------------------- /auto_deploy/op_build.sh: 
-------------------------------------------------------------------------------- 1 | 2 | HALIDE_SOURCE_DIR=/workspace/Halide/ 3 | HALIDE_BUILD_DIR=/workspace/Halide/halide-build 4 | 5 | g++ op_gen.cpp \ 6 | ${HALIDE_SOURCE_DIR}/tools/GenGen.cpp \ 7 | -o op.gen \ 8 | -I ${HALIDE_BUILD_DIR}/include \ 9 | -Wl,-rpath,${HALIDE_BUILD_DIR}/src \ 10 | ${HALIDE_BUILD_DIR}/src/libHalide.so \ 11 | -pthread -std=c++14 -ldl 12 | 13 | OUT_DIR=c_source 14 | 15 | gen() 16 | { 17 | ./op.gen \ 18 | -g $1 \ 19 | -o ${OUT_DIR} \ 20 | -e c_header,c_source \ 21 | target=x86-64-linux-no_runtime-no_bounds_query-no_asserts 22 | } 23 | 24 | gen halide_conv 25 | gen halide_matmul 26 | gen halide_relu 27 | gen halide_maxpool 28 | -------------------------------------------------------------------------------- /auto_deploy/op_codegen.py: -------------------------------------------------------------------------------- 1 | 2 | def get_conv_body(idx,start_idx,tmp,flag): 3 | input_0 = 'inp_0_tensor' 4 | if start_idx!=0: 5 | input_0='tmp_{}'.format(tmp-1) 6 | param_name = 'param_{}'.format(idx) 7 | if flag==0: 8 | line = ''' {} = node.attr 9 | conv_{} = nn.Conv2d({}.c_in, {}.c_out, {}.ksize, {}.stride, {}.pad,1,1,False) 10 | conv_{}.weight.data = inp_{}_tensor 11 | tmp_{} = conv_{}({}) 12 | '''.format(param_name,idx,param_name,param_name,param_name,param_name,param_name,idx,start_idx+1,tmp,idx,input_0) 13 | if flag==1: 14 | line = ''' {} = node.attr 15 | conv_{} = nn.Conv2d({}.c_in, {}.c_out, {}.ksize, {}.stride, {}.pad,1,1,True) 16 | conv_{}.weight.data = inp_{}_tensor 17 | conv_{}.bias.data = inp_{}_tensor 18 | tmp_{} = conv_{}({}) 19 | '''.format(param_name,idx,param_name,param_name,param_name,param_name,param_name, 20 | idx,start_idx+1, 21 | idx,start_idx+2, 22 | tmp,idx,input_0) 23 | return line 24 | def get_maxpool_body(idx,start_idx,tmp): 25 | input_0 = 'inp_0_tensor' 26 | if start_idx!=0: 27 | input_0='tmp_{}'.format(tmp-1) 28 | param_name = 'param_{}'.format(idx) 29 | line = ''' {} = node.attr 30 | maxpool_{} = nn.MaxPool2d({}.ksize, {}.stride, {}.pad) 31 | tmp_{} = maxpool_{}({}) 32 | '''.format(param_name,idx,param_name,param_name,param_name,tmp,idx,input_0) 33 | return line 34 | def get_relu_body(idx,start_idx,tmp): 35 | input_0 = 'inp_0_tensor' 36 | if start_idx !=0: 37 | input_0 = 'tmp_{}'.format(tmp-1) 38 | line = ''' relu_{} = nn.ReLU() 39 | tmp_{} = relu_{}({}) 40 | '''.format(idx,tmp,idx,input_0) 41 | return line 42 | def get_matmul_body(idx,start_idx,tmp): 43 | input_0 = 'inp_0_tensor' 44 | input_1 = 'inp_1_tensor' 45 | if start_idx!=0: 46 | input_0 = 'tmp_{}'.format(tmp-1) 47 | input_1 = 'inp_{}_tensor'.format(start_idx) 48 | line = ''' tmp_{} = torch.matmul({},{})\n'''.format(tmp,input_0,input_1) 49 | return line 50 | def get_add_body(idx,start_idx,tmp): 51 | input_0 = 'inp_0_tensor' 52 | input_1 = 'inp_1_tensor' 53 | if start_idx!=0: 54 | input_0 = 'tmp_{}'.format(tmp-1) 55 | input_1 = 'inp_{}_tensor'.format(start_idx) 56 | line = ''' tmp_{} = torch.add({},{})\n'''.format(tmp,input_0,input_1) 57 | return line 58 | 59 | def codegen_node(node): 60 | 61 | # func parameter node 62 | op_type = node.op_type 63 | op_list = op_type.split("_") 64 | op_list = [i for i in op_list if i!='fused'] 65 | num_op = len(op_list) 66 | 67 | num_input = len(node.input) 68 | num_output = len(node.output) 69 | 70 | ############### head 71 | head = "@op_reg.register(\"{}\")\ndef run_{}(node):\n".format(op_type,op_type) 72 | 73 | ############### input 74 | input_declare = "" 75 | for i in range(num_input): 76 | 
input_declare+=" inp_{} = node.input[{}]\n".format(i,i) 77 | input_tensor = "" 78 | for i in range(num_input): 79 | input_tensor += " inp_{}_tensor = torch.tensor(np.array(inp_{}.value,dtype=np.float32).reshape(inp_{}.dims))\n".format(i,i,i) 80 | input = input_declare + input_tensor 81 | 82 | ############## body 83 | start_idx = 0 84 | body_idx = 0 85 | body ="" 86 | for body_idx in range(num_op): 87 | op = op_list[body_idx] 88 | if op=="Conv": 89 | flag = 0 90 | if op_list[body_idx+1]=='Add': 91 | flag = 1 92 | body += get_conv_body(body_idx,start_idx,body_idx,flag) 93 | start_idx+=2 94 | if op=="MaxPool": 95 | body += get_maxpool_body(body_idx,start_idx,body_idx) 96 | start_idx+=1 97 | if op=="Relu": 98 | body += get_relu_body(body_idx,start_idx,body_idx) 99 | if op=="MatMul": 100 | body += get_matmul_body(body_idx,start_idx,body_idx) 101 | start_idx+=2 102 | if op=="Add": 103 | if op_list[body_idx-1]=='Conv': 104 | continue 105 | body += get_add_body(body_idx,start_idx,body_idx) 106 | start_idx+=2 107 | body_idx+=1 108 | ############## output 109 | output = "" 110 | assert(num_output==1) 111 | i=0 112 | line = " out = np.array(tmp_{})\n".format(body_idx-1) 113 | if 'Conv' in op_type: 114 | line = " out = tmp_{}.detach().numpy()\n".format(body_idx-1) 115 | line1 = " out_{} = node.output[{}]\n".format(i,i) 116 | line2 = " out_{}.value=out\n".format(i) 117 | line3 = " if out_{}.reshaped==0:\n".format(i) 118 | line4 = " out_{}.dims=out.shape\n".format(i) 119 | output=(line+line1+line2+line3+line4) 120 | 121 | op_string = head + input + body + output 122 | # print(op_string) 123 | return op_string 124 | 125 | 126 | cg = codegen_node -------------------------------------------------------------------------------- /auto_deploy/op_gen.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | 4 | using namespace Halide; 5 | using Halide::BoundaryConditions::constant_exterior; 6 | using Halide::Expr; 7 | 8 | Var x("x"), y("y"), c("c"), n("n"); 9 | 10 | //conv with bias 11 | class ConvGenerator : public Generator { 12 | public: 13 | Input> input{"input", 4}; //[w,h,c,n] 14 | Input> weight{"weight", 4}; //[kw,kh,cin,cout] 15 | Input> bias{"bias", 1}; //[cout] 16 | Input stride{"stride"}; 17 | Input pad{"pad"}; 18 | Output> output{"output", 4}; 19 | 20 | void generate() { 21 | RDom r(0, weight.dim(0).extent(), 0, weight.dim(1).extent(), 0, weight.dim(2).extent()); 22 | output(x, y, c, n) = bias(c); 23 | 24 | Func inp_bounded =constant_exterior(input, //source 25 | 0, //value 26 | {{0, input.dim(0).extent()}, //boundary-dim0 w 27 | {0, input.dim(1).extent()}, //boundary-dim1 h 28 | {Expr(), Expr()}, //boundary-dim2 c 29 | {Expr(), Expr()}}); //boundary-dim3 n 30 | Func inp_padded("inp_padded"); 31 | inp_padded(x, y, c, n) = inp_bounded(x - pad, y - pad, c, n); 32 | 33 | output(x, y, c, n) += weight(r[0], r[1], r[2], c) * 34 | inp_padded(x * stride + r[0], y * stride + r[1],r[2],n); 35 | } 36 | }; 37 | 38 | //matmul 39 | class MatMulGenerator : public Generator { 40 | public: 41 | Input> input{"input", 2}; 42 | Input> weight{"weight", 2}; 43 | Input> bias{"bias", 1}; 44 | Output> output{"output", 2}; 45 | 46 | void generate() { 47 | RDom k(0, input.dim(0).extent()); 48 | output(x, y) = bias(x); 49 | output(x, y) += input(k, y) * weight(x, k); 50 | } 51 | }; 52 | 53 | // maxpool 54 | class MaxPoolGenerator : public Generator { 55 | public: 56 | Input> input_a{"input_a", 4}; 57 | Input ksize{"ksize"}; 58 | Input 
stride{"stride"}; 59 | Output> output{"output", 4}; 60 | 61 | void generate() { 62 | RDom r(0, ksize, 0, ksize); 63 | int pad = 0; 64 | output(x, y, c, n) = maximum(input_a(stride*x+ r.x -pad , stride*y+r.y - pad, c, n)); 65 | } 66 | }; 67 | 68 | // relu 69 | class ReluGenerator : public Generator { 70 | public: 71 | 72 | Input> input_a{"input_a", 4}; 73 | Output> output{"output", 4}; 74 | 75 | void generate() { 76 | output(x, y, c, n) = max(0.0f,input_a(x,y,c,n)); 77 | } 78 | }; 79 | HALIDE_REGISTER_GENERATOR(MatMulGenerator, halide_matmul) 80 | HALIDE_REGISTER_GENERATOR(MaxPoolGenerator, halide_maxpool) 81 | HALIDE_REGISTER_GENERATOR(ReluGenerator, halide_relu) 82 | HALIDE_REGISTER_GENERATOR(ConvGenerator, halide_conv) 83 | -------------------------------------------------------------------------------- /auto_deploy/op_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def gen_code(op_name, head=0): 5 | 6 | f = open("c_source/%s.halide_generated.cpp" % op_name, "r") 7 | x = f.readlines() 8 | if head == 1: 9 | with open("op/generated.h", "w") as fp: 10 | for line in x[:2363]: 11 | fp.write(line) 12 | 13 | with open("c_source/%s.cpp" % op_name, "w") as fp: 14 | headline = "#include \"generated.h\"\n" 15 | fp.write(headline) 16 | for line in x[2364:]: 17 | fp.write(line) 18 | f.close() 19 | 20 | 21 | os.system("bash op_build.sh") 22 | 23 | gen_code("halide_relu", 1) 24 | gen_code("halide_conv") 25 | gen_code("halide_maxpool") 26 | gen_code("halide_matmul") 27 | 28 | os.system("rm c_source/*.halide_generated.cpp") 29 | -------------------------------------------------------------------------------- /autokernel_plugin/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .vscode/ 3 | halide_* 4 | *_gen 5 | 6 | -------------------------------------------------------------------------------- /autokernel_plugin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(AutoKernel) 2 | 3 | set(TENGINE_ROOT /workspace/Tengine/) 4 | 5 | if(NOT DEFINED TENGINE_ROOT) 6 | message(FATAL_ERROR "please set TENGINE_ROOT for tengine directory") 7 | endif() 8 | 9 | set(TENGINE_DIR /workspace/Tengine/build/install) 10 | set(TENGINE_LIBRARY ${TENGINE_DIR}/lib/libtengine-lite.so) 11 | 12 | cmake_minimum_required(VERSION 3.0) 13 | 14 | add_definitions(-Wall) 15 | add_definitions(-fPIC) 16 | #add_definitions(-g) 17 | add_definitions(-O3) 18 | add_definitions(-funroll-loops) 19 | 20 | include_directories(include/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-overloaded-virtual") 23 | set(CMAKE_CXX_STANDARD 11) 24 | 25 | # sub directories 26 | add_subdirectory(src) 27 | add_subdirectory(tests) 28 | -------------------------------------------------------------------------------- /autokernel_plugin/common/GenGen.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | int main(int argc, char **argv) { 4 | return Halide::Internal::generate_filter_main(argc, argv, std::cerr); 5 | } -------------------------------------------------------------------------------- /autokernel_plugin/images/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/autokernel_plugin/images/cat.jpg -------------------------------------------------------------------------------- 
/autokernel_plugin/models/squeezenet.tmfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/autokernel_plugin/models/squeezenet.tmfile -------------------------------------------------------------------------------- /autokernel_plugin/scripts/clean.sh: -------------------------------------------------------------------------------- 1 | for dir in `ls src` 2 | do 3 | if [ -d src/$dir ] 4 | then 5 | echo src/$dir 6 | cd src/$dir 7 | rm *gen 8 | rm halide* 9 | cd ../../ 10 | fi 11 | done 12 | -------------------------------------------------------------------------------- /autokernel_plugin/scripts/generate.sh: -------------------------------------------------------------------------------- 1 | export HALIDE_DIR=/workspace/Halide/halide-build 2 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${HALIDE_DIR}/lib 3 | for dir in `ls src` 4 | do 5 | if [ -d src/$dir ] 6 | then 7 | echo src/$dir 8 | cd src/$dir 9 | chmod +x build.sh 10 | ./build.sh 11 | cd ../../ 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /autokernel_plugin/scripts/register_op.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #./register_op.sh op_name op_type 3 | op_name="" 4 | op_define_name="" 5 | op_src_file="" 6 | op_header_file="" 7 | op_func_name="" 8 | 9 | if [ ! -n "$1" ];then 10 | echo "please input op_name" 11 | read op_name 12 | echo "op_name:"$op_name 13 | echo "please input op_type, [eg.:OP_CONV, OP_POOL, ref @ tengine_op.h]" 14 | read op_define_name 15 | echo "op_type:"$op_define_name 16 | op_dir=src/$op_name 17 | op_src_file=$op_dir/$op_name.cpp 18 | op_header_file=$op_dir/$op_name.h 19 | op_define_name=${op_define_name^^} 20 | op_func_name=halide_${op_name} 21 | else 22 | op_name=$1 23 | op_dir=src/$op_name 24 | op_define_name=${2^^} 25 | op_src_file=$op_dir/$op_name.cpp 26 | op_header_file=$op_dir/$op_name.h 27 | op_func_name=halide_${op_name} 28 | fi 29 | 30 | 31 | echo "op name is $op_name" 32 | if [ !
-d $op_dir ];then 33 | mkdir $op_dir 34 | else 35 | rm -rf $op_dir 36 | mkdir $op_dir 37 | fi 38 | 39 | cp template/template.cpp $op_src_file 40 | cp template/template.h $op_header_file 41 | # cp generator/$op_name/$op_func_name.h $op_dir 42 | # cp generator/$op_name/$op_func_name.s $op_dir 43 | 44 | sed -i s/'template'/$op_name/g $op_src_file 45 | 46 | sed -i s/'AutoKernel_Func'/$op_func_name/g $op_header_file 47 | sed -i s/'AutoKernel_Func'/$op_func_name/g $op_src_file 48 | 49 | sed -i s/'OP_CONV'/$op_define_name/g $op_src_file 50 | 51 | sed -i s/'RegisterAutoKernelOP'/'RegisterAutoKernel'${op_name^}/g $op_header_file 52 | sed -i s/'RegisterAutoKernelOP'/'RegisterAutoKernel'${op_name^}/g $op_src_file 53 | 54 | # plugin_init.cpp 55 | if [ `grep -c 'RegisterAutoKernel'${op_name^} src/plugin_init.cpp` -eq '0' ]; then 56 | line=`grep -n "autokernel_plugin_init" src/plugin_init.cpp | cut -d ":" -f 1` 57 | sed -i '/register halide operator/a\ RegisterAutoKernel'${op_name^}'();' src/plugin_init.cpp 58 | sed -i '1a\#include "'${op_name}'/'${op_name}'.h"' src/plugin_init.cpp 59 | else 60 | echo "found" 61 | fi 62 | 63 | 64 | # op_name_gen.cpp 65 | op_gen_file=$op_dir/${op_name}_gen.cc 66 | cp template/generator.cc $op_gen_file 67 | 68 | sed -i s/'Halide_Func_Name'/$op_func_name/g $op_gen_file 69 | 70 | # build.sh 71 | cp template/build.sh $op_dir 72 | sed -i s/'OP_NAME'/$op_name/g $op_dir/build.sh 73 | chmod +x $op_dir/build.sh 74 | -------------------------------------------------------------------------------- /autokernel_plugin/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # include directory list 3 | include_directories(${TENGINE_ROOT}/build/source) 4 | include_directories(${TENGINE_ROOT}/source) 5 | 6 | ENABLE_LANGUAGE(ASM) 7 | 8 | set(DRIVER_TARGET autokernel) 9 | FILE(GLOB_RECURSE DRIVER_SRCS "*.cpp" "*.c" "*.s") 10 | ADD_LIBRARY(${DRIVER_TARGET} SHARED ${DRIVER_SRCS}) 11 | target_link_libraries(${DRIVER_TARGET} ${TENGINE_LIBRARY}) 12 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/build.sh: -------------------------------------------------------------------------------- 1 | g++ depthwise_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o depthwise_gen 6 | 7 | ./depthwise_gen -g halide_depthwise -e c_header,assembly -o . 
target=host-no_runtime-no_asserts-no_bounds_query 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/depthwise.cpp: -------------------------------------------------------------------------------- 1 | #include "depthwise.h" 2 | 3 | // add helper data struct and functions here 4 | 5 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 6 | { 7 | return 0; 8 | } 9 | 10 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 11 | { 12 | bool info_autokernel = false; 13 | const char* debug_env = std::getenv("DEBUG_INFO"); 14 | if((debug_env) && (debug_env[0] == '1')) 15 | { 16 | info_autokernel = true; 17 | } 18 | struct node* ir_node = exec_node->ir_node; 19 | struct graph* ir_graph = ir_node->graph; 20 | struct tensor* input_tensor; 21 | struct tensor* weight_tensor; 22 | struct tensor* output_tensor; 23 | struct tensor* bias_tensor = NULL; 24 | int num_thread = exec_graph->num_thread; 25 | int cpu_affinity = exec_graph->cpu_affinity; 26 | 27 | // set the input data and shape again, in case of reshape or dynamic shape 28 | input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 29 | weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 30 | output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 31 | if (ir_node->input_num > 2) 32 | bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); 33 | 34 | struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; 35 | 36 | float* input_buf = (float*)(input_tensor->data); 37 | float* weight_buf = (float*)(weight_tensor->data); 38 | float* output_buf = (float*)(output_tensor->data); 39 | float* bias = NULL; 40 | if (ir_node->input_num > 2) 41 | bias = (float*)(bias_tensor->data); 42 | 43 | if (exec_graph->mode == TENGINE_MODE_FP32) 44 | { 45 | int stride = conv_param->stride_h; 46 | int pad_width = conv_param->pad_w0; 47 | int pad_height = conv_param->pad_h0; 48 | int act = conv_param->activation; 49 | int group = conv_param->group; 50 | 51 | Halide::Runtime::Buffer input(input_buf, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 52 | Halide::Runtime::Buffer filter(weight_buf, weight_tensor->dims[3], weight_tensor->dims[2], weight_tensor->dims[1], weight_tensor->dims[0]); 53 | Halide::Runtime::Buffer output(output_buf, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 54 | Halide::Runtime::Buffer bias1(bias, output_tensor->dims[1]); 55 | 56 | if(info_autokernel)printf("[INFO]: runing AutoKernel im2col_conv ...\n"); 57 | 58 | halide_depthwise(input, filter, bias1, stride, pad_width, pad_height, act, output); 59 | } 60 | else 61 | { 62 | printf("Tengine work node not support %d\n", exec_graph->mode); 63 | return -1; 64 | } 65 | return 0; 66 | } 67 | 68 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 69 | { 70 | return 0; 71 | } 72 | 73 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 74 | { 75 | /* 76 | release the helper memory you 77 | */ 78 | return 0; 79 | } 80 | 81 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 82 | { 83 | /* 84 | init the private info data for your op: 85 | void ops_priv; 86 | int shared_mem_size; 87 | int 
shared_pack4_mem_size; 88 | */ 89 | return 0; 90 | } 91 | 92 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 93 | { 94 | /* 95 | release the private info data for your op: 96 | void ops_priv; 97 | int shared_mem_size; 98 | int shared_pack4_mem_size; 99 | */ 100 | return 0; 101 | } 102 | 103 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 104 | { 105 | 106 | struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; 107 | struct node* ir_node = exec_node; 108 | struct graph* ir_graph = ir_node->graph; 109 | 110 | struct tensor* input_tensor; 111 | struct tensor* output_tensor; 112 | 113 | int group = param->group; 114 | int kernel_h = param->kernel_h; 115 | int kernel_w = param->kernel_w; 116 | int stride_h = param->stride_h; 117 | int stride_w = param->stride_w; 118 | int dilation_h = param->dilation_h; 119 | int dilation_w = param->dilation_w; 120 | int pad_h0 = param->pad_h0; 121 | int pad_w0 = param->pad_h0; 122 | int pad_h1 = param->pad_h1; 123 | int pad_w1 = param->pad_w1; 124 | 125 | input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 126 | output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 127 | 128 | int in_c = input_tensor->dims[1] / group; 129 | int out_c = output_tensor->dims[1] / group; 130 | 131 | if (input_tensor->data_type != TENGINE_DT_FP32) 132 | return 0; 133 | if (kernel_h != kernel_w || input_tensor->dims[0] > 1) 134 | return 0; 135 | 136 | if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 137 | && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 138 | && ((stride_h == 1 && stride_w == 1) || (stride_w == 2 && stride_h == 2))) 139 | { 140 | return OPS_SCORE_STATIC; 141 | } 142 | else 143 | { 144 | return 0; 145 | } 146 | } 147 | 148 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 149 | .run = run, 150 | .reshape = reshape, 151 | .postrun = postrun, 152 | .init_node = init_node, 153 | .release_node = release_node, 154 | .score = score}; 155 | 156 | int RegisterAutoKernelDepthwise() 157 | { 158 | return register_builtin_node_ops(OP_CONV, &autokernel_node_ops); 159 | } 160 | 161 | //static int unreg_autokernel_ops(void* arg) 162 | //{ 163 | // unregister_builtin_node_ops(OP_DEPTHWISE, &autokernel_node_ops); 164 | // return 0; 165 | //} 166 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/depthwise.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_depthwise.h" 33 | 34 | int 
RegisterAutoKernelDepthwise(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/depthwise_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_depthwise:public Halide::Generator{ 11 | public: 12 | Input> input{"input", 4}; 13 | Input> kernel{"kernel", 4}; 14 | Input> bias{"bias", 1}; 15 | 16 | Input stride{"stride"}; 17 | Input pad_width{"pad_width"}; 18 | Input pad_height{"pad_height"}; 19 | Input act{"act"}; 20 | 21 | Output> output{"output", 4}; 22 | 23 | void generate() { 24 | // The algorithm. 25 | Var x("x"), y("y"), depth("depth"), n("n"); 26 | 27 | Func input_bounded = 28 | constant_exterior(input, 0, 29 | {{0, input.dim(0).extent()}, //boundary-dim0 w 30 | {0, input.dim(1).extent()}, //boundary-dim1 h 31 | {Expr(), Expr()}, //boundary-dim2 c 32 | {Expr(), Expr()}}); //boundary-dim3 n 33 | 34 | Func inp_padded("inp_padded"); 35 | inp_padded(x, y, depth, n) = input_bounded(x - pad_width, y - pad_height, depth, n); 36 | 37 | Func conv_nchw("conv_nchw"); 38 | RDom filter_dom(0, kernel.dim(0).extent(), 0, kernel.dim(1).extent()); 39 | 40 | conv_nchw(x, y, depth, n) = bias(depth); 41 | conv_nchw(x, y, depth, n) += kernel(filter_dom.x, filter_dom.y, 0, depth) * 42 | inp_padded(x * stride + filter_dom.x, y * stride + filter_dom.y, depth, n); 43 | output(x, y, depth, n) = select(act >= 0, max(act, conv_nchw(x, y, depth, n)), conv_nchw(x, y, depth, n)); 44 | } 45 | 46 | void schedule() 47 | { 48 | if(auto_schedule) 49 | { 50 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 51 | kernel.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 52 | bias.set_estimates({{0, 512}}); 53 | 54 | stride.set_estimate(1); 55 | pad_width.set_estimate(1); 56 | pad_height.set_estimate(1); 57 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 58 | } 59 | } 60 | }; 61 | 62 | HALIDE_REGISTER_GENERATOR(halide_depthwise, halide_depthwise) 63 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/build.sh: -------------------------------------------------------------------------------- 1 | g++ direct_conv_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o direct_conv_gen 6 | 7 | ./direct_conv_gen -g halide_direct_conv -e c_header,assembly -o . 
8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/direct_conv.cpp: -------------------------------------------------------------------------------- 1 | #include "direct_conv.h" 2 | 3 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 4 | { 5 | return 0; 6 | } 7 | 8 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 9 | { 10 | struct node* ir_node = exec_node->ir_node; 11 | struct graph* ir_graph = ir_node->graph; 12 | struct tensor* input_tensor; 13 | struct tensor* weight_tensor; 14 | struct tensor* output_tensor; 15 | struct tensor* bias_tensor = NULL; 16 | int num_thread = exec_graph->num_thread; 17 | int cpu_affinity = exec_graph->cpu_affinity; 18 | 19 | /* set the input data and shape again, in case of reshape or dynamic shape */ 20 | input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 21 | weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 22 | output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 23 | if (ir_node->input_num > 2) 24 | bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); 25 | 26 | struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; 27 | 28 | float* input_buf = (float*)(input_tensor->data); 29 | float* weight_buf = (float*)(weight_tensor->data); 30 | float* output_buf = (float*)(output_tensor->data); 31 | float* bias = NULL; 32 | if (ir_node->input_num > 2) 33 | bias = (float*)(bias_tensor->data); 34 | 35 | if (exec_graph->mode == TENGINE_MODE_FP32) 36 | { 37 | int stride = conv_param->stride_h; 38 | int pad_width = conv_param->pad_w0; 39 | int pad_height = conv_param->pad_h0; 40 | int input_c = input_tensor->dims[1]; 41 | int act = conv_param->activation; 42 | 43 | Halide::Runtime::Buffer<float> input(input_buf, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 44 | Halide::Runtime::Buffer<float> filter(weight_buf, weight_tensor->dims[3], weight_tensor->dims[2], weight_tensor->dims[1], weight_tensor->dims[0]); 45 | Halide::Runtime::Buffer<float> output(output_buf, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 46 | Halide::Runtime::Buffer<float> bias1(bias, output_tensor->dims[1]); 47 | 48 | halide_direct_conv(input, filter, bias1, input_c, stride, pad_width, pad_height, act, output); 49 | } 50 | else 51 | { 52 | printf("Tengine work node not support %d\n", exec_graph->mode); 53 | return -1; 54 | } 55 | 56 | return 0; 57 | } 58 | 59 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 60 | { 61 | return 0; 62 | } 63 | 64 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 65 | { 66 | //printf("run halide postrun\n"); 67 | return 0; 68 | } 69 | 70 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 71 | { 72 | return 0; 73 | } 74 | 75 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 76 | { 77 | return 0; 78 | } 79 | 80 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 81 | { 82 | 83 | return 5003; 84 | } 85 | 86 | static struct node_ops hcl_node_ops = {.prerun = prerun, 87 | .run = run, 88 | .reshape = reshape, 89 | .postrun = 
postrun, 90 | .init_node = init_node, 91 | .release_node = release_node, 92 | .score = score}; 93 | 94 | int RegisterAutoKernelDirect_conv() 95 | { 96 | return register_builtin_node_ops(OP_CONV, &hcl_node_ops); 97 | } 98 | 99 | // static int unreg_conv_hcl_ops(void* arg) 100 | // { 101 | // unregister_builtin_node_ops(OP_CONV, &hcl_node_ops); 102 | // return 0; 103 | // } 104 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/direct_conv.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | extern "C" 4 | { 5 | #include "device/cpu/cpu_define.h" 6 | #include "device/cpu/cpu_node.h" 7 | #include "device/cpu/cpu_module.h" 8 | #include "device/cpu/cpu_graph.h" 9 | 10 | #include "api/c_api.h" 11 | #include "device/device.h" 12 | #include "graph/tensor.h" 13 | #include "graph/node.h" 14 | #include "graph/graph.h" 15 | #include "graph/subgraph.h" 16 | #include "executer/executer.h" 17 | #include "optimizer/split.h" 18 | #include "module/module.h" 19 | #include "utility/vector.h" 20 | #include "utility/log.h" 21 | #include "utility/sys_port.h" 22 | #include "defines.h" 23 | 24 | #include "operator/prototype/convolution_param.h" 25 | } 26 | #include "HalideBuffer.h" 27 | #include "halide_direct_conv.h" 28 | 29 | int RegisterAutoKernelDirect_conv(); 30 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/direct_conv_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_direct_conv : public Halide::Generator<halide_direct_conv> { 11 | public: 12 | Input<Buffer<float>> input{"input", 4}; 13 | Input<Buffer<float>> kernel{"kernel", 4}; 14 | Input<Buffer<float>> bias{"bias", 1}; 15 | 16 | Input<int> input_c{"input_depth"}; 17 | Input<int> stride{"stride"}; 18 | Input<int> pad_width{"pad_width"}; 19 | Input<int> pad_height{"pad_height"}; 20 | Input<int> act{"act"}; 21 | 22 | Output<Buffer<float>> relu{"relu", 4}; 23 | 24 | void generate() { 25 | /* THE ALGORITHM */ 26 | 27 | Var x("x"), y("y"), ci("ci"), n("n"), co("co"); 28 | 29 | Func inp_bounded = constant_exterior(input, //source 30 | 0, //value 31 | {{0, input.dim(0).extent()}, //boundary-dim0 w 32 | {0, input.dim(1).extent()}, //boundary-dim1 h 33 | {Expr(), Expr()}, //boundary-dim2 c 34 | {Expr(), Expr()}}); //boundary-dim3 n 35 | Func inp_padded("inp_padded"); 36 | inp_padded(x, y, ci, n) = inp_bounded(x - pad_width, y - pad_height, ci, n); 37 | 38 | Func conv_nchw("conv_nchw"); 39 | 40 | RDom r(0, kernel.dim(0).extent(), 0, kernel.dim(1).extent(), 0, input_c); 41 | 42 | conv_nchw(x, y, co, n) = bias(co); 43 | conv_nchw(x, y, co, n) += kernel(r[0], r[1], r[2], co) * 44 | inp_padded(x * stride + r[0], y * stride + r[1], r[2], n); 45 | 46 | relu(x, y, co, n) = select(act >= 0, max(act, conv_nchw(x, y, co, n)), conv_nchw(x, y, co, n)); 47 | /* 48 | if(act == 0) 49 | relu(x, y, co, n) = max(act, conv_nchw(x, y, co, n)); 50 | else 51 | relu(x, y, co, n) = conv_nchw(x, y, co, n); 52 | */ 53 | } 54 | 55 | void schedule() 56 | { 57 | if(auto_schedule) 58 | { 59 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 60 | kernel.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 512}}); 61 | bias.set_estimates({{0, 512}}); 62 | // input_c.set_estimate(64); 63 | stride.set_estimate(1); 
64 | pad_width.set_estimate(1); 65 | pad_height.set_estimate(1); 66 | relu.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 67 | } 68 | } 69 | 70 | 71 | }; 72 | HALIDE_REGISTER_GENERATOR(halide_direct_conv, halide_direct_conv) 73 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/build.sh: -------------------------------------------------------------------------------- 1 | g++ fc_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o fc_gen 6 | 7 | ./fc_gen -g halide_fc -e c_header,assembly -o . target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/fc.cpp: -------------------------------------------------------------------------------- 1 | #include "fc.h" 2 | 3 | // add helper data struct and functions here 4 | /* 5 | struct op_priv_info 6 | { 7 | 8 | }; 9 | */ 10 | 11 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 12 | { 13 | /* 14 | allocate helper memory for your op 15 | */ 16 | return 0; 17 | } 18 | 19 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 20 | { 21 | bool info_autokernel = false; 22 | const char* debug_env = std::getenv("DEBUG_INFO"); 23 | if((debug_env) && (debug_env[0] == '1')) 24 | { 25 | info_autokernel = true; 26 | } 27 | // step 1: get input and output 28 | struct node* ir_node = exec_node->ir_node; 29 | struct graph* ir_graph = ir_node->graph; 30 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 31 | struct tensor* weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 32 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 33 | struct tensor* bias_tensor = NULL; 34 | if (ir_node->input_num > 2) 35 | bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); 36 | struct fc_data* fc_param = ( struct fc_data* )ir_node->op.param_mem; 37 | 38 | float* input_buf = (float*)(input_tensor->data); 39 | float* weight_buf = (float*)(weight_tensor->data); 40 | float* output_buf = (float*)(output_tensor->data); 41 | float* bias = NULL; 42 | if(ir_node->input_num > 2) 43 | bias = (float*)(bias_tensor->data); 44 | 45 | if(exec_graph->mode == TENGINE_MODE_FP32) 46 | { 47 | Halide::Runtime::Buffer<float> input(input_buf, input_tensor->dims[1], input_tensor->dims[0]); 48 | Halide::Runtime::Buffer<float> weight(weight_buf, weight_tensor->dims[1], weight_tensor->dims[0]); 49 | Halide::Runtime::Buffer<float> output(output_buf, output_tensor->dims[1], output_tensor->dims[0]); 50 | Halide::Runtime::Buffer<float> bias1(bias, output_tensor->dims[1]); 51 | 52 | if(info_autokernel) printf("[INFO]:using halide fc...\n"); 53 | 54 | halide_fc(input, weight, bias1, output); 55 | } 56 | else 57 | { 58 | printf("Tengine work node not support %d\n", exec_graph->mode); 59 | return -1; 60 | } 61 | 62 | return 0; 63 | } 64 | 65 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 66 | { 67 | return 0; 68 | } 69 | 70 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 71 | { 72 | return 0; 73 | } 74 | 75 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 76 | { 77 | return 0; 78 | } 79 | 80 | static int release_node(struct node_ops* node_ops, struct 
exec_node* exec_node, struct exec_graph* exec_graph) 81 | { 82 | /* 83 | release the private info data for your op: 84 | void ops_priv; 85 | int shared_mem_size; 86 | int shared_pack4_mem_size; 87 | */ 88 | return 0; 89 | } 90 | 91 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 92 | { 93 | /* 94 | OPS_SCORE_STATIC 10000 95 | OPS_SCORE_BEST 8000 96 | OPS_SCORE_PREFER 6000 97 | OPS_SCORE_CANDO 4000 98 | OPS_SCORE_NOTSUP 2000 99 | */ 100 | return OPS_SCORE_STATIC; 101 | } 102 | 103 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 104 | .run = run, 105 | .reshape = reshape, 106 | .postrun = postrun, 107 | .init_node = init_node, 108 | .release_node = release_node, 109 | .score = score}; 110 | 111 | int RegisterAutoKernelFc() 112 | { 113 | return register_builtin_node_ops(OP_FC, &autokernel_node_ops); 114 | } 115 | 116 | //static int unreg_autokernel_ops(void* arg) 117 | //{ 118 | // unregister_builtin_node_ops(OP_FC, &autokernel_node_ops); 119 | // return 0; 120 | //} 121 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/fc.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_fc.h" 33 | 34 | int RegisterAutoKernelFc(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/fc_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_fc : public Halide::Generator<halide_fc> { 11 | public: 12 | // args 13 | Input<Buffer<float>> input{"input", 2}; 14 | Input<Buffer<float>> filter{"filter", 2}; 15 | Input<Buffer<float>> bias{"bias", 1}; 16 | Output<Buffer<float>> output{"output", 2}; 17 | 18 | void generate() 19 | { 20 | /* THE ALGORITHM */ 21 | const Expr hidden = input.width(); 22 | 23 | Var b("b"), co("co"); 24 | Func halide_fc("halide_fc"); 25 | RDom hi(0, hidden); 26 | halide_fc(co, b) = bias(co); 27 | halide_fc(co, b) += input(hi, b) * filter(hi, co); 28 | 29 | output(co, b) = halide_fc(co, b); 30 | } 31 | 32 | void schedule() 33 | { 34 | /* THE SCHEDULE */ 35 | input.set_estimates({{0, 512}, {0, 512}}); 36 | filter.set_estimates({{0, 512}, {0, 512}}); 37 | bias.set_estimates({{0, 512}}); 38 | output.set_estimates({{0, 512}, {0, 512}}); 39 | 40 | } 41 | }; 42 | 43 | HALIDE_REGISTER_GENERATOR(halide_fc, halide_fc) 44 | --------------------------------------------------------------------------------
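The generated halide_fc computes output(co, b) = bias(co) + sum over hi of input(hi, b) * filter(hi, co), i.e. one matrix-vector product per batch row. A minimal standalone driver, with hypothetical sizes chosen so the result is easy to check by hand:

#include "HalideBuffer.h"
#include "halide_fc.h"   // generated c_header

int main() {
    const int batch = 2, hidden = 16, out = 8;   // hypothetical shapes
    Halide::Runtime::Buffer<float> input(hidden, batch);
    Halide::Runtime::Buffer<float> filter(hidden, out);
    Halide::Runtime::Buffer<float> bias(out);
    Halide::Runtime::Buffer<float> output(out, batch);
    input.fill(1.0f);
    filter.fill(0.5f);
    bias.fill(0.0f);
    halide_fc(input, filter, bias, output);
    // every element of output should equal hidden * 1.0f * 0.5f = 8.0f
    return 0;
}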
/autokernel_plugin/src/im2col_conv/build.sh: -------------------------------------------------------------------------------- 1 | g++ im2col_conv_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o im2col_conv_gen 6 | 7 | ./im2col_conv_gen -g halide_im2col_conv -e c_header,assembly -o . target=host-no_runtime-no_asserts-no_bounds_query 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/im2col_conv/im2col_conv.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | #include "HalideBuffer.h" 29 | #include "halide_im2col_conv.h" 30 | 31 | int RegisterAutoKernelIm2col_conv(); 32 | -------------------------------------------------------------------------------- /autokernel_plugin/src/im2col_conv/im2col_conv_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include 3 | 4 | using namespace Halide; 5 | namespace { 6 | 7 | // Generator class for BLAS gemm operations. 
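// Background: an im2col convolution is lowered to exactly this kind of GEMM.
// With a conv of (in_c, out_c, k_h, k_w) over an out_h x out_w output, the
// matrices are (shape bookkeeping only, names illustrative):
//   A: M x K, M = out_c, K = in_c * k_h * k_w   (reshaped filters)
//   B: K x N, N = out_h * out_w                 (unfolded input patches)
// and C = A * B yields the out_c x (out_h * out_w) convolution output.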
8 | template<typename T> 9 | class GEMMGenerator : public Generator<GEMMGenerator<T>> { 10 | public: 11 | typedef Generator<GEMMGenerator<T>> Base; 12 | using Base::get_target; 13 | using Base::natural_vector_size; 14 | using Base::target; 15 | template<typename T2> 16 | using Input = typename Base::template Input<T2>; 17 | template<typename T2> 18 | using Output = typename Base::template Output<T2>; 19 | 20 | Input<Buffer<T>> A_ = {"A_", 2}; 21 | Input<Buffer<T>> B_ = {"B_", 2}; 22 | Output<Buffer<T>> result_ = {"result", 2}; 23 | 24 | void generate() { 25 | 26 | const Expr num_rows = A_.height(); //M A(K,M) 27 | const Expr num_cols = B_.width(); //N B(N,K) 28 | const Expr sum_size = A_.width(); //K 29 | 30 | const int vec = 8; 31 | const int s = vec * 2; 32 | 33 | Input<Buffer<T>> *A_in = &A_; 34 | Input<Buffer<T>> *B_in = &B_; 35 | 36 | 37 | Var i, j, ii, ji, jii, iii, io, jo, t; 38 | Var ti[3], tj[3]; 39 | 40 | Func A("A"), B("B"), Btmp("Btmp"), As("As"), Atmp("Atmp"), Bs("Bs"); 41 | Btmp(i, j) = BoundaryConditions::constant_exterior(*B_in, cast<T>(0))(i, j); 42 | 43 | Bs(i, j, io) = Btmp(io * s + i, j); 44 | B(i, j) = Bs(i % s, j, i / s); 45 | 46 | Atmp(i, j) = (*A_in)(i, j); 47 | A(i, j) = Atmp(i, j); 48 | 49 | Var k("k"); 50 | Func prod; 51 | prod(k, j, i) = A(k, i) * B(j, k); 52 | 53 | Func AB("AB"); 54 | RDom rv(0, sum_size); 55 | AB(i, j) += prod(rv, i, j); 56 | 57 | 58 | result_(i, j) = AB(i, j); 59 | 60 | //schedule 61 | result_.tile(i, j, ti[1], tj[1], i, j, 2 * s, 2 * s, TailStrategy::GuardWithIf); 62 | result_ 63 | .tile(i, j, ii, ji, s, 4) 64 | .tile(i, j, ti[0], tj[0], i, j, 1, s / 4); 65 | 66 | result_.specialize(num_rows >= 512 && num_cols >= 512) 67 | .fuse(tj[1], ti[1], t) 68 | .parallel(t); 69 | 70 | result_.specialize(num_rows >= 128 && num_cols >= 128) 71 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 2, 2) 72 | .fuse(tj[2], ti[2], t) 73 | .parallel(t); 74 | 75 | //long N 76 | result_.specialize(num_rows >= 64 && num_cols >= 8000) 77 | .parallel(ti[1], 4); 78 | result_.specialize(num_rows >= 64 && num_cols >= 256) 79 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 4, 2) 80 | .parallel(ti[2]); 81 | result_.specialize(num_rows >= 64 && num_cols >= 128) 82 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 2, 2) 83 | .fuse(tj[2], ti[2], t) 84 | .parallel(t); 85 | //long M 86 | result_.specialize(num_rows >= 512 && num_cols >= 32) 87 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 1, 4) 88 | .parallel(tj[2]); 89 | // long N 90 | result_.specialize(num_rows >= 32 && num_cols >= 8000) 91 | .parallel(ti[1], 8); 92 | result_.specialize(num_rows >= 16 && num_cols >= 256) 93 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 4, 1) 94 | .parallel(ti[2]); 95 | result_.specialize(num_rows >= 16 && num_cols >= 128) 96 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 2, 1) 97 | .fuse(tj[2], ti[2], t) 98 | .parallel(t); 99 | 100 | // 101 | result_.rename(tj[0], t); 102 | result_.bound(i, 0, num_cols).bound(j, 0, num_rows); 103 | 104 | AB.compute_at(result_, i) 105 | .bound_extent(j, 4) 106 | .unroll(j) 107 | .bound_extent(i, s) 108 | .vectorize(i) 109 | .update() 110 | .reorder(i, j, rv) 111 | .unroll(j) 112 | .unroll(rv, 2) 113 | .vectorize(i); 114 | 115 | Bs.compute_root() 116 | .split(j, jo, ji, s) 117 | .reorder(i, ji, io, jo) 118 | .unroll(i) 119 | .vectorize(ji); 120 | Bs.specialize(B_.width() >= 256 && B_.height() >= 64) 121 | .parallel(jo, 4); 122 | 123 | Btmp.compute_at(Bs, io) 124 | .vectorize(i) 125 | .unroll(j); 126 | 127 | A_.dim(0).set_min(0).dim(1).set_min(0); 128 | B_.dim(0).set_min(0).dim(1).set_bounds(0, sum_size); 129 | result_.dim(0).set_bounds(0, num_cols).dim(1).set_bounds(0, num_rows); 
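// Note: Halide evaluates specialize() predicates in the order they were
// defined, like an if/else-if chain, so the large-shape tilings above take
// precedence and the un-specialized schedule serves as the fallback path.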
130 | } 131 | }; 132 | 133 | } // namespace 134 | 135 | HALIDE_REGISTER_GENERATOR(GEMMGenerator<float>, halide_im2col_conv) 136 | 137 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/build.sh: -------------------------------------------------------------------------------- 1 | g++ normalize_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o normalize_gen 6 | 7 | ./normalize_gen -g halide_normalize -e c_header,assembly -o . target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/normalize.cpp: -------------------------------------------------------------------------------- 1 | #include "normalize.h" 2 | 3 | // add helper data struct and functions here 4 | /* 5 | struct op_priv_info 6 | { 7 | 8 | }; 9 | */ 10 | 11 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 12 | { 13 | /* 14 | allocate helper memory for your op 15 | */ 16 | return 0; 17 | } 18 | 19 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 20 | { 21 | bool info_autokernel = false; 22 | const char* debug_env = std::getenv("DEBUG_INFO"); 23 | if((debug_env) && (debug_env[0] == '1')) 24 | { 25 | info_autokernel = true; 26 | } 27 | // step 1: get input and output 28 | struct node* ir_node = exec_node->ir_node; 29 | struct graph* ir_graph = ir_node->graph; 30 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 31 | struct tensor* scale_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 32 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 33 | 34 | float* input_buf = (float*)(input_tensor->data); 35 | float* scale_buf = (float*)(scale_tensor->data); 36 | float* output_buf = (float*)(output_tensor->data); 37 | 38 | if(exec_graph->mode == TENGINE_MODE_FP32) 39 | { 40 | Halide::Runtime::Buffer<float> input(input_buf, 41 | input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 42 | Halide::Runtime::Buffer<float> scale(scale_buf, scale_tensor->dims[0]); 43 | Halide::Runtime::Buffer<float> output(output_buf, 44 | output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 45 | 46 | if(info_autokernel) printf("[INFO]:using halide normalize...\n"); 47 | 48 | halide_normalize(input, scale, output); 49 | } 50 | else 51 | { 52 | printf("Tengine work node not support %d\n", exec_graph->mode); 53 | return -1; 54 | } 55 | 56 | return 0; 57 | } 58 | 59 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 60 | { 61 | return 0; 62 | } 63 | 64 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 65 | { 66 | return 0; 67 | } 68 | 69 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 70 | { 71 | return 0; 72 | } 73 | 74 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 75 | { 76 | /* 77 | release the private info data for your op: 78 | void ops_priv; 79 | int shared_mem_size; 80 | int shared_pack4_mem_size; 81 | */ 82 | return 0; 83 | } 84 | 85 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 86 | { 87 | /* 88 | 
OPS_SCORE_STATIC 10000 89 | OPS_SCORE_BEST 8000 90 | OPS_SCORE_PREFER 6000 91 | OPS_SCORE_CANDO 4000 92 | OPS_SCORE_NOTSUP 2000 93 | */ 94 | return OPS_SCORE_STATIC; 95 | } 96 | 97 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 98 | .run = run, 99 | .reshape = reshape, 100 | .postrun = postrun, 101 | .init_node = init_node, 102 | .release_node = release_node, 103 | .score = score}; 104 | 105 | int RegisterAutoKernelNormalize() 106 | { 107 | return register_builtin_node_ops(OP_NORMALIZE, &autokernel_node_ops); 108 | } 109 | 110 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/normalize.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/normalize_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_normalize.h" 33 | 34 | int RegisterAutoKernelNormalize(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/normalize_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_normalize:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input> scale{"scale", 1}; 15 | Output> output{"output", 4}; 16 | 17 | void generate() 18 | { 19 | /* THE ALGORITHM */ 20 | const Expr channel_number = input.dim(2).extent(); 21 | 22 | Var n("n"), c("c"), h("h"), w("w"); 23 | RDom cn(0, channel_number); 24 | Func channel_reduce("channel_reduce"); 25 | channel_reduce(w, h, n) += input(w, h, cn, n) * input(w, h, cn, n); 26 | channel_reduce(w, h, n) = 1.f / sqrt(channel_reduce(w, h, n)); 27 | 28 | Func halide_normalize("halide_normalize"); 29 | halide_normalize(w, h, c, n) = channel_reduce(w, h, n) * scale(c) * input(w, h, c, n); 30 | 31 | output(w, h, c, n) = halide_normalize(w, h, c, n); 32 | } 33 | 34 | void schedule() 35 | { 36 | /* THE SCHEDULE */ 37 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 512}}); 38 | scale.set_estimates({{0, 512}}); 39 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 512}}); 40 | } 41 | }; 42 | 43 | HALIDE_REGISTER_GENERATOR(halide_normalize, halide_normalize) 44 | -------------------------------------------------------------------------------- /autokernel_plugin/src/plugin_init.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "pool/pool.h" 3 | #include "direct_conv/direct_conv.h" 4 | #include "im2col_conv/im2col_conv.h" 5 | #include 
"fc/fc.h" 6 | #include "depthwise/depthwise.h" 7 | #include "softmax/softmax.h" 8 | #include "normalize/normalize.h" 9 | 10 | extern "C" int autokernel_plugin_init(void) 11 | { 12 | /* register halide operator */ 13 | RegisterAutoKernelDepthwise(); 14 | RegisterAutoKernelSoftmax(); 15 | RegisterAutoKernelFc(); 16 | RegisterAutoKernelPool(); 17 | RegisterAutoKernelDirect_conv(); 18 | RegisterAutoKernelIm2col_conv(); 19 | RegisterAutoKernelNormalize(); 20 | printf("AutoKernel plugin inited\n"); 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/avepool_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_avepool:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input stride{"stride"}; 15 | Input pad_width{"pad_width"}; 16 | Input pad_height{"height"}; 17 | Input kernel_w{"kernel_w"}; 18 | Input kernel_h{"kernel_h"}; 19 | Output> output{"output", 4}; 20 | 21 | void generate() 22 | { 23 | /* THE ALGORITHM */ 24 | Var x("x"), y("y"), c("c"), n("n"); 25 | 26 | constexpr float kMinValue = -3.4028235e38; 27 | Func input_bounded = constant_exterior(input, kMinValue, 28 | {{0, input.dim(0).extent()}, 29 | {0, input.dim(1).extent()}, 30 | {Expr(), Expr()}, 31 | {Expr(), Expr()}, 32 | }); 33 | Func input_padded("input_padded"); 34 | input_padded(x, y, c, n) = input_bounded(x - pad_width, y - pad_height, c, n); 35 | 36 | Func sum("sum"); 37 | RDom filter_dom(0, kernel_w, 0, kernel_h); 38 | sum(x, y, c, n) += select( 39 | stride == 1, 40 | input_padded(x + filter_dom.x, y + filter_dom.y, c, n), 41 | input_padded(x * stride + filter_dom.x, y * stride + filter_dom.y, c, n) ); 42 | Expr in_x_origin = x * stride - pad_width; 43 | Expr x_start = max(0, -in_x_origin); 44 | Expr x_end = min(kernel_w, input.dim(0).extent() - in_x_origin); 45 | 46 | Expr in_y_origin = y * stride - pad_height; 47 | Expr y_start = max(0, -in_y_origin); 48 | Expr y_end = min(kernel_h, input.dim(1).extent() - in_y_origin); 49 | 50 | Expr filter_count = (x_end - x_start) * (y_end - y_start); 51 | 52 | output(x, y, c, n) = sum(x, y, c, n) / filter_count; 53 | } 54 | 55 | void schedule() 56 | { 57 | if(auto_schedule) 58 | { 59 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 60 | stride.set_estimate(1); 61 | pad_width.set_estimate(1); 62 | pad_height.set_estimate(1); 63 | kernel_w.set_estimate(1); 64 | kernel_h.set_estimate(1); 65 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 66 | } 67 | } 68 | }; 69 | 70 | HALIDE_REGISTER_GENERATOR(halide_avepool, halide_avepool) 71 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/build.sh: -------------------------------------------------------------------------------- 1 | g++ maxpool_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o maxpool_gen 6 | 7 | ./maxpool_gen -g halide_maxpool -e c_header,assembly -o . 
target=host 8 | 9 | g++ avepool_gen.cc ../../common/GenGen.cpp \ 10 | -I ${HALIDE_DIR}/include \ 11 | -L ${HALIDE_DIR}/lib \ 12 | -lHalide -std=c++11 -fno-rtti \ 13 | -o avepool_gen 14 | 15 | ./avepool_gen -g halide_avepool -e c_header,assembly -o . target=host 16 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/maxpool_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_maxpool:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input stride{"stride"}; 15 | Input pad_width{"pad_width"}; 16 | Input pad_height{"height"}; 17 | Input kernel_w{"kernel_w"}; 18 | Input kernel_h{"kernel_h"}; 19 | Output> output{"output", 4}; 20 | 21 | void generate() 22 | { 23 | /* THE ALGORITHM */ 24 | Var x("x"), y("y"), c("c"), n("n"); 25 | 26 | constexpr float kMinValue = -3.4028235e38; 27 | Func input_bounded = constant_exterior(input, kMinValue, 28 | {{0, input.dim(0).extent()}, 29 | {0, input.dim(1).extent()}, 30 | {Expr(), Expr()}, 31 | {Expr(), Expr()}, 32 | }); 33 | Func input_padded("input_padded"); 34 | input_padded(x, y, c, n) = input_bounded(x - pad_width, y - pad_height, c, n); 35 | 36 | Func local_max("local_max"); 37 | RDom filter_dom(0, kernel_w, 0, kernel_h); 38 | local_max(x, y, c, n) = maximum(select( 39 | stride == 1, 40 | input_padded(x + filter_dom.x, y + filter_dom.y, c, n), 41 | input_padded(x * stride + filter_dom.x, y * stride + filter_dom.y, c, n) )); 42 | output(x, y, c, n) = local_max(x, y, c, n); 43 | 44 | } 45 | 46 | void schedule() 47 | { 48 | if(auto_schedule) 49 | { 50 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 51 | stride.set_estimate(1); 52 | pad_width.set_estimate(1); 53 | pad_height.set_estimate(1); 54 | kernel_w.set_estimate(1); 55 | kernel_h.set_estimate(1); 56 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 57 | } 58 | } 59 | }; 60 | 61 | HALIDE_REGISTER_GENERATOR(halide_maxpool, halide_maxpool) 62 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/pool.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | step 1: init_node 4 | init the private info data for your op, if no need, skip this 5 | step 2: prerun 6 | allocate helper memory for your op, if no need, skip this 7 | step 3: run 8 | complete the run function to use the function generated by autokernel 9 | step 4: postrun 10 | release helper memory you allocated for your op, if no need, skip this 11 | step 5: release_node 12 | release the private info data you allocated for your op, if no need, skip this 13 | step 6: score 14 | adjust you score priority strategy, default score value is defined in cpu_node_ops.h 15 | step 7: reshape 16 | reshape output tensor, if no need, skip this 17 | step 8: register op 18 | change register func name and called in init.cpp 19 | */ 20 | 21 | #include "pool.h" 22 | 23 | // add helper data struct and functions here 24 | /* 25 | struct op_priv_info 26 | { 27 | 28 | }; 29 | */ 30 | 31 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 32 | { 33 | /* 34 | allocate helper memory for your op 35 | */ 36 | 
return 0; 37 | } 38 | 39 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 40 | { 41 | bool info_autokernel = false; 42 | const char* debug_env = std::getenv("DEBUG_INFO"); 43 | if((debug_env) && (debug_env[0] == '1')) 44 | { 45 | info_autokernel = true; 46 | } 47 | // step 1: get input and output 48 | struct node* ir_node = exec_node->ir_node; 49 | struct graph* ir_graph = ir_node->graph; 50 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 51 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 52 | 53 | // get op private data info (if needed) 54 | // struct op_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; 55 | 56 | // step 2: get op params (if needed), the op_param struct is defined in src/op/ 57 | // struct op_param* op_param = ( struct conv_param* )ir_node->op.param_mem; 58 | // DTYPE [param_list] = conv_param->param_list; 59 | struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; 60 | int stride = pool_param->stride_h; 61 | int pad_width = pool_param->pad_w0; 62 | int pad_height = pool_param->pad_h0; 63 | int kernel_w = pool_param->kernel_w; 64 | int kernel_h = pool_param->kernel_h; 65 | 66 | // step 3: call the func generated by Autokernel 67 | if (exec_graph->mode == TENGINE_MODE_FP32) 68 | { 69 | Halide::Runtime::Buffer input((float*)input_tensor->data, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 70 | Halide::Runtime::Buffer output((float*)output_tensor->data, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 71 | if (pool_param->pool_method == 0) 72 | { 73 | // maxpooling 74 | halide_maxpool(input, stride, pad_width, pad_height, kernel_w, kernel_h, output); 75 | if(info_autokernel)printf("[INFO]: runing Autokernel halide_maxpool...\n"); 76 | } 77 | else if (pool_param->pool_method == 1) 78 | { 79 | // average pooling 80 | halide_avepool(input, stride, pad_width, pad_height, kernel_w, kernel_h, output); 81 | if(info_autokernel)printf("[INFO]: runing Autokernel halide_avgpool...\n"); 82 | } 83 | 84 | } 85 | else 86 | { 87 | printf("Tengine work node with halide plugin not support %d\n", exec_graph->mode); 88 | return -1; 89 | } 90 | 91 | return 0; 92 | } 93 | 94 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 95 | { 96 | return 0; 97 | } 98 | 99 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 100 | { 101 | //printf("run halide postrun\n"); 102 | return 0; 103 | } 104 | 105 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 106 | { 107 | /* 108 | init the private info data for your op: 109 | void ops_priv; 110 | int shared_mem_size; 111 | int shared_pack4_mem_size; 112 | */ 113 | return 0; 114 | } 115 | 116 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 117 | { 118 | /* 119 | release the private info data for your op: 120 | void ops_priv; 121 | int shared_mem_size; 122 | int shared_pack4_mem_size; 123 | */ 124 | return 0; 125 | } 126 | 127 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 128 | { 129 | /* 130 | OPS_SCORE_STATIC 10000 131 | OPS_SCORE_BEST 8000 132 | OPS_SCORE_PREFER 6000 133 | OPS_SCORE_CANDO 4000 134 
| OPS_SCORE_NOTSUP 2000 135 | */ 136 | return OPS_SCORE_STATIC; 137 | } 138 | 139 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 140 | .run = run, 141 | .reshape = reshape, 142 | .postrun = postrun, 143 | .init_node = init_node, 144 | .release_node = release_node, 145 | .score = score}; 146 | 147 | int RegisterAutoKernelPool() 148 | { 149 | return register_builtin_node_ops(OP_POOL, &autokernel_node_ops); 150 | } 151 | 152 | //static int unreg_autokernel_ops(void* arg) 153 | //{ 154 | // unregister_builtin_node_ops(OP_POOL, &autokernel_node_ops); 155 | // return 0; 156 | //} 157 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/pool.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/pooling_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_maxpool.h" 33 | #include "halide_avepool.h" 34 | 35 | int RegisterAutoKernelPool(); 36 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/build.sh: -------------------------------------------------------------------------------- 1 | g++ softmax_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o softmax_gen 6 | 7 | ./softmax_gen -g halide_softmax -e c_header,assembly -o . 
target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/softmax.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | step 1: init_node 4 | init the private info data for your op, if no need, skip this 5 | step 2: prerun 6 | allocate helper memory for your op, if no need, skip this 7 | step 3: run 8 | complete the run function to use the function generated by autokernel 9 | step 4: postrun 10 | release helper memory you allocated for your op, if no need, skip this 11 | step 5: release_node 12 | release the private info data you allocated for your op, if no need, skip this 13 | step 6: score 14 | adjust you score priority strategy, default score value is defined in cpu_node_ops.h 15 | step 7: reshape 16 | reshape output tensor, if no need, skip this 17 | step 8: register op 18 | change register func name and called in init.cpp 19 | */ 20 | 21 | #include "softmax.h" 22 | 23 | // add helper data struct and functions here 24 | /* 25 | struct op_priv_info 26 | { 27 | 28 | }; 29 | */ 30 | 31 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 32 | { 33 | /* 34 | allocate helper memory for your op 35 | */ 36 | return 0; 37 | } 38 | 39 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 40 | { 41 | bool info_autokernel = false; 42 | const char* debug_env = std::getenv("DEBUG_INFO"); 43 | if((debug_env) && (debug_env[0] == '1')) 44 | { 45 | info_autokernel = true; 46 | } 47 | // step 1: get input and output 48 | struct node* ir_node = exec_node->ir_node; 49 | struct graph* ir_graph = ir_node->graph; 50 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 51 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 52 | 53 | // get op private data info (if needed) 54 | // struct op_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; 55 | 56 | // struct softmax_param* softmax_param=(strcut softmax_param*)ir_node->op.param_mem; 57 | 58 | // step 2: get op params (if needed), the op_param struct is defined in src/op/ 59 | // struct op_param* op_param = ( struct conv_param* )ir_node->op.param_mem; 60 | // DTYPE [param_list] = conv_param->param_list; 61 | 62 | // step 3: call the func generated by Autokernel 63 | Halide::Runtime::Buffer input((float*)input_tensor->data, input_tensor->dims[1], input_tensor->dims[0]); 64 | Halide::Runtime::Buffer output((float*)output_tensor->data, output_tensor->dims[1], output_tensor->dims[0]); 65 | 66 | halide_softmax(input, output); 67 | 68 | if(info_autokernel)printf("[INFO]: runing Autokernel halide_softmax...\n"); 69 | return 0; 70 | } 71 | 72 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 73 | { 74 | return 0; 75 | } 76 | 77 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 78 | { 79 | /* 80 | release the helper memory you 81 | */ 82 | return 0; 83 | } 84 | 85 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 86 | { 87 | /* 88 | init the private info data for your op: 89 | void ops_priv; 90 | int shared_mem_size; 91 | int shared_pack4_mem_size; 92 | */ 93 | return 0; 94 | } 95 | 96 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct 
exec_graph* exec_graph) 97 | { 98 | /* 99 | release the private info data for your op: 100 | void ops_priv; 101 | int shared_mem_size; 102 | int shared_pack4_mem_size; 103 | */ 104 | return 0; 105 | } 106 | 107 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 108 | { 109 | /* 110 | OPS_SCORE_STATIC 10000 111 | OPS_SCORE_BEST 8000 112 | OPS_SCORE_PREFER 6000 113 | OPS_SCORE_CANDO 4000 114 | OPS_SCORE_NOTSUP 2000 115 | */ 116 | return OPS_SCORE_STATIC; 117 | } 118 | 119 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 120 | .run = run, 121 | .reshape = reshape, 122 | .postrun = postrun, 123 | .init_node = init_node, 124 | .release_node = release_node, 125 | .score = score}; 126 | 127 | int RegisterAutoKernelSoftmax() 128 | { 129 | return register_builtin_node_ops(OP_SOFTMAX, &autokernel_node_ops); 130 | } 131 | 132 | //static int unreg_autokernel_ops(void* arg) 133 | //{ 134 | // unregister_builtin_node_ops(OP_SOFTMAX, &autokernel_node_ops); 135 | // return 0; 136 | //} 137 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/softmax.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/softmax_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_softmax.h" 33 | 34 | int RegisterAutoKernelSoftmax(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/softmax_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_softmax:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 2}; 14 | 15 | Output> output{"output", 2}; 16 | 17 | void generate() 18 | { 19 | /* THE ALGORITHM */ 20 | const Expr num_classes=input.width(); 21 | Var in("in"), n("n"); 22 | Func expInput; 23 | RDom r(0,num_classes); 24 | expInput(in, n) = exp(input(in, n)); 25 | Expr globalSum=sum(expInput(r.x,n)); 26 | 27 | 28 | output(in,n)=expInput(in,n)/globalSum; 29 | } 30 | 31 | void schedule() 32 | { 33 | /* THE SCHEDULE */ 34 | input.set_estimates({{0, 512}, {0, 512}}); 35 | output.set_estimates({{0, 512}, {0, 512}}); 36 | } 37 | }; 38 | 39 | HALIDE_REGISTER_GENERATOR(halide_softmax, halide_softmax) -------------------------------------------------------------------------------- /autokernel_plugin/template/build.sh: -------------------------------------------------------------------------------- 1 | g++ OP_NAME_gen.cc 
../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o OP_NAME_gen 6 | 7 | ./OP_NAME_gen -g halide_OP_NAME -e c_header,assembly -o . target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/template/generator.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class Halide_Func_Name:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input param{"param"}; 15 | 16 | Output> output{"output", 4}; 17 | 18 | void generate() 19 | { 20 | /* THE ALGORITHM */ 21 | Var x("x"), y("y"), c("c"), n("n"); 22 | Func Halide_Func_Name("Halide_Func_Name"); 23 | Halide_Func_Name(c, x, y, n) = input(c, x, y, n); 24 | 25 | output(c, x, y, n) = select(param >= 0, max(param, Halide_Func_Name(c, x, y, n)), Halide_Func_Name(c, x, y, n)); 26 | } 27 | 28 | void schedule() 29 | { 30 | /* THE SCHEDULE */ 31 | } 32 | }; 33 | 34 | HALIDE_REGISTER_GENERATOR(Halide_Func_Name, Halide_Func_Name) 35 | -------------------------------------------------------------------------------- /autokernel_plugin/template/template.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | step 1: init_node 4 | init the private info data for your op, if no need, skip this 5 | step 2: prerun 6 | allocate helper memory for your op, if no need, skip this 7 | step 3: run 8 | complete the run function to use the function generated by autokernel 9 | step 4: postrun 10 | release helper memory you allocated for your op, if no need, skip this 11 | step 5: release_node 12 | release the private info data you allocated for your op, if no need, skip this 13 | step 6: score 14 | adjust you score priority strategy, default score value is defined in cpu_node_ops.h 15 | step 7: reshape 16 | reshape output tensor, if no need, skip this 17 | step 8: register op 18 | change register func name and called in init.cpp 19 | */ 20 | 21 | #include "template.h" 22 | 23 | // add helper data struct and functions here 24 | /* 25 | struct op_priv_info 26 | { 27 | 28 | }; 29 | */ 30 | 31 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 32 | { 33 | /* 34 | allocate helper memory for your op 35 | */ 36 | return 0; 37 | } 38 | 39 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 40 | { 41 | bool info_autokernel = false; 42 | const char* debug_env = std::getenv("DEBUG_INFO"); 43 | if((debug_env) && (debug_env[0] == '1')) 44 | { 45 | info_autokernel = true; 46 | } 47 | // step 1: get input and output 48 | struct node* ir_node = exec_node->ir_node; 49 | struct graph* ir_graph = ir_node->graph; 50 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 51 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 52 | 53 | // get op private data info (if needed) 54 | // struct op_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; 55 | 56 | // step 2: get op params (if needed), the op_param struct is defined in src/op/ 57 | // struct op_param* op_param = ( struct conv_param* )ir_node->op.param_mem; 
58 | // DTYPE [param_list] = conv_param->param_list; 59 | 60 | // step 3: call the func generated by Autokernel 61 | Halide::Runtime::Buffer input((float*)input_tensor->data, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 62 | Halide::Runtime::Buffer output((float*)output_tensor->data, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 63 | 64 | int param = 0; 65 | AutoKernel_Func(input, param, output); 66 | 67 | if(info_autokernel)printf("[INFO]: runing Autokernel AutoKernel_Func...\n"); 68 | return 0; 69 | } 70 | 71 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 72 | { 73 | return 0; 74 | } 75 | 76 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 77 | { 78 | /* 79 | release the helper memory you 80 | */ 81 | return 0; 82 | } 83 | 84 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 85 | { 86 | /* 87 | init the private info data for your op: 88 | void ops_priv; 89 | int shared_mem_size; 90 | int shared_pack4_mem_size; 91 | */ 92 | return 0; 93 | } 94 | 95 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 96 | { 97 | /* 98 | release the private info data for your op: 99 | void ops_priv; 100 | int shared_mem_size; 101 | int shared_pack4_mem_size; 102 | */ 103 | return 0; 104 | } 105 | 106 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 107 | { 108 | /* 109 | OPS_SCORE_STATIC 10000 110 | OPS_SCORE_BEST 8000 111 | OPS_SCORE_PREFER 6000 112 | OPS_SCORE_CANDO 4000 113 | OPS_SCORE_NOTSUP 2000 114 | */ 115 | return OPS_SCORE_STATIC; 116 | } 117 | 118 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 119 | .run = run, 120 | .reshape = reshape, 121 | .postrun = postrun, 122 | .init_node = init_node, 123 | .release_node = release_node, 124 | .score = score}; 125 | 126 | int RegisterAutoKernelOP() 127 | { 128 | return register_builtin_node_ops(OP_CONV, &autokernel_node_ops); 129 | } 130 | 131 | // int unreg_autokernel_ops(void* arg) 132 | // { 133 | // unregister_builtin_node_ops(OP_CONV, &autokernel_node_ops); 134 | // return 0; 135 | // } 136 | -------------------------------------------------------------------------------- /autokernel_plugin/template/template.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in operator/prototype/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "AutoKernel_Func.h" 33 | 34 | int RegisterAutoKernelOP(); 35 | 
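Once the template above is filled in, the remaining integration step is registering the new op alongside the others in src/plugin_init.cpp. A hypothetical sketch for an op named relu (the header path and function name are assumptions, mirroring the existing registrations):

#include "relu/relu.h"           // hypothetical new op header

extern "C" int autokernel_plugin_init(void)
{
    /* ... existing RegisterAutoKernel*() calls ... */
    RegisterAutoKernelRelu();    // hypothetical, added for the new op
    printf("AutoKernel plugin inited\n");
    return 0;
}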
-------------------------------------------------------------------------------- /autokernel_plugin/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(test) 2 | 3 | include_directories(${TENGINE_DIR}/include) 4 | include_directories(./common) 5 | link_directories(${TENGINE_DIR}/lib) 6 | 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 8 | set(LINK_LIBS tengine-lite) 9 | set(CMAKE_EXE_LINKER_FLAGS "-rdynamic -ldl") 10 | 11 | add_executable(test_conv test_conv.cpp) 12 | target_link_libraries(test_conv ${LINK_LIBS}) 13 | 14 | add_executable(test_depthwise test_depthwise.cpp) 15 | target_link_libraries(test_depthwise ${LINK_LIBS}) 16 | 17 | add_executable(test_fc test_fc.cpp) 18 | target_link_libraries(test_fc ${LINK_LIBS}) 19 | 20 | add_executable(test_pool test_pool.cpp) 21 | target_link_libraries(test_pool ${LINK_LIBS}) 22 | 23 | add_executable(test_softmax test_softmax.cpp) 24 | target_link_libraries(test_softmax ${LINK_LIBS}) 25 | 26 | add_executable(test_normalize test_normalize.cpp) 27 | target_link_libraries(test_normalize ${LINK_LIBS}) 28 | 29 | add_executable(tm_classification tm_classification.cpp common/tengine_operations.cpp) 30 | target_link_libraries (tm_classification ${LINK_LIBS}) 31 | -------------------------------------------------------------------------------- /autokernel_plugin/tests/common/utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __UTILS_HPP__ 2 | #define __UTILS_HPP__ 3 | 4 | #include <cstdio> 5 | #include <string> 6 | int is_file_exist(std::string file_name); 7 | int is_file_exist(std::string file_name) 8 | { 9 | FILE* fp = fopen(file_name.c_str(), "r"); 10 | if (!fp) 11 | { 12 | return 0; 13 | } 14 | fclose(fp); 15 | return 1; 16 | } 17 | #endif // __UTILS_HPP__ -------------------------------------------------------------------------------- /doc/add_op.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/add_op.png -------------------------------------------------------------------------------- /doc/architecture-en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/architecture-en.png -------------------------------------------------------------------------------- /doc/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/architecture.png -------------------------------------------------------------------------------- /doc/how_to_add_op.md: -------------------------------------------------------------------------------- 1 | ## How to quickly develop a new auto-optimized operator 2 | 3 | Developing an operator that Tengine can use with AutoKernel takes two steps: 4 | 1. Generate: write the algorithm description and the schedule, and generate optimized operator code for the target backend 5 | 6 | 2. Deploy: integrate the generated operator code into Tengine as a plugin 7 | 8 | -------------------------- 9 | This tutorial takes the Relu operator as an example and shows how to quickly develop an auto-optimized operator usable by Tengine. 10 | 11 | ![add_op.png](add_op.png) 12 | ### 1. Run `register_op.sh` to generate the template files automatically 13 | We provide a script that generates, from templates, the source files and build scripts these two steps need. 14 | ``` 15 | cd AutoKernel/autokernel_plugin 16 | chmod +x -R . 17 | ./scripts/register_op.sh 18 | ``` 19 | Fill in at the prompts: 20 | ``` 21 | op_name: relu 22 | op_type: OP_RELU 23 | ``` 24 | This produces the following files: 25 | ``` 26 | src/relu/relu.cpp 27 | src/relu/relu.h 28 | src/relu/relu_gen.cc 29 | src/relu/build.sh 30 | ``` 31 | ### 2. Generate: edit the generator file `relu_gen.cc` 32 | This file generates the operator's assembly code. It describes the operator's computation and its schedule in the Halide language. 33 | In this example the schedule is left empty. 34 | 35 | ``` 36 | class halide_relu : public Halide::Generator<halide_relu> { 37 | public: 38 | // args 39 | Input<Buffer<float>> input{"input", 4}; 40 | Input<int> param{"param"}; 41 | 42 | Output<Buffer<float>> output{"output", 4}; 43 | 44 | void generate() 45 | { 46 | /* THE ALGORITHM */ 47 | Var w("w"), h("h"), c("c"), n("n"); 48 | Func halide_relu("halide_relu"); 49 | halide_relu(w, h, c, n) = input(w, h, c, n); 50 | 51 | output(w, h, c, n) = select(param >= 0, max(param, halide_relu(w, h, c, n)), halide_relu(w, h, c, n)); 52 | } 53 | 54 | void schedule() 55 | { 56 | /* THE SCHEDULE */ 57 | } 58 | }; 59 | 60 | ```
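Although the schedule above is left empty, a simple hand schedule can be dropped in later without touching the algorithm. For example (illustrative only; Halide matches `Var`s by name, so redeclaring `w` and `c` here refers to the same loop variables used in `generate()`):

```
void schedule()
{
    /* illustrative hand schedule, not part of the generated template */
    Var w("w"), c("c");
    output.vectorize(w, 4).parallel(c);
}
```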
61 | ### 3. Deploy: edit `auto_relu.cpp`, then build `AutoKernel.so` in one go 62 | 63 | ``` 64 | ./scripts/generate.sh # generate the .s/.h files needed by all operators in one step 65 | mkdir build 66 | cd build 67 | cmake .. 68 | make -j4 69 | ``` 70 | 71 | ### 4. Test 72 | 73 | The test case below is for reference only 74 | 75 | ``` 76 | #include "HalideBuffer.h" 77 | #include <iostream> 78 | #include "halide_relu.h" 79 | 80 | int main(int argc, char **argv) 81 | { 82 | int C = 1, W = 4, H = 4, N = 1; 83 | Halide::Runtime::Buffer<float> input_tensor(nullptr, W, H, C, N); 84 | Halide::Runtime::Buffer<float> output_tensor(nullptr, W, H, C, N); 85 | input_tensor.allocate(); 86 | output_tensor.allocate(); 87 | input_tensor.for_each_value([](float &x) { 88 | x = 2.0 * rand() / RAND_MAX - 1.0; 89 | }); 90 | 91 | output_tensor.for_each_value([](float &x) { 92 | x = 2.0 * rand() / RAND_MAX - 1.0; 93 | }); 94 | 95 | halide_relu(input_tensor, 0, output_tensor); 96 | 97 | printf("input:\n"); 98 | for (int c = 0; c < input_tensor.dim(3).extent(); c++) { 99 | for (int z = 0; z < input_tensor.channels(); z++) { 100 | for (int y = 0; y < input_tensor.height(); y++) { 101 | for (int x = 0; x < input_tensor.width(); x++) { 102 | std::cout << input_tensor(x, y, z, c) << " "; 103 | } 104 | std::cout << std::endl; 105 | } 106 | } 107 | } 108 | printf("output:\n"); 109 | for (int c = 0; c < output_tensor.dim(3).extent(); c++) { 110 | for (int z = 0; z < output_tensor.channels(); z++) { 111 | for (int y = 0; y < output_tensor.height(); y++) { 112 | for (int x = 0; x < output_tensor.width(); x++) { 113 | std::cout << output_tensor(x, y, z, c) << " "; 114 | } 115 | std::cout << std::endl; 116 | } 117 | } 118 | } 119 | return 0; 120 | } 121 | ``` -------------------------------------------------------------------------------- /doc/readme.md: -------------------------------------------------------------------------------- 1 | 2 | Documentation | 中文文档 (Chinese docs) 3 | 4 | -------------------------------------------------------------------------------- /doc/tutorials/01_AutoKernel开发环境快速入门.md: -------------------------------------------------------------------------------- 1 | # The AutoKernel development environment 2 | 3 | This tutorial covers how to install and configure the AutoKernel development environment and introduces the two components the project depends on: Tengine and Halide. To cut down on environment-setup problems, we currently provide a Docker image with the required base environment; more setup options will follow. 4 | 5 | - The AutoKernel development environment 6 | - [AutoKernel installation guide](#autokernel-installation-guide) 7 | - [Halide](#halide) 8 | - [Tengine](#tengine) 9 | ------------------- 10 | 11 | ## AutoKernel installation guide 12 | AutoKernel provides a Docker image that ships the whole development environment. 13 | 14 | - If you have not installed Docker yet, see the [official Docker install docs](https://docs.docker.com/engine/install/debian/). 15 | 16 | - If Docker is new to you, this beginner tutorial may help: [runoob Docker tutorial](https://www.runoob.com/docker/docker-hello-world.html) 17 | 18 | From here on we assume Docker is installed. 19 | 20 | 1. Pull the image (this may take a while -- roughly 10-20 minutes depending on your network; please be patient) 21 | ``` 22 | docker pull openailab/autokernel 23 | ``` 24 | 2. Create a container and enter the development environment 25 | ``` 26 | docker run -ti openailab/autokernel /bin/bash 27 | ``` 28 | You are now inside the Docker container: 29 | ``` 30 | root@39bfb5ea515d:/workspace# 31 | ```
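If you plan to edit code from the host, it can also be convenient to mount a host directory into the container when creating it (standard Docker, shown with a hypothetical host path):
```
docker run -ti -v /path/to/AutoKernel:/workspace/AutoKernel openailab/autokernel /bin/bash
```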
2. Create a container and enter the development environment:
```
docker run -ti openailab/autokernel /bin/bash
```
You are now inside the Docker container:
```
root@39bfb5ea515d:/workspace#
```
* Note: if you have already created a container, just start and attach to that container; changes you made earlier will not carry over into a newly created one.

List previously created containers. You can rename one with `docker container rename <old_name> <new_name>`; here ours is named `autokernel`:
```
$ docker container ls -a
CONTAINER ID   IMAGE                  COMMAND       CREATED        STATUS                      PORTS   NAMES
ff8b59212784   openailab/autokernel   "/bin/bash"   21 hours ago   Exited (255) 2 minutes ago          autokernel
```

Start the container:
```
docker start autokernel
```
Attach to it:
```
docker exec -ti autokernel /bin/bash
```
3. Halide and Tengine are pre-installed in the Docker image:
```
/workspace/Halide    # Halide
/workspace/Tengine   # Tengine
```

4. Clone the AutoKernel project:
```
git clone https://github.com/OAID/AutoKernel.git
```

At this point, all the environment files we need later are in place.

## Halide
Halide is a domain-specific language (DSL) that decouples the algorithm from the hardware backend. This project uses Halide's DSL and IR. Halide is pre-installed in the Docker image, with its Python API configured.

Halide's files live under `/workspace/Halide/`; the build output lives under `/workspace/Halide/halide-build`:

```
cd /workspace/Halide/halide-build
```
* The Halide headers are in `/workspace/Halide/halide-build/include`:
```
root@bd3faab0f079:/workspace/Halide/halide-build/include# ls

Halide.h                     HalideRuntimeHexagonDma.h
HalideBuffer.h               HalideRuntimeHexagonHost.h
HalidePyTorchCudaHelpers.h   HalideRuntimeMetal.h
HalidePyTorchHelpers.h       HalideRuntimeOpenCL.h
HalideRuntime.h              HalideRuntimeOpenGL.h
HalideRuntimeCuda.h          HalideRuntimeOpenGLCompute.h
HalideRuntimeD3D12Compute.h  HalideRuntimeQurt.h
```
* The compiled Halide library is under `/workspace/Halide/halide-build/src`; you can see `libHalide.so`:
```
root@bd3faab0f079:/workspace/Halide/halide-build/src# ls
CMakeFiles           autoschedulers       libHalide.so.10
CTestTestfile.cmake  cmake_install.cmake  libHalide.so.10.0.0
Makefile             libHalide.so         runtime
```
* Run a small Halide program:
```
cd /workspace/Halide/halide-build
./tutorial/lesson_01_basics
```
Output:
```
Success!
```
* Use Halide's Python API. First inspect Python's module search path:
```
python
>>> import sys
>>> sys.path
['', '/root', '/workspace/Halide/halide-build/python_bindings/src', '/usr/lib/python36.zip', '/usr/lib/python3.6', '/usr/lib/python3.6/lib-dynload', '/usr/local/lib/python3.6/dist-packages', '/usr/lib/python3/dist-packages']
```
The search path already contains the directory of Halide's compiled Python package, `/workspace/Halide/halide-build/python_bindings/src`:
```
python
>>> import halide
```
`import halide` works out of the box!
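To check the C++ side of the installation as well, you can compile a minimal pipeline against the headers and library listed above. The program below is a small sketch written for this tutorial, not a file from the repo, and the exact compile flags may need adjusting for your setup:
```
// check_halide.cpp -- a minimal JIT pipeline to verify libHalide works.
// A plausible build line inside the container:
//   g++ check_halide.cpp -std=c++11 -I/workspace/Halide/halide-build/include \
//       -L/workspace/Halide/halide-build/src -lHalide -lpthread -ldl -o check_halide
#include "Halide.h"
#include <cstdio>
using namespace Halide;

int main() {
    Var x("x"), y("y");
    Func f("f");
    f(x, y) = x + 10 * y;               // the same toy function used in tutorial 03
    Buffer<int> out = f.realize(3, 4);  // JIT-compile and evaluate over a 3x4 domain
    printf("f(2,3) = %d (expected 32)\n", out(2, 3));
    return 0;
}
```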
## Tengine
Tengine is a lightweight, high-performance deep neural network inference engine. This project builds its operator development and optimization work on Tengine.

Tengine is pre-installed in the Docker image; the files are under `/workspace/Tengine/`:
```
cd /workspace/Tengine/build
```
* The Tengine headers are in `/workspace/Tengine/build/install/include`:
```
root@bd3faab0f079:/workspace/Tengine/build/install/include# ls

tengine_c_api.h
tengine_cpp_api.h
```
* The compiled Tengine library is under `/workspace/Tengine/build/install/lib`; you can see `libtengine-lite.so`:
```
root@bd3faab0f079:/workspace/Tengine/build/install/lib# ls

libtengine-lite.so
```
* Run a small Tengine program.

This example benchmarks the performance of various network models with Tengine on the target machine:
```
cd /workspace/Tengine/benchmark
../build/benchmark/tm_benchmark
```
Output:
```
start to run register cpu allocator
loop_counts = 1
num_threads = 1
power = 0
tengine-lite library version: 1.0-dev
    squeezenet_v1.1  min =   32.74 ms   max =   32.74 ms   avg =   32.74 ms
        mobilenetv1  min =   31.33 ms   max =   31.33 ms   avg =   31.33 ms
        mobilenetv2  min =   35.55 ms   max =   35.55 ms   avg =   35.55 ms
        mobilenetv3  min =   37.65 ms   max =   37.65 ms   avg =   37.65 ms
       shufflenetv2  min =   10.93 ms   max =   10.93 ms   avg =   10.93 ms
           resnet18  min =   74.53 ms   max =   74.53 ms   avg =   74.53 ms
           resnet50  min =  175.55 ms   max =  175.55 ms   avg =  175.55 ms
          googlenet  min =  133.23 ms   max =  133.23 ms   avg =  133.23 ms
        inceptionv3  min =  298.22 ms   max =  298.22 ms   avg =  298.22 ms
              vgg16  min =  555.60 ms   max =  555.60 ms   avg =  555.60 ms
               mssd  min =   69.41 ms   max =   69.41 ms   avg =   69.41 ms
         retinaface  min =   13.14 ms   max =   13.14 ms   avg =   13.14 ms
        yolov3_tiny  min =  132.67 ms   max =  132.67 ms   avg =  132.67 ms
     mobilefacenets  min =   14.95 ms   max =   14.95 ms   avg =   14.95 ms
ALL TEST DONE
```
--------------------------------------------------------------------------------
/doc/tutorials/02_Tengine快速入门.md:
--------------------------------------------------------------------------------
# Tengine Quick Start

Tengine is a lightweight deep neural network inference engine. This document walks you through Tengine on an x86 Linux platform, using a classification model (SqueezeNet) as the example.

## The deep-learning inference workflow

**Concepts**

- `Neural network`: a neural network can be understood as a computation graph composed of operator nodes, such as Convolution, Pooling, and fully-connected (Fc) operators.

- `Neural network model`: a model produced by a deep-learning training framework (TensorFlow, Caffe, PyTorch, MXNet, etc.). A model carries two kinds of information:
  - the structure of the computation graph
  - the operators' weight data

**Workflow**

![inference](data/inference.png)

1. Load the model: obtain the network structure and the weights

2. Prepare the input data and feed it in

3. Run model inference

4. Fetch the output data

## Tengine SqueezeNet example
Following the inference workflow above, this example shows how to run SqueezeNet classification inference in Tengine.

1. Load the model
```cpp
/* load model */
graph_t graph = create_graph(NULL, "tengine", model_file);
```
`model_file` is a model file in Tengine format: "squeezenet.tmfile"

2. Prepare and feed the input data
```cpp
/* prepare input data */
tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
set_tensor_shape(input_tensor, dims, 4);
set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float));
```

3. Run model inference
```cpp
/* forward */
run_graph(graph, 1);
```
4. Fetch the output data
```cpp
/* get result */
tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
float* output_data = ( float* )get_tensor_buffer(output_tensor);
```
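Putting the four steps together: in a real program the calls above are bracketed by engine initialization, a one-time pre-run, and teardown, which the snippets omit. Condensed from the complete source file referenced below, the minimal call sequence is:
```cpp
init_tengine();                                             // engine init, before any graph call
graph_t graph = create_graph(NULL, "tengine", model_file);  // 1. load model

tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);    // 2. feed input
set_tensor_shape(input_tensor, dims, 4);
set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float));

prerun_graph_multithread(graph, opt);                       // allocate resources once
run_graph(graph, 1);                                        // 3. run inference

tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);  // 4. get result
float* output_data = ( float* )get_tensor_buffer(output_tensor);

postrun_graph(graph);                                       // teardown, in reverse order
destroy_graph(graph);
release_tengine();
```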
* Code:
  - The complete source file is [data/02_tengine_tutorial.cpp](data/02_tengine_tutorial.cpp); the code is short and clear.
  - It uses a few utility functions from [tengine_operations.h](https://github.com/OAID/Tengine/blob/tengine-lite/examples/common/tengine_operations.h).

* Build:
```
cd tutorials/data
cp -r /workspace/Tengine/examples/common .
mkdir build
cd build
cmake ..
make
```
* Run:
```
cd tutorials/data/build

# download the model and the test image
wget https://github.com/OAID/TengineModels/raw/main/images/cat.jpg
wget https://github.com/OAID/TengineModels/raw/main/tmfiles/squeezenet.tmfile
./02_tengine_tutorial
```
Result:
```
0.273198, 281
0.267550, 282
0.181006, 278
0.081798, 285
0.072406, 151
--------------------------------------
ALL TEST DONE
```
This is a classification network with 1000 classes (indices 0 to 999), each with a probability score; the run prints the top-5 scores and their indices.
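The top-5 printout comes from the `print_topk` helper used in the full source. For reference, a minimal top-k routine with the same behavior might look like the sketch below; the actual helper lives in Tengine's `tengine_operations.cpp` and may differ in detail:
```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// print the k highest scores and their class indices, highest first
void print_topk(const float* data, int size, int k)
{
    std::vector<std::pair<float, int>> scored;  // (score, class index)
    scored.reserve(size);
    for (int i = 0; i < size; i++)
        scored.push_back(std::make_pair(data[i], i));

    std::partial_sort(scored.begin(), scored.begin() + k, scored.end(),
                      [](const std::pair<float, int>& a, const std::pair<float, int>& b) {
                          return a.first > b.first;
                      });

    for (int i = 0; i < k; i++)
        printf("%f, %d\n", scored[i].first, scored[i].second);
}
```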
## More Tengine examples
More Tengine application examples live in [Tengine/examples](https://github.com/OAID/Tengine/tree/tengine-lite/examples):
- image classification
- facial landmark detection
- SSD object detection
- RetinaFace face detection
- YOLACT instance segmentation
- YOLOv3 object detection
- YOLOv4-tiny object detection
- OpenPose human pose estimation
- CRNN Chinese text recognition
--------------------------------------------------------------------------------
/doc/tutorials/03_Halide初体验.md:
--------------------------------------------------------------------------------
# A First Taste of Halide

Before digging into Halide, let's first experience a bit of its black magic.

Enter the AutoKernel Docker container (Halide's Python environment is already set up there) and run:
```
python data/03_halide_magic.py
```
which prints:
```
func_origin__ cost 0.510215 second
func_parallel cost 0.122265 second
```
The script evaluates a simple function, `func[x,y] = x + 10*y`, and compares the runtime of two versions:
- func_origin: the plain function
- func_parallel: the same function plus one Halide scheduling directive, `func.parallel(y,4)`, which parallelizes the y dimension with a parallelism of 4

As the result shows, the second version takes about a quarter of the time of the first.

This is the magic of Halide!

No low-level assembly expertise is needed: add one line of code and you get a respectable speedup.


## Halide language basics
To use Halide's scheduling directives, you first need enough of the Halide language to describe an operator's computation. The simple functions below demonstrate Halide's basic building blocks.

- `Var` (variable): think of it as a function's independent variable. To describe an image's pixels, for example, you need two variables, x and y, for the coordinates along the w and h dimensions.
- `Func` (function): like a mathematical function, it defines a computation. A complex computation can be decomposed into several smaller Funcs.

### Example 1
The function computed here is `func(x,y) = 10*y + x`. In Halide it reads:
* Python:
```python
import halide as hl

x, y = hl.Var("x"), hl.Var("y")
func = hl.Func("func")
func[x,y] = x + 10*y
```
* C++
```c++
#include "Halide.h"
using namespace Halide;

Var x("x"), y("y");
Func func("func");

func(x, y) = x + 10 * y;
```
`Func::realize` evaluates the function over the given domain and returns the numeric result. A Func is just-in-time compiled only when `realize` is called; before that, only the computation has been defined.

Inspect the result:

* Python:
```python
out = func.realize(3, 4) # width, height = 3,4

for j in range(out.height()):
    for i in range(out.width()):
        print("out[x=%i,y=%i]=%i"%(i,j,out[i,j]))
```
* C++
```c++
Buffer<int> out = func.realize(3, 4);

for (int j = 0; j < out.height(); j++) {
    for (int i = 0; i < out.width(); i++) {
        printf("out[x=%d,y=%d]=%d\n", i, j, out(i, j));
    }
}
```
The computed values are:
```
               width = 3
              x=0  x=1  x=2
             --------------
         y=0 |  0    1    2
height=4 y=1 | 10   11   12
         y=2 | 20   21   22
         y=3 | 30   31   32
```

The complete code is in [data/03_halide_basic.py](data/03_halide_basic.py) and can be run directly:
```
python data/03_halide_basic.py
```
You can also call `func.trace_stores()` to trace every value the function stores.
### Example 2
This example shows how to feed input data in and read output data out.
The complete code is in [data/03_halide_feed_data.py](data/03_halide_feed_data.py).

The function in this example:
```
B(x,y) = A(x,y) + 1
```
A is the input. You can define a Halide Buffer and feed a numpy array into it:
```python
# feed input
input_data = np.ones((4,4),dtype=np.uint8)
A = hl.Buffer(input_data)
```
Define the Func B:
```python
i,j = hl.Var("i"), hl.Var("j")
B = hl.Func("B")
B[i,j] = A[i,j] + 1
```
There are several ways to fetch the output:
```python
# 1
output = B.realize(4,4)
print("out: \n",np.asanyarray(output))
# 2
output = hl.Buffer(hl.UInt(8),[4,4])
B.realize(output)
print("out: \n",np.asanyarray(output))
# 3
output_data = np.empty(input_data.shape, dtype=input_data.dtype,order="F")
output = hl.Buffer(output_data)
B.realize(output)
print("out: \n",output_data)
```
You can run the complete code directly:
```
python data/03_halide_feed_data.py
```
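Example 1 showed both the Python and the C++ API; for completeness, the same feed-data flow in C++ looks roughly like the sketch below (written for this tutorial, not a file from the repo):
```c++
#include "Halide.h"
#include <cstdio>
using namespace Halide;

int main() {
    // feed input: allocate a buffer and fill it, like np.ones((4,4))
    Buffer<uint8_t> A(4, 4);
    A.fill(1);

    Var i("i"), j("j");
    Func B("B");
    B(i, j) = A(i, j) + 1;   // B(x,y) = A(x,y) + 1

    // realize into a pre-allocated buffer, like option 2 above
    Buffer<uint8_t> output(4, 4);
    B.realize(output);

    printf("out[0,0] = %d\n", output(0, 0));  // prints 2
    return 0;
}
```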
--------------------------------------------------------------------------------
/doc/tutorials/data/02_tengine_tutorial.cpp:
--------------------------------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

#include "tengine_c_api.h"
#include "tengine_operations.h"

const char* model_file = "squeezenet.tmfile";
const char* image_file = "cat.jpg";

using namespace std;

int main()
{
    // check files
    if(!check_file_exist(model_file) || !check_file_exist(image_file))
    {
        return -1;
    }

    int img_h = 227;
    int img_w = 227;
    float mean[3] = {104.007, 116.669, 122.679};
    float scale[3] = {1.f, 1.f, 1.f};

    /* set runtime options of Net */
    struct options opt;
    opt.num_thread = 1;
    opt.precision = TENGINE_MODE_FP32;
    opt.cluster = TENGINE_CLUSTER_ALL;

    /* load model */
    init_tengine();
    graph_t graph = create_graph(NULL, "tengine", model_file);

    /* prepare input data */
    int img_size = img_h * img_w * 3;
    int dims[] = {1, 3, img_h, img_w};
    float* input_data = ( float* )malloc(img_size * sizeof(float));
    get_input_data(image_file, input_data, img_h, img_w, mean, scale);
    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
    set_tensor_shape(input_tensor, dims, 4);
    set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float));

    /* forward */
    prerun_graph_multithread(graph, opt);
    run_graph(graph, 1);

    /* get result */
    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
    float* output_data = ( float* )get_tensor_buffer(output_tensor);
    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);

    /* after process */
    print_topk(output_data, output_size, 5);
    std::cout << "--------------------------------------\n";
    std::cout << "ALL TEST DONE\n";


    free(input_data);
    postrun_graph(graph);
    destroy_graph(graph);
    release_tengine();
    return 0;
}
--------------------------------------------------------------------------------
/doc/tutorials/data/03_halide_basic.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import halide as hl

x, y = hl.Var("x"), hl.Var("y")
func = hl.Func("func")

func[x,y] = x + 10*y
#func.trace_stores()

out = func.realize(3, 4) # width, height = 3,4

print("=============================")
for j in range(out.height()):
    for i in range(out.width()):
        print("out[x=%i,y=%i]=%i"%(i,j,out[i,j]))

print("Success!")
--------------------------------------------------------------------------------
/doc/tutorials/data/03_halide_feed_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import halide as hl
import numpy as np

w,h = 8,8

def addone():
    # feed input
    input_data = np.ones((4,4),dtype=np.uint8)
    A = hl.Buffer(input_data)

    i,j = hl.Var("i"), hl.Var("j")
    B = hl.Func("B")
    B[i,j] = A[i,j] + 1

    # output
    if 0:
        output = B.realize(4,4)
        print("out: \n",np.asanyarray(output))
    if 0:
        output = hl.Buffer(hl.UInt(8),[4,4])
        B.realize(output)
        print("out: \n",np.asanyarray(output))
    if 1:
        output_data = np.empty(input_data.shape, dtype=input_data.dtype,order="F")
        output = hl.Buffer(output_data)
        B.realize(output)
        print("out: \n",output_data)

addone()
--------------------------------------------------------------------------------
/doc/tutorials/data/03_halide_magic.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import halide as hl
import time

def time_me(fn):
    def _wrapper(*args, **kwargs):
        start = time.time()  # wall-clock seconds (time.clock() was removed in Python 3.8)
        fn(*args, **kwargs)
        print("%s cost %.6f second"%(fn.__name__, float(time.time() - start)))
    return _wrapper


x, y = hl.Var("x"), hl.Var("y")

@time_me
def func_origin__(w,h):
    func = hl.Func("func")
    func[x,y] = x + 10*y
    out = func.realize(w, h)

@time_me
def func_parallel(w,h):
    func = hl.Func("func")
    func[x,y] = x + 10*y
    func.parallel(y,4)
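    # the single schedule line above splits y into chunks of 4 and runs the
    # chunks on Halide's thread pool; wall-clock time drops roughly 4x here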
    func.realize(w,h)


func_origin__(400,400)
func_parallel(400,400)

'''
Results:
func_origin__ cost 0.510215 second
func_parallel cost 0.122265 second
'''

print("Success!")
--------------------------------------------------------------------------------
/doc/tutorials/data/04_test_relu.cpp:
--------------------------------------------------------------------------------
#include "HalideBuffer.h"
#include <iostream>
#include "halide_relu.h"

int main(int argc, char **argv)
{
    int C = 1, W = 4, H = 4, N = 1;
    Halide::Runtime::Buffer<float> input_tensor(nullptr, W, H, C, N);
    Halide::Runtime::Buffer<float> output_tensor(nullptr, W, H, C, N);
    input_tensor.allocate();
    output_tensor.allocate();
    input_tensor.for_each_value([](float &x) {
        x = 2.0 * rand() / RAND_MAX - 1.0;
    });

    output_tensor.for_each_value([](float &x) {
        x = 2.0 * rand() / RAND_MAX - 1.0;
    });

    halide_relu(input_tensor, 0, output_tensor);

    printf("input:\n");
    for (int c = 0; c < input_tensor.dim(3).extent(); c++) {
        for (int z = 0; z < input_tensor.channels(); z++) {
            for (int y = 0; y < input_tensor.height(); y++) {
                for (int x = 0; x < input_tensor.width(); x++) {
                    std::cout << input_tensor(x, y, z, c) << " ";
                }
                std::cout << "\n";
            }
        }
    }
    return 0;
}
--------------------------------------------------------------------------------
/doc/tutorials/data/06_build.sh:
--------------------------------------------------------------------------------
if [ $# -lt 1 ]; then
    echo "[usage]: ./build.sh <step> (step=1,2,..,7)"
    echo "e.g. execute step3: ./build.sh 3"
    exit
fi
STEP=$1
echo "step = " ${STEP}
./${EXE_FILE} ${STEP}
--------------------------------------------------------------------------------
/doc/tutorials/data/06_gemm_optimization.cpp:
--------------------------------------------------------------------------------
#include "Halide.h"
#include <cblas.h>
#include <cstdio>
#include <cstdlib>
#include <sys/time.h>
using namespace Halide;
unsigned long get_cur_time(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return (tv.tv_sec * 1000000 + tv.tv_usec);
}
#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
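// Benchmark harness: init() fills the test matrices, maxerr() checks the
// Halide result against an OpenBLAS reference (cblas_sgemm), and main()
// times repeat_count runs of whichever schedule step argv[1] selects.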
static inline float *init(int size, int mode)
{
    srand(0); //set rand_seed
    int i;
    float *m = (float *)malloc(size * sizeof(float));
    for (i = 0; i < size; ++i) {
        if (mode == 0)
            m[i] = 0;
        else if (mode == 1)
            m[i] = 1;
        else if (mode == 2)
            m[i] = i % 8;
        else if (mode == 3)
            m[i] = (float)(rand()%4);
        else
            m[i] = (float)rand() / RAND_MAX;
    }
    return m;
}
void maxerr(float* pred, float* gt, int h,int w)
{
    float maxError = 0.f;

    for(int i=0; i< (h*w); i++){
        maxError = MAX(( float )fabs(gt[i] - pred[i]), maxError);
    }
    // printf("====================================\n");
    printf("err %.2f\t", maxError);
    // printf("====================================\n");
}

int main(int argc, char **argv) {
    if(argc<2)
    {
        printf("[usage] exe [step] \n");
        return 1;
    }
    int M= 640;
    int N= 640;
    int K= 640;
    printf("M N K = %3d %3d %3d\t",M,N,K);
    int debug=0;
    int repeat_count=50;

    int step = atoi(argv[1]);

    float* a = init(M*K,4);
    float* b = init(N*K,4);
    float* c = init(M*N,1);
    float* ct = init(M*N,2);

    Buffer<float> A(a,K,M);
    Buffer<float> B(b,N,K);
    Buffer<float> C(c,N,M);

    Var x,y,xy;
    Var xi,yi,xo,yo,yii;
    RDom k(0, K);
    Func gemm("gemm");

    //1: default
    if (step==1)
    {
        gemm(x, y) += A(k, y) * B(x, k);
    }
    //2: tile
    if(step==2)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo);
    }
    //3: tile + vectorize
    if(step==3)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo)
            .vectorize(xi, 8);
    }
    //4: tile + vectorize + parallel
    if(step==4)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo)
            .vectorize(xi, 8)
            .parallel(yo);
    }
    //5: tile + vectorize + parallel + unroll
    if(step==5)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo)
            .vectorize(xi, 8)
            .parallel(yo)
            .unroll(xi)
            .unroll(yi,2);
    }
    //6: micro_kernel 4x16
    if(step==6)
    {
        Func prod;
        prod(x, y) += A(k, y) * B(x, k);
        gemm(x, y) = prod(x, y);

        gemm.tile(x, y, xi, yi, 16, 32)
            .fuse(x, y, xy).parallel(xy)
            .split(yi, yi, yii, 4)
            .vectorize(xi, 8)
            .unroll(xi)
            .unroll(yii);

        prod.compute_at(gemm, yi)
            .vectorize(x, 8).unroll(y);

        prod.update()
            .reorder(x, y, k)
            .vectorize(x, 8)
            .unroll(x)
            .unroll(y)
            .unroll(k, 2);
    }
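    // Step 7 below adds data packing on top of step 6: B is repacked into
    // 16-wide panels (Bs) so the micro-kernel reads it contiguously instead of
    // striding across full rows, improving cache-line reuse (see interleave.png).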
    // 7. interleave B
    if(step==7)
    {
        Func B_interleave("B"), Bs("Bs");
        Bs(x, y, xo) = B(xo * 16 + x, y);
        B_interleave(x, y) = Bs(x % 16, y, x / 16);

        Func prod;
        prod(x, y) += A(k, y) * B_interleave(x, k);
        gemm(x, y) = prod(x, y);

        gemm.tile(x, y, xi, yi, 16, 32)
            .fuse(x, y, xy).parallel(xy)
            .split(yi, yi, yii, 4)
            .vectorize(xi, 8)
            .unroll(xi)
            .unroll(yii);

        prod.compute_at(gemm, yi)
            .vectorize(x, 8).unroll(y);

        prod.update()
            .reorder(x, y, k)
            .vectorize(x, 8)
            .unroll(x)
            .unroll(y)
            .unroll(k, 2);

        Bs.compute_root()
            .split(y, yo, yi, 16)
            .reorder(x, yi, xo, yo)
            .unroll(x)
            .vectorize(yi).parallel(yo, 4);
    }
    gemm.output_buffer().dim(0).set_bounds(0, N).dim(1).set_bounds(0, M);
    gemm.realize(C);

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, a, K, b, N, 0, ct, N);
    maxerr(c, ct, M,N);
    if(debug)
    {
        for (int j = 0; j < C.height(); j++) {
            for (int i = 0; i < C.width(); i++) {
                printf("%.1f ",C(i,j));
            }
            printf("\n");
        }
    }


    unsigned long t0, t1;
    float totalTime = 0;
    for (int i = 0; i < repeat_count; i++)
    {
        t0 = get_cur_time();
        gemm.realize(C);
        t1 = get_cur_time();
        totalTime += ((float)(t1 - t0) / 1000.);
    }
    printf("[rep %d] autokernel | blas \t%.4f ms \t",repeat_count, totalTime / repeat_count);


    totalTime = 0;
    for (int i = 0; i < repeat_count; i++)
    {
        t0 = get_cur_time();
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, a, K, b, N, 0, ct, N);
        t1 = get_cur_time();
        totalTime += ((float)(t1 - t0) / 1000.);
    }
    printf("%.4f ms\n",totalTime / repeat_count);
    return 0;
}
--------------------------------------------------------------------------------
/doc/tutorials/data/CMakeLists.txt:
--------------------------------------------------------------------------------
project(tengine_tutorial)
cmake_minimum_required(VERSION 3.10)

# Tengine path
set(TENGINE_ROOT /workspace/Tengine)

include_directories(./common)
include_directories(${TENGINE_ROOT}/include)
link_directories(${TENGINE_ROOT}/build/install/lib)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(LINK_LIBS tengine-lite)
set(CMAKE_EXE_LINKER_FLAGS "-rdynamic -ldl")

add_executable(02_tengine_tutorial 02_tengine_tutorial.cpp common/tengine_operations.c)
target_link_libraries(02_tengine_tutorial ${LINK_LIBS})
--------------------------------------------------------------------------------
/doc/tutorials/data/gemm.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/gemm.jpg
--------------------------------------------------------------------------------
/doc/tutorials/data/inference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/inference.png
--------------------------------------------------------------------------------
/doc/tutorials/data/interleave.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/interleave.png
--------------------------------------------------------------------------------
/doc/tutorials/data/memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/memory.png
--------------------------------------------------------------------------------
/doc/tutorials/data/plugin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/plugin.png
--------------------------------------------------------------------------------
/doc/tutorials/data/step6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/step6.png
--------------------------------------------------------------------------------
/doc/tutorials/readme.md:
--------------------------------------------------------------------------------
# AutoKernel Tutorials

As AI technology advances, work that consumes large amounts of manual effort keeps getting automated, and developers' optimization work is no exception. Automation is the clear direction of travel, which makes mastering automation tools all the more important. Follow the AutoKernel tutorials and step into the field of automated optimization.

The AutoKernel tutorials are split into a beginner course and an advanced course; the beginner course is available now.

## Beginner course
After this course you will be able to:
- work comfortably in the AutoKernel development environment
- write basic Halide and use it to describe operators
- apply common scheduling strategies (Schedule)
- quickly integrate auto-optimized operators into the Tengine framework


Syllabus:
- 01: [AutoKernel development environment quick start](01_AutoKernel开发环境快速入门.md)
- 02: [Tengine quick start](02_Tengine快速入门.md)
- 03: [A first taste of Halide](03_Halide初体验.md)
- 04: [AutoKernel Plugin quick start](04_AutoKernel插件指南.md)
- 05: [Halide scheduling strategies (Schedule)](05_Halide调度策略Schedule.md)
- 06: [GEMM schedule optimization guide](06_GEMM调度策略优化指南.md)


Companion code:
- [02_tengine_tutorial.cpp](data/02_tengine_tutorial.cpp)
- [03_halide_basic.py](data/03_halide_basic.py)
- [03_halide_feed_data.py](data/03_halide_feed_data.py)
- [03_halide_magic.py](data/03_halide_magic.py)
- [04_test_relu.cpp](data/04_test_relu.cpp)
- [05_loop_schedule.py](data/05_loop_schedule.py)
- [06_gemm_optimization.cpp](data/06_gemm_optimization.cpp)

## Advanced course
(coming soon...)
--------------------------------------------------------------------------------