├── .gitignore ├── AutoSearch ├── CMakeLists.txt ├── README.md ├── generator │ └── batch_matmul.cpp ├── include │ └── utils.h ├── src │ ├── CMakeLists.txt │ ├── adams2019 │ │ ├── ASLog.cpp │ │ ├── ASLog.h │ │ ├── AutoSchedule.cpp │ │ ├── AutoSchedule.h │ │ ├── CMakeLists.txt │ │ ├── CostModel.h │ │ ├── DefaultCostModel.cpp │ │ ├── DefaultCostModel.h │ │ ├── Featurization.h │ │ ├── FunctionDAG.cpp │ │ ├── FunctionDAG.h │ │ ├── LoopNest.cpp │ │ ├── LoopNest.h │ │ ├── Makefile │ │ ├── NetworkSize.h │ │ ├── PerfectHashMap.h │ │ ├── Weights.cpp │ │ ├── Weights.h │ │ ├── autotune_loop.sh │ │ ├── baseline.weights │ │ ├── cost_model_generator.cpp │ │ ├── cost_model_schedule.h │ │ ├── demo_generator.cpp │ │ ├── featurization_to_sample.cpp │ │ ├── get_host_target.cpp │ │ ├── included_schedule_file.schedule.h │ │ ├── included_schedule_file_generator.cpp │ │ ├── retrain_cost_model.cpp │ │ ├── test.cpp │ │ ├── test_function_dag.cpp │ │ ├── test_perfect_hash_map.cpp │ │ ├── updated.weights │ │ └── weightsdir_to_weightsfile.cpp │ ├── common │ │ ├── BoundEstimate.h │ │ ├── CMakeLists.txt │ │ ├── DataOP.h │ │ ├── DataTransform.h │ │ ├── Errors.h │ │ ├── HalidePlugin.h │ │ ├── binary2cpp.cpp │ │ └── cmdline.h │ ├── li2018 │ │ ├── CMakeLists.txt │ │ ├── GradientAutoscheduler.cpp │ │ ├── Makefile │ │ ├── README.md │ │ ├── demo_generator.cpp │ │ ├── test.cpp │ │ └── test.py │ ├── mullapudi2016 │ │ ├── AutoSchedule.cpp │ │ ├── CMakeLists.txt │ │ └── Makefile │ └── sioutas2020 │ │ ├── AutoSchedule.cpp │ │ ├── CMakeLists.txt │ │ ├── Makefile │ │ └── test.cpp └── toolkit │ ├── RunGen.h │ ├── RunGenMain.cpp │ ├── shape_config.py │ ├── template │ ├── demo_eval.cpp │ ├── demo_run.cpp │ └── gen.cpp │ ├── tools.py │ └── utils.py ├── Dockerfile ├── Dockerfile.cpu ├── Dockerfile.cuda └── Dockerfile.opencl ├── LICENSE ├── README.md ├── README_CN.md ├── auto_deploy ├── README.md ├── c_source │ └── main.cpp ├── data │ ├── 0.jpg │ ├── 3.jpg │ ├── 6.jpg │ ├── auto-deploy.png │ ├── input_6.bin │ ├── main_head │ ├── mnist-8.onnx │ ├── mnist.weights │ └── reg_str ├── generated_op.py ├── graph.py ├── graph_tutorial.ipynb ├── mnist.py ├── op_build.sh ├── op_codegen.py ├── op_gen.cpp ├── op_generator.py └── pass_manager.py ├── autokernel_plugin ├── .gitignore ├── CMakeLists.txt ├── common │ └── GenGen.cpp ├── images │ └── cat.jpg ├── include │ ├── Halide.h │ ├── HalideBuffer.h │ └── HalideRuntime.h ├── models │ ├── squeezenet.tmfile │ ├── synset2015.txt │ └── synset_words.txt ├── scripts │ ├── clean.sh │ ├── generate.sh │ └── register_op.sh ├── src │ ├── CMakeLists.txt │ ├── depthwise │ │ ├── build.sh │ │ ├── depthwise.cpp │ │ ├── depthwise.h │ │ └── depthwise_gen.cc │ ├── direct_conv │ │ ├── build.sh │ │ ├── direct_conv.cpp │ │ ├── direct_conv.h │ │ └── direct_conv_gen.cc │ ├── fc │ │ ├── build.sh │ │ ├── fc.cpp │ │ ├── fc.h │ │ └── fc_gen.cc │ ├── im2col_conv │ │ ├── build.sh │ │ ├── im2col_conv.cpp │ │ ├── im2col_conv.h │ │ └── im2col_conv_gen.cc │ ├── normalize │ │ ├── build.sh │ │ ├── normalize.cpp │ │ ├── normalize.h │ │ └── normalize_gen.cc │ ├── plugin_init.cpp │ ├── pool │ │ ├── avepool_gen.cc │ │ ├── build.sh │ │ ├── maxpool_gen.cc │ │ ├── pool.cpp │ │ └── pool.h │ └── softmax │ │ ├── build.sh │ │ ├── softmax.cpp │ │ ├── softmax.h │ │ └── softmax_gen.cc ├── template │ ├── build.sh │ ├── generator.cc │ ├── template.cpp │ └── template.h └── tests │ ├── CMakeLists.txt │ ├── common │ ├── alphabeta.hpp │ ├── common.hpp │ ├── stb_image.h │ ├── stb_image_write.h │ ├── tengine_operations.cpp │ ├── tengine_operations.h │ └── 
utils.hpp │ ├── test_conv.cpp │ ├── test_depthwise.cpp │ ├── test_fc.cpp │ ├── test_normalize.cpp │ ├── test_pool.cpp │ ├── test_softmax.cpp │ └── tm_classification.cpp └── doc ├── add_op.png ├── architecture-en.png ├── architecture.png ├── how_to_add_op.md ├── logo.png ├── readme.md └── tutorials ├── 01_AutoKernel开发环境快速入门.md ├── 02_Tengine快速入门.md ├── 03_Halide初体验.md ├── 04_AutoKernel插件指南.md ├── 05_Halide调度策略Schedule.md ├── 06_GEMM调度策略优化指南.md ├── data ├── 02_tengine_tutorial.cpp ├── 03_halide_basic.py ├── 03_halide_feed_data.py ├── 03_halide_magic.py ├── 04_test_relu.cpp ├── 05_loop_schedule.py ├── 06_build.sh ├── 06_gemm_optimization.cpp ├── CMakeLists.txt ├── gemm.jpg ├── inference.png ├── interleave.png ├── memory.png ├── plugin.png └── step6.png └── readme.md /.gitignore: -------------------------------------------------------------------------------- 1 | *.s 2 | .vscode 3 | build 4 | -------------------------------------------------------------------------------- /AutoSearch/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | cmake_minimum_required(VERSION 3.18) 3 | project(AutoSearch) 4 | add_subdirectory(src) -------------------------------------------------------------------------------- /AutoSearch/README.md: -------------------------------------------------------------------------------- 1 | For more documentation, see: 2 | 3 | - [AutoSearch docs (Chinese)](https://autokernel-docs-en.readthedocs.io/zh_CN/latest/tutorials/autosearch.html) 4 | - [AutoSearch docs (English)](https://autokernel-docs-en.readthedocs.io/en/latest/tutorials/autosearch.html) 5 | -------------------------------------------------------------------------------- /AutoSearch/generator/batch_matmul.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include <iostream> 3 | #include "utils.h" 4 | 5 | /* 6 | C(x,y) += A(k,y) * B(x,k) 7 | 8 | ____N____ 9 | K | B | 10 | |_______| 11 | 12 | __K___ ___N___ 13 | | | | | 14 | M| A | M| C | 15 | |_____| |_______| 16 | 17 | */ 18 | namespace { 19 | 20 | class BatchMatmul : public Halide::Generator<BatchMatmul> { 21 | public: 22 | std::vector<int> args = GetArgsFromEnv(); 23 | int i = 0; 24 | const int B = args[i++]; 25 | const int M = args[i++]; 26 | const int N = args[i++]; 27 | const int K = args[i++]; 28 | 29 | // const int B = 1; 30 | // const int M = 1024; 31 | // const int N = 1024; 32 | // const int K = 1024; 33 | 34 | Input<Buffer<float>> input_a{"input_a", 3}; // (dim0,dim1,dim2) = (width,height,batch) = (K,M,B) 35 | Input<Buffer<float>> input_b{"input_b", 3}; // (dim0,dim1,dim2) = (width,height,batch) = (N,K,B) 36 | Output<Buffer<float>> output{"output", 3}; // (dim0,dim1,dim2) = (width,height,batch) = (N,M,B) 37 | 38 | void generate() { 39 | Var x("x"), y("y"), b("b"); 40 | RDom k(0, K); 41 | Func prod("prod"); 42 | 43 | // Algorithm 44 | prod(x, y, b) = 0.0f; 45 | prod(x, y, b) += input_a(k, y, b) * input_b(x, k, b); 46 | output(x, y, b) = prod(x, y, b); 47 | 48 | if (!auto_schedule) { 49 | Var xi("xi"), yi("yi"), xii("xii"), yii("yii"), xt("xt"), yt("yt"), xy("xy"); 50 | 51 | if (get_target().has_gpu_feature()) 52 | { 53 | // manual GPU schedule 54 | output.tile(x, y, xi, yi, 8, 8) 55 | .unroll(xi) 56 | .unroll(yi) 57 | .gpu_tile(x, y, xt, yt, 2, 2); 58 | 59 | prod.compute_at(output, x) 60 | .gpu_threads(x, y) 61 | .update() 62 | .gpu_threads(x, y); 63 | } 64 | else 65 | { 66 | // manual CPU schedule 67 | output.tile(x, y, xi, yi, 16, 32) 68 | .fuse(x, y, xy).parallel(xy) 69 | .split(yi, yi, yii, 4) 70 | .vectorize(xi, 8) 71 | .unroll(xi) 72 | 
.unroll(yii); 73 | 74 | prod.compute_at(output, yi) 75 | .vectorize(x, 8).unroll(y); 76 | 77 | prod.update() 78 | .reorder(x, y, k) 79 | .vectorize(x, 8) 80 | .unroll(x) 81 | .unroll(y) 82 | .unroll(k, 2); 83 | } 84 | } 85 | 86 | output.bound(x, 0, N) 87 | .bound(y, 0, M) 88 | .bound(b, 0, B); 89 | 90 | input_a.dim(0).set_bounds(0, K).set_stride(1) 91 | .dim(1).set_bounds(0, M).set_stride(K) 92 | .dim(2).set_bounds(0, B).set_stride(K * M); 93 | 94 | input_b.dim(0).set_bounds(0, N).set_stride(1) 95 | .dim(1).set_bounds(0, K).set_stride(N) 96 | .dim(2).set_bounds(0, B).set_stride(N * K); 97 | 98 | output.dim(0).set_bounds(0, N).set_stride(1) 99 | .dim(1).set_bounds(0, M).set_stride(N) 100 | .dim(2).set_bounds(0, B).set_stride(M * N); 101 | 102 | } 103 | }; 104 | 105 | } // namespace 106 | 107 | HALIDE_REGISTER_GENERATOR(BatchMatmul, matmul) -------------------------------------------------------------------------------- /AutoSearch/include/utils.h: -------------------------------------------------------------------------------- 1 | #include <cstddef> 2 | #include <cstdlib> 3 | #include <iostream> 4 | #include <string> 5 | #include <vector> 6 | 7 | inline int GetArg(const std::vector<int> &args, size_t index, int default_value = 0) { 8 | return index < args.size() ? args[index] : default_value; 9 | } 10 | 11 | inline std::vector<int> GetArgsFromEnv() { 12 | std::vector<int> ret; 13 | if (const char* env_p = std::getenv("HL_APP_ARGS")) { 14 | std::string val(env_p); 15 | size_t offset = 0; 16 | auto pos = val.find(',', offset); 17 | while (pos != std::string::npos) { 18 | ret.push_back(std::stoi(val.substr(offset, pos - offset))); 19 | offset = pos + 1; 20 | pos = val.find(',', offset); 21 | } 22 | ret.push_back(std::stoi(val.substr(offset, val.size() - offset))); 23 | } else { 24 | std::cerr << "Cannot load arguments from environment variable HL_APP_ARGS" << std::endl; 25 | exit(-1); 26 | } 27 | return ret; 28 | } 29 | 30 | inline double benchmark(); 31 | -------------------------------------------------------------------------------- /AutoSearch/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Ensure that plugins export only what is needed to load them. 2 | # Everything else should be omitted to keep binary size low. 
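# (A note on how the generators above receive their shapes: utils.h parses the
# HL_APP_ARGS environment variable as a comma-separated list of integers.
# A hypothetical invocation of the batch_matmul generator - the binary name and
# output directory are assumptions, the flags are standard Halide generator
# flags - would look like:
#   HL_APP_ARGS="1,512,512,512" ./matmul.generator -g matmul -o out target=host
# with the four values mapped, in order, to B, M, N, K.)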
3 | 4 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS OFF) 5 | set(CMAKE_CXX_VISIBILITY_PRESET hidden) 6 | set(CMAKE_VISIBILITY_INLINES_HIDDEN YES) 7 | find_package(Halide REQUIRED) 8 | add_executable(binary2cpp common/binary2cpp.cpp) 9 | function(add_autoscheduler) 10 | set(options) 11 | set(oneValueArgs NAME) 12 | set(multiValueArgs SOURCES) 13 | cmake_parse_arguments("arg" "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) 14 | #message(STATUS "arg name:${arg_NAME} arg:source:${arg_SOURCES}") 15 | add_library(Halide_${arg_NAME} MODULE ${arg_SOURCES}) 16 | add_library(AutoSchedule::${arg_NAME} ALIAS Halide_${arg_NAME}) 17 | 18 | target_compile_definitions(Halide_${arg_NAME} PRIVATE Halide_EXPORTS) 19 | target_link_libraries(Halide_${arg_NAME} PRIVATE Halide::Plugin) 20 | 21 | string(TOLOWER "${arg_NAME}" name_lower) 22 | set_target_properties(Halide_${arg_NAME} PROPERTIES 23 | EXPORT_NAME ${arg_NAME} 24 | OUTPUT_NAME autoschedule_${name_lower}) 25 | endfunction() 26 | 27 | add_subdirectory(common) 28 | 29 | add_subdirectory(adams2019) 30 | add_subdirectory(li2018) 31 | add_subdirectory(mullapudi2016) 32 | add_subdirectory(sioutas2020) 33 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/ASLog.cpp: -------------------------------------------------------------------------------- 1 | #include "ASLog.h" 2 | 3 | namespace Halide { 4 | namespace Internal { 5 | 6 | namespace { 7 | 8 | std::string get_env_variable(char const *env_var_name) { 9 | if (!env_var_name) { 10 | return ""; 11 | } 12 | 13 | #ifdef _MSC_VER 14 | // call getenv_s without a buffer to determine the correct string length: 15 | size_t length = 0; 16 | if ((getenv_s(&length, NULL, 0, env_var_name) != 0) || (length == 0)) { 17 | return ""; 18 | } 19 | // call it again to retrieve the value of the environment variable; 20 | // note that 'length' already accounts for the null-terminator 21 | std::string lvl(length - 1, '@'); 22 | size_t read = 0; 23 | if ((getenv_s(&read, &lvl[0], length, env_var_name) != 0) || (read != length)) { 24 | return ""; 25 | } 26 | return lvl; 27 | #else 28 | char *lvl = getenv(env_var_name); 29 | if (lvl) return std::string(lvl); 30 | #endif 31 | 32 | return ""; 33 | } 34 | 35 | } // namespace 36 | 37 | int aslog::aslog_level() { 38 | static int cached_aslog_level = ([]() -> int { 39 | // If HL_DEBUG_AUTOSCHEDULE is defined, use that value. 40 | std::string lvl = get_env_variable("HL_DEBUG_AUTOSCHEDULE"); 41 | if (!lvl.empty()) { 42 | return atoi(lvl.c_str()); 43 | } 44 | // Otherwise, use HL_DEBUG_CODEGEN. 45 | lvl = get_env_variable("HL_DEBUG_CODEGEN"); 46 | return !lvl.empty() ? 
atoi(lvl.c_str()) : 0; 47 | })(); 48 | return cached_aslog_level; 49 | } 50 | 51 | } // namespace Internal 52 | } // namespace Halide 53 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/ASLog.h: -------------------------------------------------------------------------------- 1 | #ifndef ASLOG_H 2 | #define ASLOG_H 3 | 4 | // This class is used by train_cost_model, which doesn't link to 5 | // libHalide, so (despite the namespace) we are better off not 6 | // including Halide.h, lest we reference something we won't have available 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace Halide { 13 | namespace Internal { 14 | 15 | class aslog { 16 | const bool logging; 17 | 18 | public: 19 | aslog(int verbosity) 20 | : logging(verbosity <= aslog_level()) { 21 | } 22 | 23 | template 24 | aslog &operator<<(T &&x) { 25 | if (logging) { 26 | std::cerr << std::forward(x); 27 | } 28 | return *this; 29 | } 30 | 31 | static int aslog_level(); 32 | }; 33 | 34 | } // namespace Internal 35 | } // namespace Halide 36 | 37 | #endif 38 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/AutoSchedule.h: -------------------------------------------------------------------------------- 1 | #include "CostModel.h" 2 | #include "Featurization.h" 3 | #include "FunctionDAG.h" 4 | #include "Halide.h" 5 | #include "PerfectHashMap.h" 6 | #include 7 | 8 | namespace Halide { 9 | namespace Internal { 10 | namespace Autoscheduler { 11 | 12 | typedef PerfectHashMap StageMapOfScheduleFeatures; 13 | 14 | void find_and_apply_schedule(FunctionDAG &dag, const std::vector &outputs, const MachineParams ¶ms, 15 | CostModel *cost_model, int beam_size, StageMapOfScheduleFeatures *schedule_features); 16 | 17 | } // namespace Autoscheduler 18 | } // namespace Internal 19 | } // namespace Halide 20 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ## 2 | # Resources for the autoscheduler library 3 | ## 4 | 5 | # weights 6 | set(WF_CPP baseline.cpp) 7 | configure_file(baseline.weights baseline.weights COPYONLY) 8 | add_custom_command(OUTPUT ${WF_CPP} 9 | COMMAND binary2cpp baseline_weights < baseline.weights > ${WF_CPP} 10 | DEPENDS baseline.weights binary2cpp 11 | VERBATIM) 12 | 13 | # cost_model, train_cost_model 14 | add_executable(cost_model.generator cost_model_generator.cpp) 15 | target_link_libraries(cost_model.generator PRIVATE Halide::Generator) 16 | 17 | add_halide_library(cost_model FROM cost_model.generator 18 | TARGETS cmake) 19 | add_halide_library(train_cost_model FROM cost_model.generator 20 | TARGETS cmake 21 | USE_RUNTIME cost_model.runtime) 22 | 23 | # retrain_cost_model 24 | add_executable(retrain_cost_model 25 | ASLog.cpp 26 | DefaultCostModel.cpp 27 | Weights.cpp 28 | retrain_cost_model.cpp 29 | ${WF_CPP}) 30 | target_link_libraries(retrain_cost_model PRIVATE cost_model train_cost_model Halide::Halide Halide::Plugin) 31 | 32 | ## 33 | # Main autoscheduler library 34 | ## 35 | 36 | add_autoscheduler(NAME Adams2019 37 | SOURCES 38 | ASLog.cpp 39 | AutoSchedule.cpp 40 | DefaultCostModel.cpp 41 | FunctionDAG.cpp 42 | LoopNest.cpp 43 | Weights.cpp 44 | ${WF_CPP}) 45 | 46 | target_link_libraries(Halide_Adams2019 PRIVATE cost_model train_cost_model) 47 | 48 | ## 49 | # Tests and demos 50 | # TODO(#4053): move these to a separate folder since 
they're tests. 51 | ## 52 | 53 | # ================================================================= 54 | 55 | add_executable(demo.generator demo_generator.cpp) 56 | target_link_libraries(demo.generator PRIVATE Halide::Generator) 57 | 58 | add_halide_library(demo FROM demo.generator 59 | TARGETS cmake 60 | AUTOSCHEDULER AutoSchedule::Adams2019 61 | REGISTRATION DEMO_REGISTRATION_FILE) 62 | 63 | add_executable(demo_apps_autoscheduler ${DEMO_REGISTRATION_FILE}) 64 | target_link_libraries(demo_apps_autoscheduler PRIVATE demo Halide::RunGenMain) 65 | 66 | add_test(NAME demo_apps_autoscheduler 67 | COMMAND demo_apps_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) 68 | 69 | set_tests_properties(demo_apps_autoscheduler 70 | PROPERTIES 71 | LABELS Adams2019 72 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 73 | 74 | # ================================================================= 75 | 76 | add_executable(included_schedule_file.generator included_schedule_file_generator.cpp) 77 | target_link_libraries(included_schedule_file.generator PRIVATE Halide::Generator) 78 | 79 | add_halide_library(included_schedule_file FROM included_schedule_file.generator 80 | TARGETS cmake 81 | AUTOSCHEDULER AutoSchedule::Adams2019 82 | REGISTRATION included_schedule_reg) 83 | 84 | add_executable(demo_included_schedule_file ${included_schedule_reg}) 85 | target_link_libraries(demo_included_schedule_file PRIVATE included_schedule_file Halide::RunGenMain) 86 | 87 | add_test(NAME demo_included_schedule_file 88 | COMMAND demo_included_schedule_file --benchmarks=all --benchmark_min_time=1 --estimate_all) 89 | 90 | set_tests_properties(demo_included_schedule_file 91 | PROPERTIES 92 | LABELS Adams2019 93 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 94 | 95 | # ==================================================== 96 | # Auto-tuning support utilities. 97 | # TODO(#4053): implement auto-tuning support in CMake? 
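# (Until then, autotuning is driven by autotune_loop.sh in this directory.
# A sketch of that flow, not a CMake integration: it repeatedly builds the
# generator with randomized schedules, benchmarks each result, fuses the
# emitted .featurization with its measured runtime via featurization_to_sample,
# and periodically retrains the weights with retrain_cost_model, feeding the
# improved cost model back into the search.)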
98 | 99 | add_executable(featurization_to_sample featurization_to_sample.cpp) 100 | 101 | add_executable(get_host_target get_host_target.cpp) 102 | target_link_libraries(get_host_target PRIVATE Halide::Halide) 103 | 104 | add_executable(weightsdir_to_weightsfile weightsdir_to_weightsfile.cpp Weights.cpp) 105 | target_link_libraries(weightsdir_to_weightsfile PRIVATE Halide::Runtime) 106 | 107 | # ================================================================= 108 | # Smaller tests 109 | 110 | if (BUILD_SHARED_LIBS) 111 | add_executable(test_apps_autoscheduler test.cpp) 112 | target_link_libraries(test_apps_autoscheduler PRIVATE Halide::Halide Halide::Tools ${CMAKE_DL_LIBS}) 113 | 114 | add_test(NAME test_apps_autoscheduler 115 | COMMAND test_apps_autoscheduler $) 116 | 117 | set_tests_properties(test_apps_autoscheduler PROPERTIES 118 | LABELS Adams2019 119 | ENVIRONMENT "LD_LIBRARY_PATH=$;HL_TARGET=${Halide_TARGET}") 120 | endif () 121 | 122 | ## 123 | 124 | add_executable(test_perfect_hash_map test_perfect_hash_map.cpp) 125 | 126 | add_test(NAME test_perfect_hash_map COMMAND test_perfect_hash_map) 127 | set_tests_properties(test_perfect_hash_map 128 | PROPERTIES 129 | LABELS Adams2019 130 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 131 | 132 | ## 133 | 134 | add_executable(test_function_dag test_function_dag.cpp FunctionDAG.cpp ASLog.cpp) 135 | target_link_libraries(test_function_dag PRIVATE Halide::Halide Halide::Tools Halide::Plugin) 136 | 137 | add_test(NAME test_function_dag COMMAND test_function_dag) 138 | set_tests_properties(test_function_dag 139 | PROPERTIES 140 | LABELS Adams2019 141 | ENVIRONMENT "HL_TARGET=${Halide_TARGET}") 142 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/CostModel.h: -------------------------------------------------------------------------------- 1 | #ifndef COST_MODEL_H 2 | #define COST_MODEL_H 3 | 4 | #include 5 | 6 | #include "FunctionDAG.h" 7 | #include "HalideBuffer.h" 8 | #include "PerfectHashMap.h" 9 | 10 | // An abstract base class for a cost model. 11 | namespace Halide { 12 | 13 | namespace Internal { 14 | namespace Autoscheduler { 15 | typedef PerfectHashMap StageMapOfScheduleFeatures; 16 | } // namespace Autoscheduler 17 | } // namespace Internal 18 | 19 | class CostModel { 20 | public: 21 | virtual ~CostModel() = default; 22 | 23 | // Configure the cost model for the algorithm to be scheduled. 24 | virtual void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, 25 | const MachineParams ¶ms) = 0; 26 | 27 | // Enqueue a schedule to be evaluated. Will annotate the value located at cost_ptr when the evaluation takes place. 28 | // Note that the dag argument should correspond to the dag specified previously when calling set_pipeline_features. 29 | virtual void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, 30 | const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, 31 | double *cost_ptr) = 0; 32 | 33 | // Evaluate all schedules in the queue. 34 | virtual void evaluate_costs() = 0; 35 | 36 | // Discard all schedules in the queue. 
37 | virtual void reset() = 0; 38 | }; 39 | 40 | } // namespace Halide 41 | 42 | #endif // COST_MODEL_H 43 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/DefaultCostModel.h: -------------------------------------------------------------------------------- 1 | #ifndef DEFAULT_COST_MODEL_H 2 | #define DEFAULT_COST_MODEL_H 3 | 4 | #include "CostModel.h" 5 | #include "Weights.h" 6 | #include <memory> 7 | 8 | namespace Halide { 9 | 10 | class DefaultCostModel : public CostModel { 11 | private: 12 | Internal::Weights weights; 13 | Runtime::Buffer<float> schedule_feat_queue, pipeline_feat_queue, costs; 14 | Runtime::Buffer<double *> cost_ptrs; 15 | int cursor, num_stages, num_cores; 16 | 17 | const std::string weights_in_path, weights_out_path; 18 | const bool randomize_weights; 19 | 20 | Runtime::Buffer<float> 21 | head1_filter_update, head1_bias_update, 22 | head2_filter_update, head2_bias_update, 23 | conv1_filter_update, conv1_bias_update; 24 | int timestep = 0; 25 | 26 | public: 27 | DefaultCostModel(const std::string &weights_in_path, 28 | const std::string &weights_out_path, 29 | bool randomize_weights) 30 | : weights_in_path(weights_in_path), 31 | weights_out_path(weights_out_path), 32 | randomize_weights(randomize_weights) { 33 | 34 | load_weights(); 35 | } 36 | ~DefaultCostModel() override = default; 37 | 38 | // Configure the cost model for the algorithm to be scheduled. 39 | void set_pipeline_features(const Internal::Autoscheduler::FunctionDAG &dag, 40 | const MachineParams &params) override; 41 | void set_pipeline_features(const Runtime::Buffer<float> &, int n); 42 | 43 | // Enqueue a schedule to be evaluated. The second version of this method returns a buffer of 44 | // schedule_features that should be filled in by the caller. 45 | void enqueue(const Internal::Autoscheduler::FunctionDAG &dag, 46 | const Halide::Internal::Autoscheduler::StageMapOfScheduleFeatures &schedule_feats, 47 | double *cost_ptr) override; 48 | void enqueue(int ns, Runtime::Buffer<float> *schedule_feats, double *cost_ptr); 49 | 50 | // Evaluate all schedules in the queue. 51 | void evaluate_costs() override; 52 | 53 | // Discard all schedules in the queue. 54 | void reset() override; 55 | 56 | // Update model weights using true measured runtimes. 57 | float backprop(const Runtime::Buffer<const float> &true_runtimes, float learning_rate); 58 | 59 | // Save/Load the model weights to/from disk. 60 | void save_weights(); 61 | void load_weights(); 62 | }; 63 | 64 | std::unique_ptr<DefaultCostModel> make_default_cost_model(const std::string &weights_in_dir = "", 65 | const std::string &weights_out_dir = "", 66 | bool randomize_weights = false); 67 | } // namespace Halide 68 | 69 | #endif // DEFAULT_COST_MODEL_H 70 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/NetworkSize.h: -------------------------------------------------------------------------------- 1 | #ifndef HALIDE_NETWORK_SIZE_H 2 | #define HALIDE_NETWORK_SIZE_H 3 | 4 | namespace Halide { 5 | // The size of the best cost model network found. Needed by the cost 6 | // model and also the cost model training script. 
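// (These constants pin the sizes of the two feature-embedding heads and the
// shared conv trunk of the cost model; Weights.h uses them to shape its
// filter/bias buffers, so any serialized .weights file must have been
// produced with the same values.)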
7 | const int head1_channels = 8, head1_w = 40, head1_h = 7; 8 | const int head2_channels = 24, head2_w = 39; 9 | const int conv1_channels = 32; 10 | } // namespace Halide 11 | 12 | #endif // HALIDE_NETWORK_SIZE_H 13 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/Weights.h: -------------------------------------------------------------------------------- 1 | #ifndef _WEIGHTS 2 | #define _WEIGHTS 3 | 4 | #include <cstdint> 5 | #include <iostream> 6 | #include <string> 7 | 8 | #include "Featurization.h" 9 | #include "HalideBuffer.h" 10 | #include "NetworkSize.h" 11 | 12 | namespace Halide { 13 | namespace Internal { 14 | 15 | struct Weights { 16 | uint32_t pipeline_features_version = PipelineFeatures::version(); 17 | uint32_t schedule_features_version = ScheduleFeatures::version(); 18 | 19 | Halide::Runtime::Buffer<float> head1_filter{head1_channels, head1_w, head1_h}; 20 | Halide::Runtime::Buffer<float> head1_bias{head1_channels}; 21 | 22 | Halide::Runtime::Buffer<float> head2_filter{head2_channels, head2_w}; 23 | Halide::Runtime::Buffer<float> head2_bias{head2_channels}; 24 | 25 | Halide::Runtime::Buffer<float> conv1_filter{conv1_channels, head1_channels + head2_channels}; 26 | Halide::Runtime::Buffer<float> conv1_bias{conv1_channels}; 27 | 28 | template<typename F> 29 | void for_each_buffer(F f) { 30 | f(head1_filter); 31 | f(head1_bias); 32 | f(head2_filter); 33 | f(head2_bias); 34 | f(conv1_filter); 35 | f(conv1_bias); 36 | } 37 | 38 | void randomize(uint32_t seed); 39 | 40 | bool load(std::istream &i); 41 | bool save(std::ostream &o) const; 42 | 43 | bool load_from_file(const std::string &filename); 44 | bool save_to_file(const std::string &filename) const; 45 | 46 | // Load/save from the 'classic' form of six raw data files 47 | bool load_from_dir(const std::string &dir); 48 | bool save_to_dir(const std::string &dir) const; 49 | }; 50 | 51 | } // namespace Internal 52 | } // namespace Halide 53 | 54 | #endif // _WEIGHTS 55 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/baseline.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/AutoSearch/src/adams2019/baseline.weights -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/demo_generator.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include <iostream> 3 | //#include "utils.h" 4 | 5 | namespace { 6 | 7 | class BatchMatmul : public Halide::Generator<BatchMatmul> { 8 | public: 9 | //std::vector<int> args = GetArgsFromEnv(); 10 | int i = 0; 11 | //const int B = args[i++]; 12 | //const int N = args[i++]; 13 | //const int M = args[i++]; 14 | //const int K = args[i++]; 15 | const int B = 1; 16 | const int N = 1024; 17 | const int M = 1024; 18 | const int K = 1024; 19 | 20 | Input<Buffer<float>> input_a{"input_a", 3}; 21 | Input<Buffer<float>> input_b{"input_b", 3}; 22 | 23 | Output<Buffer<float>> output{"output", 3}; 24 | 25 | void generate() { 26 | Var x("x"), y("y"), b("b"), bo("bo"), xo("xo"), yo("yo"), xoa("xoa"), yoa("yoa"), yoai("yoai"), xi("xi"); 27 | Var xii("xii"); 28 | Var yi("yi"); 29 | 30 | // Algorithm 31 | RDom k(0, K); 32 | 33 | 34 | Func func("func"), Bs("Bs");//,As("input_b_im#interleave"); 35 | //As(x,y,xo,b) = input_b(xo*16+x,y,b); 36 | func(xi, y, b) = 0.0f; 37 | func(xi, y, b) += input_a(k, y, b) * input_b(xi, k, b); 38 | output(xi, y, b) = func(xi, y, b); 39 | //func.trace_stores(); 40 | 
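// The bound/set_bounds/set_stride calls below promise the compiler a dense,
// fully specified layout: bound() fixes each output Var's extent, while
// set_bounds()/set_stride() fix the extent and stride of every raw buffer
// dimension, letting the generated code drop runtime bounds checks.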
output.bound(xi, 0, M) 42 | .bound(y, 0, N) 43 | .bound(b, 0, B); 44 | input_a.dim(0).set_bounds(0, K).set_stride(1) 45 | .dim(1).set_bounds(0, N).set_stride(K) 46 | .dim(2).set_bounds(0, B).set_stride(K * N); 47 | 48 | input_b.dim(0).set_bounds(0, M).set_stride(1) 49 | .dim(1).set_bounds(0, K).set_stride(M) 50 | .dim(2).set_bounds(0, B).set_stride(M * K); 51 | 52 | output.dim(0).set_bounds(0, M).set_stride(1) 53 | .dim(1).set_bounds(0, N).set_stride(M) 54 | .dim(2).set_bounds(0, B).set_stride(M * N); 55 | //Br.print_loop_nest(); 56 | //func.print_loop_nest(); 57 | } 58 | }; 59 | 60 | } // namespace 61 | 62 | HALIDE_REGISTER_GENERATOR(BatchMatmul, demo) 63 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/featurization_to_sample.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | // A sample is a featurization + a runtime + some ids, all together in one file. 7 | // This utility concats the runtime and ids onto a featurization to produce a sample. 8 | int main(int argc, char **argv) { 9 | if (argc != 6) { 10 | std::cout << "Usage: featurization_to_sample in.featurization runtime pipeline_id schedule_id out.sample\n"; 11 | return -1; 12 | } 13 | 14 | std::ifstream src(argv[1], std::ios::binary); 15 | if (!src) { 16 | std::cerr << "Unable to open input file: " << argv[1] << "\n"; 17 | return -1; 18 | } 19 | 20 | std::ofstream dst(argv[5], std::ios::binary); 21 | if (!dst) { 22 | std::cerr << "Unable to open output file: " << argv[5] << "\n"; 23 | return -1; 24 | } 25 | 26 | dst << src.rdbuf(); 27 | 28 | // Input runtime value is presumed to be in seconds, 29 | // but sample file stores times in milliseconds. 30 | float r = atof(argv[2]) * 1000.f; 31 | int32_t pid = atoi(argv[3]); 32 | int32_t sid = atoi(argv[4]); 33 | 34 | dst.write((const char *)&r, 4); 35 | dst.write((const char *)&pid, 4); 36 | dst.write((const char *)&sid, 4); 37 | 38 | src.close(); 39 | dst.close(); 40 | 41 | return 0; 42 | } 43 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/get_host_target.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | using namespace Halide; 4 | 5 | // Print the host target to stdout. 6 | // Any extra arguments are assumed to be features that should be stripped from 7 | // the target (as a convenience for use in Makefiles, where string manipulation 8 | // can be painful). 
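// Example (hypothetical invocation, assuming an x86-64 Linux host):
//   ./get_host_target avx512
// would print something like "x86-64-linux-avx-avx2-f16c-fma-sse41",
// i.e. the host target string with the named feature stripped.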
9 | int main(int argc, char **argv) { 10 | Target t = get_host_target(); 11 | for (int i = 1; i < argc; ++i) { 12 | auto f = Target::feature_from_name(argv[i]); 13 | if (f == Target::FeatureEnd) { 14 | fprintf(stderr, "Unknown feature: %s\n", argv[i]); 15 | exit(1); 16 | } 17 | t = t.without_feature(f); 18 | } 19 | printf("%s", t.to_string().c_str()); 20 | return 0; 21 | } 22 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/included_schedule_file.schedule.h: -------------------------------------------------------------------------------- 1 | 2 | #ifndef included_schedule_file_SCHEDULE_H 3 | #define included_schedule_file_SCHEDULE_H 4 | 5 | // MACHINE GENERATED -- DO NOT EDIT 6 | // This schedule was automatically generated by apps/autoscheduler/AutoSchedule 7 | // for target=x86-64-osx-avx-avx2-f16c-fma-sse41 8 | // with machine_params=16,16777216,40 9 | 10 | #include "Halide.h" 11 | 12 | inline void apply_schedule_included_schedule_file( 13 | ::Halide::Pipeline pipeline, 14 | ::Halide::Target target) { 15 | using ::Halide::Func; 16 | using ::Halide::MemoryType; 17 | using ::Halide::RVar; 18 | using ::Halide::TailStrategy; 19 | using ::Halide::Var; 20 | 21 | Func relu = pipeline.get_func(4); 22 | Func conv = pipeline.get_func(3); 23 | Var c(relu.get_schedule().dims()[0].var); 24 | Var ci("ci"); 25 | Var n(relu.get_schedule().dims()[3].var); 26 | Var x(relu.get_schedule().dims()[1].var); 27 | Var xi("xi"); 28 | Var y(relu.get_schedule().dims()[2].var); 29 | Var yi("yi"); 30 | RVar r4_x(conv.update(0).get_schedule().dims()[0].var); 31 | RVar r4_y(conv.update(0).get_schedule().dims()[1].var); 32 | RVar r4_z(conv.update(0).get_schedule().dims()[2].var); 33 | relu 34 | .split(x, x, xi, 2, TailStrategy::ShiftInwards) 35 | .split(c, c, ci, 8, TailStrategy::ShiftInwards) 36 | .split(y, y, yi, 4, TailStrategy::ShiftInwards) 37 | .unroll(xi) 38 | .unroll(yi) 39 | .vectorize(ci) 40 | .compute_root() 41 | .reorder(ci, xi, yi, c, y, x, n) 42 | .fuse(x, n, x) 43 | .parallel(x); 44 | conv.update(0) 45 | .split(c, c, ci, 8, TailStrategy::GuardWithIf) 46 | .unroll(x) 47 | .unroll(y) 48 | .vectorize(ci) 49 | .reorder(ci, c, x, y, n, r4_x, r4_y, r4_z); 50 | conv 51 | .store_in(MemoryType::Stack) 52 | .split(c, c, ci, 8, TailStrategy::ShiftInwards) 53 | .unroll(x) 54 | .unroll(y) 55 | .vectorize(ci) 56 | .compute_at(relu, c) 57 | .reorder(ci, c, x, y, n); 58 | } 59 | 60 | #endif // included_schedule_file_SCHEDULE_H 61 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/included_schedule_file_generator.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | #if defined(GENERATING_SCHEDULE) 4 | // nothing 5 | #else 6 | #include "included_schedule_file.schedule.h" 7 | #endif 8 | 9 | namespace { 10 | 11 | // Trivial Generator for testing (and demonstrating) use of .schedule.h 12 | // files produced by the autoschedulers; this is very similar to 13 | // demo_generator.cpp, but packaged separately to avoid confusion for 14 | // newcomers. 
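// (Assumed two-pass workflow: first compile this generator with
// -DGENERATING_SCHEDULE and run it with auto_schedule=true plus the
// Adams2019 plugin so the autoscheduler writes
// included_schedule_file.schedule.h; then rebuild without that define, and
// apply_schedule_included_schedule_file() replays the saved schedule
// verbatim.)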
15 | struct IncludedScheduleFile : public Halide::Generator { 16 | Input> input{"input", 4}; 17 | Input> filter{"filter", 4}; 18 | Input> bias{"bias", 1}; 19 | Output> relu{"relu", 4}; 20 | 21 | void generate() { 22 | const int N = 5, CI = 120, CO = 24, W = 100, H = 80; 23 | 24 | Var x("x"), y("y"), c("c"), n("n"); 25 | 26 | // Algorithm 27 | Func conv("conv"); 28 | RDom r(0, CI, 0, 3, 0, 3); 29 | conv(c, x, y, n) = bias(c); 30 | conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); 31 | relu(c, x, y, n) = max(0, conv(c, x, y, n)); 32 | 33 | // Estimates (for autoscheduler and/or RunGen) 34 | input.set_estimates({{0, CI}, {0, W + 2}, {0, H + 2}, {0, N}}); 35 | filter.set_estimates({{0, CO}, {0, 3}, {0, 3}, {0, CI}}); 36 | bias.set_estimates({{0, CO}}); 37 | relu.set_estimates({{0, CO}, {0, W}, {0, H}, {0, N}}); 38 | 39 | // Schedule 40 | if (auto_schedule) { 41 | // nothing 42 | } else { 43 | #if defined(GENERATING_SCHEDULE) 44 | abort(); 45 | #else 46 | apply_schedule_included_schedule_file(get_pipeline(), get_target()); 47 | #endif 48 | } 49 | } 50 | }; 51 | 52 | } // namespace 53 | 54 | HALIDE_REGISTER_GENERATOR(IncludedScheduleFile, included_schedule_file) 55 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/test_perfect_hash_map.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "PerfectHashMap.h" 9 | 10 | using std::map; 11 | using std::vector; 12 | 13 | struct Key { 14 | int id, max_id; 15 | Key(int i, int m) 16 | : id(i), max_id(m) { 17 | } 18 | }; 19 | 20 | int main(int argc, char **argv) { 21 | std::mt19937 rng(0); 22 | int seed = argc > 1 ? atoi(argv[1]) : time(nullptr); 23 | rng.seed(seed); 24 | printf("seed: %d\n", seed); 25 | 26 | PerfectHashMap h; 27 | std::map ref; 28 | 29 | std::vector keys; 30 | const int N = 100; 31 | 32 | for (int i = 0; i < N; i++) { 33 | keys.emplace_back(i, N); 34 | } 35 | std::shuffle(keys.begin(), keys.end(), rng); 36 | 37 | for (int i = 0; i < 10000; i++) { 38 | // Insert. Possibly a duplicate of an existing item. 
39 | int next = rng() % N; 40 | h.insert(&keys[next], next); 41 | ref.insert({&keys[next], next}); 42 | 43 | // Check the map and hash map contain the same stuff in the same order 44 | if (h.size() != ref.size()) { 45 | fprintf(stderr, "Size mismatch: %d vs %d\n", (int)h.size(), (int)ref.size()); 46 | return -1; 47 | } 48 | // Use iterators to convert PerfectHashMap to map and compare to reference map 49 | decltype(ref) h_map; 50 | for (auto it = h.begin(); it != h.end(); it++) { 51 | h_map.insert({it.key(), it.value()}); 52 | } 53 | 54 | auto it = h_map.begin(); 55 | auto ref_it = ref.begin(); 56 | while (it != h_map.end()) { 57 | if (it->first != ref_it->first) { 58 | fprintf(stderr, "Key mismatch: %p vs %p\n", (const void *)it->first, (const void *)ref_it->first); 59 | return -1; 60 | } 61 | if (it->second != ref_it->second) { 62 | fprintf(stderr, "Value mismatch: %d vs %d\n", it->second, ref_it->second); 63 | return -1; 64 | } 65 | it++; 66 | ref_it++; 67 | } 68 | } 69 | printf("Perfect hash map test passed\n"); 70 | return 0; 71 | } 72 | -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/updated.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/AutoSearch/src/adams2019/updated.weights -------------------------------------------------------------------------------- /AutoSearch/src/adams2019/weightsdir_to_weightsfile.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "Weights.h" 7 | 8 | // Utility to convert from the old dir-of-raw-data into a new .weights file. 9 | // Should live only long enough for downstream users to convert existing data files 10 | // to the new format. 
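// Example (paths are illustrative):
//   weightsdir_to_weightsfile samples/weights_dir updated.weights
// where the input directory holds the six raw filter/bias files that
// Weights::load_from_dir expects (see Weights.h).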
11 | int main(int argc, char **argv) { 12 | if (argc != 3) { 13 | std::cout << "Usage: weights_dir weights_file.weights\n"; 14 | return -1; 15 | } 16 | 17 | Halide::Internal::Weights w; 18 | if (!w.load_from_dir(argv[1])) { 19 | std::cerr << "Unable to read input dir: " << argv[1] << "\n"; 20 | return -1; 21 | } 22 | 23 | if (!w.save_to_file(argv[2])) { 24 | std::cerr << "Unable to save output file: " << argv[2] << "\n"; 25 | return -1; 26 | } 27 | 28 | return 0; 29 | } 30 | -------------------------------------------------------------------------------- /AutoSearch/src/common/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(Halide_Plugin INTERFACE) 2 | add_library(Halide::Plugin ALIAS Halide_Plugin) 3 | target_include_directories(Halide_Plugin INTERFACE $) 4 | target_link_libraries(Halide_Plugin INTERFACE Halide::Halide) 5 | -------------------------------------------------------------------------------- /AutoSearch/src/common/Errors.h: -------------------------------------------------------------------------------- 1 | #ifndef ERRORS_H 2 | #define ERRORS_H 3 | 4 | #include "Halide.h" 5 | 6 | #ifndef user_error 7 | #define user_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User) 8 | #endif 9 | 10 | #ifndef user_warning 11 | #define user_warning Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, Halide::Internal::ErrorReport::User | Halide::Internal::ErrorReport::Warning) 12 | #endif 13 | 14 | #ifndef user_assert 15 | #define user_assert(c) _halide_internal_assertion(c, Halide::Internal::ErrorReport::User) 16 | #endif 17 | 18 | #ifndef internal_assert 19 | #define internal_assert(c) _halide_internal_assertion(c, 0) 20 | #endif 21 | 22 | #ifndef internal_error 23 | #define internal_error Halide::Internal::ErrorReport(__FILE__, __LINE__, nullptr, 0) 24 | #endif 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /AutoSearch/src/common/HalidePlugin.h: -------------------------------------------------------------------------------- 1 | #ifndef HALIDE_HALIDEPLUGIN_H 2 | #define HALIDE_HALIDEPLUGIN_H 3 | 4 | #include "Errors.h" 5 | #include "DataTransform.h" 6 | #define REGISTER_AUTOSCHEDULER(NAME) \ 7 | struct HALIDE_EXPORT Register##NAME { \ 8 | Register##NAME() { \ 9 | debug(1) << "Registering autoscheduler '" #NAME "'...\n"; \ 10 | Pipeline::add_autoscheduler(#NAME, NAME()); \ 11 | } \ 12 | } register_##NAME; 13 | 14 | #endif //HALIDE_HALIDEPLUGIN_H 15 | -------------------------------------------------------------------------------- /AutoSearch/src/common/binary2cpp.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | #ifdef _WIN32 8 | #include // O_BINARY 9 | #include // setmode 10 | #endif 11 | 12 | // Embeds a binary blob (from stdin) in a C++ source array of unsigned 13 | // chars. Similar to the xxd utility. 
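// Example (this mirrors how src/adams2019/CMakeLists.txt embeds the weights):
//   binary2cpp baseline_weights < baseline.weights > baseline.cpp
// and with -header it emits the matching extern declarations instead:
//   binary2cpp baseline_weights -header > baseline.h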
14 | 15 | static int usage() { 16 | fprintf(stderr, "Usage: binary2cpp identifier [-header]\n"); 17 | return -1; 18 | } 19 | 20 | int main(int argc, const char **argv) { 21 | const char *target = argv[1]; 22 | if (argc == 3) { 23 | if (!strcmp(argv[2], "-header")) { 24 | printf("#ifndef _H_%s_binary2cpp\n", target); 25 | printf("#define _H_%s_binary2cpp\n", target); 26 | printf("extern \"C\" {\n"); 27 | printf("extern unsigned char %s[];\n", target); 28 | printf("extern int %s_length;\n", target); 29 | printf("} // extern \"C\"\n"); 30 | printf("#endif // _H_%s_binary2cpp\n", target); 31 | return 0; 32 | } else { 33 | return usage(); 34 | } 35 | } else if (argc > 3) { 36 | return usage(); 37 | } 38 | 39 | #ifdef _WIN32 40 | setmode(fileno(stdin), O_BINARY); // On windows bad things will happen unless we read stdin in binary mode 41 | #endif 42 | printf("extern \"C\" {\n"); 43 | printf("unsigned char %s[] = {\n", target); 44 | int count = 0; 45 | int line_break = 0; 46 | while (1) { 47 | int c = getchar(); 48 | if (c == EOF) break; 49 | printf("0x%02x, ", c); 50 | // Not necessary, but makes a bit easier to read 51 | if (++line_break > 12) { 52 | printf("\n"); 53 | line_break = 0; 54 | } 55 | count++; 56 | } 57 | printf("0};\n"); 58 | printf("int %s_length = %d;\n", target, count); 59 | printf("} // extern \"C\"\n"); 60 | return 0; 61 | } 62 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_autoscheduler(NAME Li2018 SOURCES GradientAutoscheduler.cpp) 2 | 3 | # ========================================================== 4 | # TODO(#4053): move these to a separate folder since they're tests. 5 | 6 | add_executable(demo_gradient.generator demo_generator.cpp) 7 | target_link_libraries(demo_gradient.generator PRIVATE Halide::Generator) 8 | 9 | add_halide_library(demo_gradient FROM demo_gradient.generator 10 | TARGETS cmake 11 | GENERATOR demo 12 | FUNCTION_NAME demo 13 | AUTOSCHEDULER AutoSchedule::Li2018 14 | REGISTRATION DEMO_REGISTRATION_FILE) 15 | 16 | add_executable(demo_gradient_autoscheduler ${DEMO_REGISTRATION_FILE}) 17 | target_link_libraries(demo_gradient_autoscheduler PRIVATE demo_gradient Halide::RunGenMain) 18 | 19 | add_test(NAME demo_gradient_autoscheduler 20 | COMMAND demo_gradient_autoscheduler --benchmarks=all --benchmark_min_time=1 --estimate_all) 21 | 22 | set_tests_properties(demo_gradient_autoscheduler PROPERTIES LABELS Li2018) 23 | 24 | ## 25 | 26 | if (BUILD_SHARED_LIBS) 27 | add_executable(gradient_autoscheduler_test_cpp test.cpp) 28 | target_link_libraries(gradient_autoscheduler_test_cpp PRIVATE Halide::Halide) 29 | 30 | add_test(NAME gradient_autoscheduler_test_cpp 31 | COMMAND gradient_autoscheduler_test_cpp $) 32 | 33 | set_tests_properties(gradient_autoscheduler_test_cpp PROPERTIES LABELS Li2018) 34 | endif () 35 | 36 | ## 37 | 38 | if (WITH_PYTHON_BINDINGS) 39 | # TODO(#4053): rework this as an app under python_bindings. 
40 | # TODO(#4876): Disabled due to issue #4876 41 | if (FALSE) 42 | find_package(Python3 REQUIRED COMPONENTS Interpreter Development) 43 | 44 | add_test(NAME gradient_autoscheduler_test_py 45 | COMMAND Python3::Interpreter "${CMAKE_CURRENT_SOURCE_DIR}/test.py") 46 | 47 | set(PYTHONPATH "$>") 48 | 49 | if (WIN32) 50 | set(SEP "\\$") 51 | else () 52 | set(SEP ":") 53 | endif () 54 | 55 | set(_PATH "$>;$>;$ENV{PATH}") 56 | string(REPLACE ";" "${SEP}" _PATH "${_PATH}") 57 | set_tests_properties(gradient_autoscheduler_test_py PROPERTIES 58 | LABELS Li2018 59 | ENVIRONMENT "PYTHONPATH=${PYTHONPATH};PATH=${_PATH}") 60 | endif () 61 | endif () 62 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/Makefile: -------------------------------------------------------------------------------- 1 | THIS_MAKEFILE = $(realpath $(filter %Makefile, $(MAKEFILE_LIST))) 2 | SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) 3 | HALIDE_SRC_ROOT = $(realpath $(SRC)/../../../) 4 | COMMON_DIR = $(realpath $(SRC)/../common/) 5 | 6 | # Assume an in-tree build of a halide distro exists. Most uses of this 7 | # Makefile should probably set this variable explicitly. 8 | HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib 9 | 10 | # The example uses a generator, though the autoscheduler itself does not require one 11 | include $(HALIDE_SRC_ROOT)/apps/support/Makefile.inc 12 | 13 | CXXFLAGS += -I$(COMMON_DIR) 14 | 15 | ifeq ($(UNAME), Darwin) 16 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,@loader_path' 17 | else 18 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,$$ORIGIN' 19 | endif 20 | 21 | $(BIN)/libautoschedule_li2018.$(SHARED_EXT): $(SRC)/GradientAutoscheduler.cpp $(LIB_HALIDE) 22 | @mkdir -p $(@D) 23 | $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) $^ -o $@ $(HALIDE_SYSTEM_LIBS) $(HALIDE_RPATH_FOR_LIB) 24 | 25 | # Demonstrate a JIT-based use of gradient autoscheuler 26 | $(BIN)/test: $(SRC)/test.cpp $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 27 | @mkdir -p $(@D) 28 | $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) $(SRC)/test.cpp -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) 29 | 30 | # Demonstrate a generator-based use of gradient autoscheuler 31 | $(GENERATOR_BIN)/demo.generator: $(SRC)/demo_generator.cpp $(GENERATOR_DEPS) 32 | @mkdir -p $(@D) 33 | $(CXX) $(CXXFLAGS) $(USE_EXPORT_DYNAMIC) -g $(filter-out %.h,$^) -o $@ $(LIBHALIDE_LDFLAGS) $(HALIDE_SYSTEM_LIBS) 34 | 35 | # Use the -p flag to the generator to load the autoscheduler as a plugin 36 | $(BIN)/%/demo.a: $(GENERATOR_BIN)/demo.generator $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 37 | @mkdir -p $(@D) 38 | $(GENERATOR_BIN)/demo.generator -g demo -o $(@D) -f demo target=$* auto_schedule=true -p $(BIN)/libautoschedule_li2018.$(SHARED_EXT) -s Li2018 39 | 40 | $(BIN)/%/demo.rungen: $(BIN)/%/RunGenMain.o $(BIN)/%/demo.registration.cpp $(BIN)/%/demo.a 41 | @mkdir -p $(@D) 42 | $(CXX) $(CXXFLAGS) -I$(BIN)/$* $^ -o $@ $(HALIDE_SYSTEM_LIBS) $(IMAGE_IO_FLAGS) 43 | 44 | .PHONY: build test clean run_test_cpp run_test_py test_generator 45 | 46 | # demonstrates single-shot use of the autoscheduler 47 | test_generator: $(BIN)/$(HL_TARGET)/demo.rungen $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 48 | $< --benchmarks=all --benchmark_min_time=1 --estimate_all 49 | 50 | run_test_cpp: $(BIN)/test 51 | LD_LIBRARY_PATH=$(BIN) $< $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 52 | 53 | run_test_py: $(SRC)/test.py $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 54 | 
PYTHONPATH=$(BIN):$(HALIDE_PYTHON_BINDINGS_PATH):$(HALIDE_DISTRIB_PATH)/bin:$$PYTHONPATH \ 55 | LD_LIBRARY_PATH=$(BIN):$(HALIDE_PYTHON_BINDINGS_PATH):$(HALIDE_DISTRIB_PATH)/bin \ 56 | $(PYTHON) $(SRC)/test.py 57 | 58 | build: $(BIN)/test $(BIN)/$(HL_TARGET)/demo.rungen $(BIN)/libautoschedule_li2018.$(SHARED_EXT) 59 | 60 | test: run_test_cpp run_test_py test_generator 61 | 62 | clean: 63 | rm -rf $(BIN) 64 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/README.md: -------------------------------------------------------------------------------- 1 | This is a conservative autoscheduler that applies `compute_root` to most Funcs except for 2 | the trivial ones (think of it as a -O1 optimizer for Halide). It recognizes 3 | large reduction patterns and uses `rfactor` or `atomic` to parallelize 4 | associative reductions when there's not enough parallelism in the pure variable 5 | domain. This strategy works reasonably well for gradient pipelines, and is 6 | suitable as a default option for decent but not optimal performance. This is 7 | also currently the only autoscheduler that generates GPU schedules. 8 | 9 | Running some benchmarks in the app directory gives the following statistics (all 10 | use `halide_reuse_device_allocations(nullptr, true)` for GPU): 11 | 12 | | app | manual (CPU) | gradient-autoscheduler (CPU) | manual (GPU) | gradient-autoscheduler (GPU) | 13 | | ---------------- | ------------ | ---------------------------- | ------------ | ---------------------------- | 14 | | bilateral filter | 7.93 ms | 12.92 ms | 0.29 ms | 1.05 ms | 15 | | camera_pipe | 8823.33 us | 25126 us | 605.03 us | 3347.44 us | 16 | | lens_blur | 7.77 ms | 22.41 ms | 0.73 ms | 5.60 ms | 17 | | local_laplacian | 42.29 ms | 128.31 ms | 0.81 ms | 14.30 ms | 18 | | nl_means | 145.003 ms | out-of-memory | N/A | 82.93 ms | 19 | | conv_layer | 15.46 ms | 6.89 ms | N/A | 1.90 ms | 20 | | stencil_chain | 18.86 ms | 21.46 ms | N/A | 6.35 ms | 21 | 22 | Tested on an 8-core Intel CPU (16 threads with HT) and a TITAN Xp. 23 | 24 | See `test.cpp` and `demo_generator.cpp` for how to use this autoscheduler. It 25 | can also be used with Python bindings. Compile with 26 | 27 | ``` 28 | WITH_PYTHON=1 make 29 | ``` 30 | and see `test.py` for usage. 
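A minimal JIT sketch, condensed from `test.cpp` (the plugin filename and path
are build-dependent assumptions):

```
#include "Halide.h"
#include <iostream>
using namespace Halide;

int main() {
    // Register the Li2018 autoscheduler with libHalide (path assumed).
    load_plugin("libautoschedule_li2018.so");

    MachineParams params(32, 16000000, 40);  // cores, cache size, balance
    Target target;

    Var x("x");
    Func in("in"), f("f");
    in(x) = cast<float>(x);
    f(x) = sin(2.f * in(x));
    f.set_estimate(x, 0, 10000);  // estimates drive the scheduling decisions

    AutoSchedulerResults result = Pipeline(f).auto_schedule(target, params);
    std::cout << result.schedule_source << "\n";
    return 0;
}
```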
32 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/demo_generator.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | namespace { 4 | 5 | using namespace Halide; 6 | 7 | class ConvRelu : public Halide::Generator { 8 | public: 9 | Input> input{"input", 4}; 10 | Input> filter{"filter", 4}; 11 | Input> bias{"bias", 1}; 12 | Output> relu{"relu", 4}; 13 | 14 | void generate() { 15 | const int N = 5, CI = 120, CO = 24, W = 100, H = 80; 16 | 17 | Var x("x"), y("y"), c("c"), n("n"); 18 | 19 | Func conv("conv"); 20 | RDom r(0, CI, 0, 3, 0, 3); 21 | conv(c, x, y, n) = bias(c); 22 | conv(c, x, y, n) += filter(c, r.y, r.z, r.x) * input(r.x, x + r.y, y + r.z, n); 23 | relu(c, x, y, n) = max(0, conv(c, x, y, n)); 24 | 25 | relu.bound(c, 0, CO) 26 | .bound(x, 0, W) 27 | .bound(y, 0, H) 28 | .bound(n, 0, N); 29 | 30 | relu.dim(0).set_bounds(0, CO).set_stride(1); 31 | relu.dim(1).set_bounds(0, W).set_stride(CO); 32 | relu.dim(2).set_bounds(0, H).set_stride(CO * W); 33 | relu.dim(3).set_bounds(0, N).set_stride(CO * H * W); 34 | 35 | input.dim(0).set_bounds(0, CI).set_stride(1); 36 | input.dim(1).set_bounds(0, W + 2).set_stride(CI); 37 | input.dim(2).set_bounds(0, H + 2).set_stride(CI * (W + 2)); 38 | input.dim(3).set_bounds(0, N).set_stride(CI * (W + 2) * (H + 2)); 39 | 40 | filter.dim(0).set_bounds(0, CO).set_stride(1); 41 | filter.dim(1).set_bounds(0, 3).set_stride(CO); 42 | filter.dim(2).set_bounds(0, 3).set_stride(CO * 3); 43 | filter.dim(3).set_bounds(0, CI).set_stride(CO * 3 * 3); 44 | 45 | bias.dim(0).set_bounds(0, CO).set_stride(1); 46 | } 47 | }; 48 | 49 | } // namespace 50 | 51 | HALIDE_REGISTER_GENERATOR(ConvRelu, demo) 52 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/test.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | using namespace Halide; 4 | 5 | int main(int argc, char **argv) { 6 | if (argc != 2) { 7 | fprintf(stderr, "Usage: %s \n", argv[0]); 8 | return 1; 9 | } 10 | 11 | load_plugin(argv[1]); 12 | 13 | MachineParams params(32, 16000000, 40); 14 | Target target; 15 | 16 | Var x("x"), y("y"); 17 | 18 | { // Simple 1D pointwise operations. Should inline. 19 | Func in("in"); 20 | in(x) = cast(x); 21 | Func f0("f0"); 22 | f0(x) = 2.f * in(x); 23 | Func f1("f1"); 24 | f1(x) = sin(f0(x)); 25 | Func f2("f2"); 26 | f2(x) = f1(x) * f1(x); 27 | 28 | f2.set_estimate(x, 0, 10000); 29 | 30 | AutoSchedulerResults result = 31 | Pipeline(f2).auto_schedule(target, params); 32 | std::cout << "Schedule for 1D pointwise operations:\n" 33 | << result.schedule_source << "\n\n"; 34 | } 35 | 36 | { // Simple 2D pointwise operations. Should inline. 37 | Func in("in"); 38 | in(x, y) = cast(x + y); 39 | Func f0("f0"); 40 | f0(x, y) = 2.f * in(x, y); 41 | Func f1("f1"); 42 | f1(x, y) = sin(f0(x, y)); 43 | Func f2("f2"); 44 | f2(x, y) = f1(x, y) * f1(x, y); 45 | 46 | f2.set_estimate(x, 0, 1000) 47 | .set_estimate(y, 0, 1000); 48 | 49 | AutoSchedulerResults result = 50 | Pipeline(f2).auto_schedule(target, params); 51 | std::cout << "Schedule for 2D pointwise operations:\n" 52 | << result.schedule_source << "\n\n"; 53 | } 54 | 55 | { // 1D Convolution. 
56 | Func in("in"); 57 | in(x) = cast(x); 58 | RDom r(0, 5); 59 | Func f0("f0"); 60 | f0(x) += in(x + r) / 5.f; 61 | 62 | f0.set_estimate(x, 0, 1000); 63 | 64 | AutoSchedulerResults result = 65 | Pipeline(f0).auto_schedule(target, params); 66 | std::cout << "Schedule for 1D convolution:\n" 67 | << result.schedule_source << "\n\n"; 68 | } 69 | 70 | { // 2D Convolution. 71 | Func in("in"); 72 | in(x, y) = cast(x + y); 73 | RDom r(0, 5, 0, 5); 74 | Func f0("f0"); 75 | f0(x, y) += in(x + r.x, y + r.y) / 25.f; 76 | 77 | f0.set_estimate(x, 0, 1000) 78 | .set_estimate(y, 0, 1000); 79 | 80 | AutoSchedulerResults result = 81 | Pipeline(f0).auto_schedule(target, params); 82 | std::cout << "Schedule for 2D convolution:\n" 83 | << result.schedule_source << "\n\n"; 84 | } 85 | 86 | { // 1D Histogram. 87 | Func in("in"); 88 | in(x) = x % 10; 89 | RDom r(0, 1000); 90 | Func hist("hist"); 91 | hist(x) = 0; 92 | hist(clamp(in(r), 0, 10)) += 1; 93 | 94 | hist.set_estimate(x, 0, 10); 95 | 96 | AutoSchedulerResults result = 97 | Pipeline(hist).auto_schedule(target, params); 98 | std::cout << "Schedule for 1D histogram:\n" 99 | << result.schedule_source << "\n\n"; 100 | } 101 | 102 | { // 2D Histogram. 103 | Func in("in"); 104 | in(x, y) = (x + y) % 10; 105 | RDom r(0, 1000, 0, 1000); 106 | Func hist("hist"); 107 | hist(x) = 0; 108 | hist(clamp(in(r.x, r.y), 0, 10)) += 1; 109 | 110 | hist.set_estimate(x, 0, 10); 111 | 112 | AutoSchedulerResults result = 113 | Pipeline(hist).auto_schedule(target, params); 114 | std::cout << "Schedule for 2D histogram:\n" 115 | << result.schedule_source << "\n\n"; 116 | } 117 | 118 | { // 2D Histogram, but the domain is much larger. 119 | Func in("in"); 120 | in(x, y) = (x + y) % 10000; 121 | RDom r(0, 1000, 0, 1000); 122 | Func hist("hist"); 123 | hist(x) = 0; 124 | hist(clamp(in(r.x, r.y), 0, 10000)) += 1; 125 | 126 | hist.set_estimate(x, 0, 10000); 127 | 128 | AutoSchedulerResults result = 129 | Pipeline(hist).auto_schedule(target, params); 130 | std::cout << "Schedule for 2D histogram with larger domain:\n" 131 | << result.schedule_source << "\n\n"; 132 | } 133 | 134 | { // Test for conjunction use of bound and estimates. 
135 | Func in("in"); 136 | in(x, y) = cast(x + y); 137 | Func f0("f0"); 138 | f0(x, y) = 2.f * in(x, y); 139 | Func f1("f1"); 140 | f1(x, y) = sin(f0(x, y)); 141 | Func f2("f2"); 142 | f2(x, y) = f1(x, y) * f1(x, y); 143 | 144 | f2.bound(x, 0, 4); 145 | // make sure it also works if we reverse the estimate order 146 | f2.set_estimate(y, 0, 1024) 147 | .set_estimate(x, 0, 4); 148 | 149 | AutoSchedulerResults result = 150 | Pipeline(f2).auto_schedule(target, params); 151 | std::cout << "Schedule for 2D pointwise operations with small x dimension:\n" 152 | << result.schedule_source << "\n\n"; 153 | } 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /AutoSearch/src/li2018/test.py: -------------------------------------------------------------------------------- 1 | import halide as hl 2 | 3 | def main(): 4 | hl.load_plugin("autoschedule_li2018") 5 | 6 | x = hl.Var('x') 7 | f_in = hl.Func('in') 8 | f_in[x] = hl.f32(x) # Cast to float 32 9 | f_0 = hl.Func('f_0') 10 | f_0[x] = 2 * f_in[x] 11 | f_1 = hl.Func('f_1') 12 | f_1[x] = hl.sin(f_0[x]) 13 | f_2 = hl.Func('f_2') 14 | f_2[x] = f_1[x] * f_1[x] 15 | 16 | # Setup 17 | f_2.set_estimate(x, 0, 1000) 18 | p = hl.Pipeline(f_2) 19 | target = hl.Target() 20 | # Only first parameter is used (number of cores on CPU) 21 | params = hl.MachineParams(32, 0, 0); 22 | result = p.auto_schedule('Li2018', target, params) 23 | print('Schedule:') 24 | print(result.schedule_source) 25 | 26 | p.compile_jit() # compile 27 | buf = p.realize(1000) # compute and get the buffer 28 | 29 | if __name__ == '__main__': 30 | main() 31 | -------------------------------------------------------------------------------- /AutoSearch/src/mullapudi2016/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_autoscheduler(NAME Mullapudi2016 SOURCES AutoSchedule.cpp) 2 | -------------------------------------------------------------------------------- /AutoSearch/src/mullapudi2016/Makefile: -------------------------------------------------------------------------------- 1 | THIS_MAKEFILE = $(realpath $(filter %Makefile, $(MAKEFILE_LIST))) 2 | SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) 3 | HALIDE_ROOT = $(realpath $(SRC)/../../../) 4 | COMMON_DIR = $(realpath $(SRC)/../common/) 5 | 6 | HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib 7 | include $(HALIDE_ROOT)/apps/support/Makefile.inc 8 | 9 | # Add the relative location of libHalide.so in the rpath in a distro so that the autoscheduler library can find libHalide 10 | ifeq ($(UNAME), Darwin) 11 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,@loader_path' 12 | else 13 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,$$ORIGIN' 14 | endif 15 | 16 | CXXFLAGS += -I$(COMMON_DIR) 17 | 18 | $(BIN)/libautoschedule_mullapudi2016.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp $(LIB_HALIDE) 19 | @mkdir -p $(@D) 20 | $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) $^ -o $@ $(HALIDE_RPATH_FOR_LIB) 21 | -------------------------------------------------------------------------------- /AutoSearch/src/sioutas2020/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_autoscheduler(NAME Sioutas20 SOURCES AutoSchedule.cpp) 2 | -------------------------------------------------------------------------------- /AutoSearch/src/sioutas2020/Makefile: -------------------------------------------------------------------------------- 1 | THIS_MAKEFILE = $(realpath 
$(filter %Makefile, $(MAKEFILE_LIST))) 2 | SRC = $(strip $(shell dirname $(THIS_MAKEFILE))) 3 | HALIDE_ROOT = $(realpath $(SRC)/../../../) 4 | COMMON_DIR = $(realpath $(SRC)/../common/) 5 | 6 | HALIDE_DISTRIB_PATH ?= $(HALIDE_SRC_ROOT)/distrib 7 | include $(HALIDE_ROOT)/apps/support/Makefile.inc 8 | 9 | # Add the relative location of libHalide.so in the rpath in a distro so that the autoscheduler library can find libHalide 10 | ifeq ($(UNAME), Darwin) 11 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,@loader_path' 12 | else 13 | HALIDE_RPATH_FOR_LIB += '-Wl,-rpath,$$ORIGIN' 14 | endif 15 | 16 | CXXFLAGS += -I$(COMMON_DIR) 17 | 18 | $(BIN)/libautoschedule_sioutas2020.$(SHARED_EXT): $(SRC)/AutoSchedule.cpp $(LIB_HALIDE) 19 | @mkdir -p $(@D) 20 | $(CXX) -shared $(USE_EXPORT_DYNAMIC) -fPIC -fvisibility=hidden -fvisibility-inlines-hidden $(CXXFLAGS) $(OPTIMIZE) $^ -o $@ $(HALIDE_RPATH_FOR_LIB) 21 | -------------------------------------------------------------------------------- /AutoSearch/src/sioutas2020/test.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | using namespace Halide; 4 | 5 | int main(int argc, char **argv) { 6 | if (argc != 2) { 7 | fprintf(stderr, "Usage: %s \n", argv[0]); 8 | return 1; 9 | } 10 | 11 | load_plugin(argv[1]); 12 | 13 | MachineParams params(32, 16000000, 40); 14 | Target target; 15 | 16 | Var x("x"), y("y"); 17 | 18 | { // Simple 1D pointwise operations. Should inline. 19 | Func in("in"); 20 | in(x) = cast(x); 21 | Func f0("f0"); 22 | f0(x) = 2.f * in(x); 23 | Func f1("f1"); 24 | f1(x) = sin(f0(x)); 25 | Func f2("f2"); 26 | f2(x) = f1(x) * f1(x); 27 | 28 | f2.set_estimate(x, 0, 10000); 29 | 30 | AutoSchedulerResults result = 31 | Pipeline(f2).auto_schedule(target, params); 32 | std::cout << "Schedule for 1D pointwise operations:\n" 33 | << result.schedule_source << "\n\n"; 34 | } 35 | 36 | { // Simple 2D pointwise operations. Should inline. 37 | Func in("in"); 38 | in(x, y) = cast(x + y); 39 | Func f0("f0"); 40 | f0(x, y) = 2.f * in(x, y); 41 | Func f1("f1"); 42 | f1(x, y) = sin(f0(x, y)); 43 | Func f2("f2"); 44 | f2(x, y) = f1(x, y) * f1(x, y); 45 | 46 | f2.set_estimate(x, 0, 1000) 47 | .set_estimate(y, 0, 1000); 48 | 49 | AutoSchedulerResults result = 50 | Pipeline(f2).auto_schedule(target, params); 51 | std::cout << "Schedule for 2D pointwise operations:\n" 52 | << result.schedule_source << "\n\n"; 53 | } 54 | 55 | { // 1D Convolution. 56 | Func in("in"); 57 | in(x) = cast(x); 58 | RDom r(0, 5); 59 | Func f0("f0"); 60 | f0(x) += in(x + r) / 5.f; 61 | 62 | f0.set_estimate(x, 0, 1000); 63 | 64 | AutoSchedulerResults result = 65 | Pipeline(f0).auto_schedule(target, params); 66 | std::cout << "Schedule for 1D convolution:\n" 67 | << result.schedule_source << "\n\n"; 68 | } 69 | 70 | { // 2D Convolution. 71 | Func in("in"); 72 | in(x, y) = cast(x + y); 73 | RDom r(0, 5, 0, 5); 74 | Func f0("f0"); 75 | f0(x, y) += in(x + r.x, y + r.y) / 25.f; 76 | 77 | f0.set_estimate(x, 0, 1000) 78 | .set_estimate(y, 0, 1000); 79 | 80 | AutoSchedulerResults result = 81 | Pipeline(f0).auto_schedule(target, params); 82 | std::cout << "Schedule for 2D convolution:\n" 83 | << result.schedule_source << "\n\n"; 84 | } 85 | 86 | { // 1D Histogram. 
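// Unlike the stencils above, a histogram scatters into a data-dependent
        // location (hist(clamp(in(r), 0, 10)) += 1), so the update cannot be
        // naively vectorized or parallelized over r without extra machinery such
        // as atomics or rfactor; this case probes how the autoscheduler copes
        // with reductions whose store index is not affine in the loop variables.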
87 | Func in("in"); 88 | in(x) = x % 10; 89 | RDom r(0, 1000); 90 | Func hist("hist"); 91 | hist(x) = 0; 92 | hist(clamp(in(r), 0, 10)) += 1; 93 | 94 | hist.set_estimate(x, 0, 10); 95 | 96 | AutoSchedulerResults result = 97 | Pipeline(hist).auto_schedule(target, params); 98 | std::cout << "Schedule for 1D histogram:\n" 99 | << result.schedule_source << "\n\n"; 100 | } 101 | 102 | { // 2D Histogram. 103 | Func in("in"); 104 | in(x, y) = (x + y) % 10; 105 | RDom r(0, 1000, 0, 1000); 106 | Func hist("hist"); 107 | hist(x) = 0; 108 | hist(clamp(in(r.x, r.y), 0, 10)) += 1; 109 | 110 | hist.set_estimate(x, 0, 10); 111 | 112 | AutoSchedulerResults result = 113 | Pipeline(hist).auto_schedule(target, params); 114 | std::cout << "Schedule for 2D histogram:\n" 115 | << result.schedule_source << "\n\n"; 116 | } 117 | 118 | { // 2D Histogram, but the domain is much larger. 119 | Func in("in"); 120 | in(x, y) = (x + y) % 10000; 121 | RDom r(0, 1000, 0, 1000); 122 | Func hist("hist"); 123 | hist(x) = 0; 124 | hist(clamp(in(r.x, r.y), 0, 10000)) += 1; 125 | 126 | hist.set_estimate(x, 0, 10000); 127 | 128 | AutoSchedulerResults result = 129 | Pipeline(hist).auto_schedule(target, params); 130 | std::cout << "Schedule for 2D histogram with larger domain:\n" 131 | << result.schedule_source << "\n\n"; 132 | } 133 | 134 | { // Test for conjunction use of bound and estimates. 135 | Func in("in"); 136 | in(x, y) = cast(x + y); 137 | Func f0("f0"); 138 | f0(x, y) = 2.f * in(x, y); 139 | Func f1("f1"); 140 | f1(x, y) = sin(f0(x, y)); 141 | Func f2("f2"); 142 | f2(x, y) = f1(x, y) * f1(x, y); 143 | 144 | f2.bound(x, 0, 4); 145 | // make sure it also works if we reverse the estimate order 146 | f2.set_estimate(y, 0, 1024) 147 | .set_estimate(x, 0, 4); 148 | 149 | AutoSchedulerResults result = 150 | Pipeline(f2).auto_schedule(target, params); 151 | std::cout << "Schedule for 2D pointwise operations with small x dimension:\n" 152 | << result.schedule_source << "\n\n"; 153 | } 154 | return 0; 155 | } 156 | -------------------------------------------------------------------------------- /AutoSearch/toolkit/shape_config.py: -------------------------------------------------------------------------------- 1 | 2 | """ Shape configurations for single operator and subgraph evaluation """ 3 | 4 | matmul_args = [ 5 | [1, 512, 512, 512], 6 | # [1, 256, 256, 256], 7 | ] 8 | 9 | matmul_shapes = [ 10 | [[512, 512, 1],[512, 512, 1],[512, 512, 1]], 11 | # [[256, 256, 1],[256, 256, 1],[256, 256, 1]], 12 | ] 13 | 14 | shape_dict = { 15 | 'matmul': matmul_shapes, 16 | } 17 | 18 | args_dict = { 19 | 'matmul': matmul_args, 20 | } -------------------------------------------------------------------------------- /AutoSearch/toolkit/template/demo_eval.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include "HalideBuffer.h" 5 | 6 | #include "demo_1_512_512_512.h" 7 | using namespace std; 8 | 9 | #define ZERO 0 10 | #define ONE 1 11 | #define RAND 2 12 | 13 | #ifndef MAX 14 | #define MAX(a, b) (((a) > (b)) ? (a) : (b)) 15 | #endif 16 | 17 | #ifndef FABS 18 | #define FABS(a) (((a) > 0) ? 
(a) : (-a)) 19 | #endif 20 | 21 | // For x86-64-linux: 22 | //g++ demo_eval.cpp demo.s -I $HALIDE_HOME/include -ldl -lpthread -o demo_eval 23 | 24 | // For arm-64-linux: 25 | //aarch64-linux-gnu-g++ demo_eval.cpp demo.s -I $HALIDE_HOME/include -ldl -lpthread -o demo_eval 26 | 27 | const int B=1; 28 | const int N=512; 29 | const int M=512; 30 | const int K=512; 31 | 32 | void ref_func(float*data_a,float*data_b,float*data_c) 33 | { 34 | for(int b=0;b Halide_A((float*)a, K,M,B); 89 | Halide::Runtime::Buffer Halide_B((float*)b, N,K,B); 90 | Halide::Runtime::Buffer Halide_C((float*)halide_c, N,M,B); 91 | 92 | matmul(Halide_A,Halide_B,Halide_C); 93 | ref_func(a,b,ref_c); 94 | 95 | if (maxerr(ref_c,halide_c,M*N*B)<0.001) 96 | { 97 | cout<<"Correctness check passed!"< 5 | #include "vector" 6 | #include "iostream" 7 | using Halide::Runtime::Buffer; 8 | using namespace std::chrono; 9 | using namespace std; 10 | void init(Buffer &B) 11 | { 12 | for (auto iter=B.begin();iter!=B.end();iter++) 13 | { 14 | (*iter) = rand()*1.0/RAND_MAX; 15 | } 16 | } 17 | int main(int argc, char **argv) { 18 | { 19 | INPUT_TEMPLATE; 20 | INIT_INPUT; 21 | const auto benchmark_inner = [&]() { 22 | FUNC(DEMO_ARGS); 23 | OUTPUT.device_sync(); 24 | }; 25 | double benchmark_min_time=0.1f; 26 | Halide::Tools::BenchmarkConfig config; 27 | config.min_time = benchmark_min_time; 28 | config.max_time = benchmark_min_time * 4; 29 | double total_time = 0.0f; 30 | steady_clock::time_point start = steady_clock::now(); 31 | for (int i=0;i time_span = duration_cast>(end - start); 44 | //double t = Halide::Tools::benchmark(SAMPLES,ITERATORS,benchmark_inner); 45 | std::cout<<"autokernel time:\t"< /etc/timezone 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | g++ \ 8 | git \ 9 | wget \ 10 | cmake \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | clang-tools-10 \ 16 | lld-10 \ 17 | llvm-10-dev \ 18 | libclang-10-dev \ 19 | liblld-10-dev \ 20 | libpng-dev \ 21 | libjpeg-dev \ 22 | libgl-dev \ 23 | python3-numpy \ 24 | python3-scipy \ 25 | python3-imageio \ 26 | python3-pybind11 \ 27 | libopenblas-dev \ 28 | libeigen3-dev \ 29 | libatlas-base-dev \ 30 | doxygen \ 31 | ninja-build \ 32 | ca-certificates && \ 33 | rm -rf /var/lib/apt/lists/* && \ 34 | ln -s /usr/bin/python3 /usr/bin/python 35 | 36 | RUN pip3 install --upgrade cmake pip jupyter 37 | WORKDIR /workspace 38 | 39 | RUN git clone --branch v10.0.0 https://github.com/halide/Halide.git && \ 40 | cd Halide && \ 41 | mkdir halide-build && \ 42 | cd halide-build && \ 43 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/usr/lib/llvm-10/cmake .. && \ 44 | make -j$(nproc) install 45 | 46 | RUN git clone --branch tengine-lite https://github.com/OAID/Tengine.git && \ 47 | cd Tengine && \ 48 | mkdir build && \ 49 | cd build && \ 50 | cmake .. 
&& \ 51 | make install -j$(nproc) 52 | 53 | ENV PYTHONPATH "${PYTHONPATH}:/workspace/Halide/halide-build/python_bindings/src" 54 | ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/usr/local/lib:/workspace/Tengine/build/install/lib" 55 | -------------------------------------------------------------------------------- /Dockerfile/Dockerfile.cuda: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:10.1-devel-ubuntu18.04 2 | 3 | ENV TZ=Asia/Shanghai 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | g++ \ 8 | git \ 9 | wget \ 10 | cmake \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | clang-tools-10 \ 16 | lld-10 \ 17 | llvm-10-dev \ 18 | libclang-10-dev \ 19 | liblld-10-dev \ 20 | libpng-dev \ 21 | libjpeg-dev \ 22 | libgl-dev \ 23 | python3-numpy \ 24 | python3-scipy \ 25 | python3-imageio \ 26 | python3-pybind11 \ 27 | libopenblas-dev \ 28 | libeigen3-dev \ 29 | libatlas-base-dev \ 30 | doxygen \ 31 | ninja-build \ 32 | ca-certificates && \ 33 | rm -rf /var/lib/apt/lists/* && \ 34 | ln -s /usr/bin/python3 /usr/bin/python 35 | 36 | RUN pip3 install --upgrade cmake pip jupyter 37 | WORKDIR /workspace 38 | 39 | RUN git clone --branch v10.0.0 https://github.com/halide/Halide.git && \ 40 | cd Halide && \ 41 | mkdir halide-build && \ 42 | cd halide-build && \ 43 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/usr/lib/llvm-10/cmake .. && \ 44 | make -j$(nproc) install 45 | 46 | RUN git clone --branch tengine-lite https://github.com/OAID/Tengine.git && \ 47 | cd Tengine && \ 48 | mkdir build && \ 49 | cd build && \ 50 | cmake .. && \ 51 | make install -j$(nproc) 52 | 53 | ENV PYTHONPATH "${PYTHONPATH}:/workspace/Halide/halide-build/python_bindings/src" 54 | ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/usr/local/lib:/workspace/Tengine/build/install/lib" 55 | -------------------------------------------------------------------------------- /Dockerfile/Dockerfile.opencl: -------------------------------------------------------------------------------- 1 | FROM nvidia/opencl:devel-ubuntu18.04 2 | 3 | ENV TZ=Asia/Shanghai 4 | RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone 5 | 6 | RUN apt-get update && apt-get install -y --no-install-recommends \ 7 | g++ \ 8 | git \ 9 | wget \ 10 | cmake \ 11 | python3 \ 12 | python3-dev \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | clang-tools-10 \ 16 | lld-10 \ 17 | llvm-10-dev \ 18 | libclang-10-dev \ 19 | liblld-10-dev \ 20 | libpng-dev \ 21 | libjpeg-dev \ 22 | libgl-dev \ 23 | python3-numpy \ 24 | python3-scipy \ 25 | python3-imageio \ 26 | python3-pybind11 \ 27 | libopenblas-dev \ 28 | libeigen3-dev \ 29 | libatlas-base-dev \ 30 | doxygen \ 31 | ninja-build \ 32 | ca-certificates && \ 33 | rm -rf /var/lib/apt/lists/* && \ 34 | ln -s /usr/bin/python3 /usr/bin/python 35 | 36 | RUN pip3 install --upgrade cmake pip jupyter 37 | WORKDIR /workspace 38 | 39 | RUN git clone --branch v10.0.0 https://github.com/halide/Halide.git && \ 40 | cd Halide && \ 41 | mkdir halide-build && \ 42 | cd halide-build && \ 43 | cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_PREFIX_PATH=/usr/lib/llvm-10/cmake .. && \ 44 | make -j$(nproc) install 45 | 46 | RUN git clone --branch tengine-lite https://github.com/OAID/Tengine.git && \ 47 | cd Tengine && \ 48 | mkdir build && \ 49 | cd build && \ 50 | cmake .. 
&& \ 51 | make install -j$(nproc) 52 | 53 | ENV PYTHONPATH "${PYTHONPATH}:/workspace/Halide/halide-build/python_bindings/src" 54 | ENV LD_LIBRARY_PATH "${LD_LIBRARY_PATH}:/usr/local/lib:/workspace/Tengine/build/install/lib" 55 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | English | [简体中文](./README_CN.md) 7 | 8 | # AutoKernel 9 | 10 | ## Introduction 11 | Neural networks are now used in a wide variety of applications. Efficient execution of neural networks on various devices plays a critical role in these applications. Given the rapid evolution of deep learning algorithms, there are only a limited number of qualified programmers who can write hand-optimized low-level kernels for different hardware platforms, so using automatic optimization tools to generate high-performance implementations becomes a promising solution. 12 | 13 | AutoKernel began as a research project at OPEN AI LAB. The project is now open source. AutoKernel is an operator optimization tool for automatically generating high-performance low-level code for diverse hardware backends. It aims to accelerate the development of high-performance operators on various hardware, including specialized accelerators. 14 | 15 | ## AutoKernel Architecture 16 | 17 | ![AutoKernel arch](doc/architecture-en.png) 18 | 19 | AutoKernel consists of three modules: 20 | * Operator Generator: 21 | 22 | This module uses the open source project [Halide](https://github.com/halide/Halide). Halide is a domain-specific language (DSL), embedded in C++, designed to make it easier to write high-performance image processing code on modern machines. Halide separates the algorithm description from its schedule. The input of this module is the algorithm description of an operator, and the output is compiled, optimized assembly code or an object file for the corresponding back end. 23 | 24 | 25 | * AutoSearch 26 | 27 | AutoSearch is an automatic module that searches for optimized schedules for Halide operators, using multiple optimization algorithms (greedy search, reinforcement learning, machine learning, ...). It supports searching for optimized schedules on both CPU and GPU, and generates code for different target platforms (x86 or arm). This module is still under development. 28 | 29 | * AutoKernel Plugin: 30 | 31 | AutoKernel Plugin realizes one-click integration of the auto-generated optimized operator code into [Tengine](https://github.com/OAID/Tengine), without modifying the core code base of Tengine. 32 | This enables one-click deployment of the automatically generated operator implementations. 33 | 34 | ## Features 35 | 36 | - Automated 37 | - Efficient 38 | - User-friendly 39 | 40 | 41 | ## Docker 42 | We provide the following docker images with Halide and Tengine installed: 43 | - cpu: `openailab/autokernel` 44 | - cuda: `openailab/autokernel:cuda` 45 | - opencl: `openailab/autokernel:opencl` 46 | 47 | For the detailed Dockerfiles, see [Dockerfiles](https://github.com/OAID/AutoKernel/tree/main/Dockerfile) 48 | 49 | [NOTE]: 50 | if using the cuda image, you need to use `nvidia-docker` instead of `docker`; here's the [nvidia-docker install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian).
51 | ``` 52 | nvidia-docker pull openailab/autokernel:cuda 53 | nvidia-docker run -it openailab/autokernel:cuda /bin/bash 54 | ``` 55 | 56 | ## License 57 | 58 | - [Apache 2.0](LICENSE) 59 | 60 | 61 | ## Discussion 62 | - Github issues 63 | - QQ group: 829565581 64 | -------------------------------------------------------------------------------- /README_CN.md: -------------------------------------------------------------------------------- 1 | 5 | 6 | 简体中文 | [English](./README.md) 7 | # AutoKernel 8 | 9 | ## 简介 10 | 11 | 随着人工智能的普及,深度学习网络的不断涌现,为了让各硬件(CPU, GPU, NPU,...)能够支持深度学习应用,各硬件芯片需要软件库去支持高性能的深度学习张量运算。目前,这些高性能计算库主要由资深HPC工程师(高性能计算优化工程师)进行开发,为了加快开发进程,缩短深度学习应用落地周期,自动化算子优化是一个趋势。 12 | 13 | AutoKernel是由OPEN AI LAB提出的高性能算子自动优化工具,可以自动优化调度策略、生成底层优化代码,大幅减少各硬件芯片算子开发成本,提升算子优化效率,让工程师更快实现深度学习算法在各硬件芯片上的高性能部署。 14 | 15 | ## AutoKernel特色 16 | 17 | - 自动化 18 | - 高效率 19 | - 低门槛 20 | 21 | 22 | ## AutoKernel架构 23 | 24 | ![AutoKernel 架构](doc/architecture.png) 25 | 26 | AutoKernel分为三个模块: 27 | * 算子生成器: 28 | 29 | 该模块使用了开源项目[Halide](https://github.com/halide/Halide);Halide是业界广泛使用的自动代码生成项目,它首次提出将计算和调度分离。该模块的输入是和硬件无关的算子计算描述,输出是相应后端的优化汇编代码/目标文件; 30 | 31 | * 自动搜索模块AutoSearch: 32 | 33 | AutoSearch 可以通过最优化算法/搜索算法/机器学习/强化学习搜索出不同后端的最优算子的调度策略参数,支持x86-cpu, cuda-gpu, arm-cpu, arm-mali-gpu等后端的调度策略自动生成。AutoSearch 集成了学术界自动调优近年来的最新研究成果。(该模块在持续开发中); 34 | 35 | * 算子部署插件( AutoKernel Plugin): 36 | 37 | [Tengine](https://github.com/OAID/Tengine)是OPEN AILAB开源的深度学习推理框架,实现了AI算法在不同硬件的快速高效部署。该模块实现了将自动生成的优化算子代码以plugin的形式一键集成到[Tengine](https://github.com/OAID/Tengine)中,实现自动优化算子的一键部署; 38 | 39 | 40 | ## Docker 41 | 我们提供了以下三个docker镜像,镜像内安装了Halide和Tengine, 方便开发者直接使用: 42 | - cpu: `openailab/autokernel` 43 | - cuda: `openailab/autokernel:cuda` 44 | - opencl: `openailab/autokernel:opencl` 45 | 46 | 具体的Dockerfile见 [Dockerfiles目录](https://github.com/OAID/AutoKernel/tree/main/Dockerfile) 47 | 48 | [NOTE]: 49 | 使用cuda镜像需要用`nvidia-docker`, 安装指南见 [nvidia-docker install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian). 50 | ``` 51 | nvidia-docker pull openailab/autokernel:cuda 52 | nvidia-docker run -it openailab/autokernel:cuda /bin/bash 53 | ``` 54 | 55 | ## License 56 | 57 | - [Apache 2.0](LICENSE) 58 | 59 | ## 技术讨论 60 | - Github issues 61 | - QQ 群: 829565581 62 | -------------------------------------------------------------------------------- /auto_deploy/README.md: -------------------------------------------------------------------------------- 1 | # auto-deploy 2 | Auto-deploy is a light-weighted NN auto-deployment tools. It involves auto op-fusion and auto op codegen, generating neural network deployment codes for specific targets. 3 | 4 | ![](data/auto-deploy.png) 5 | Auto-deploy involves the following process: 6 | 1. read onnx model, parse into Graph IR 7 | 2. run graph optimization passes 8 | 3. dump graph to deployment codes 9 | 4. 
compile the generated code into an executable file 10 | 11 | 12 | 13 | ## Quick Start 14 | Environment requirements: 15 | - python3 with onnx installed 16 | - autokernel docker with halide installed 17 | 18 | ``` 19 | python3 mnist.py 20 | python3 op_generator.py 21 | 22 | cd c_source 23 | g++ *.cpp -o mnist 24 | ./mnist ../data/mnist.weights ../data/input_6.bin 25 | ``` 26 | - running `python3 mnist.py` generates the `main.cpp` file in the `c_source` directory 27 | - running `python3 op_generator.py` compiles `op_gen.cpp` and generates the following files: 28 | ```bash 29 | |-- generated.h 30 | |-- halide_conv.cpp 31 | |-- halide_conv.h 32 | |-- halide_matmul.cpp 33 | |-- halide_matmul.h 34 | |-- halide_maxpool.cpp 35 | |-- halide_maxpool.h 36 | |-- halide_relu.cpp 37 | |-- halide_relu.h 38 | ``` 39 | - finally, compile the source code and run it; this prints the output data, which is consistent with the result obtained in `graph_tutorial.ipynb`. With this output data, postprocessing yields `predicted number is 6`. 40 | ``` 41 | 2.797004 -12.441699 0.206829 -3.550967 0.014401 5.138205 17.518187 -16.953455 2.517180 -5.376605 42 | ``` 43 | ## Graph Tutorials 44 | see `graph_tutorial.ipynb` 45 | ## Pass Manager 46 | Passes perform transformations and optimizations on the graph; a graph may go through more than one pass. Intuitively, passes can be called one by one: 47 | ``` 48 | graph = pass1(graph) 49 | graph = pass2(graph) 50 | graph = pass3(graph) 51 | ``` 52 | The pass manager provides better pass management: it can reuse the same pattern to generate different passes. The pass manager 53 | - registers fusion patterns 54 | - adds pass functions by reusing patterns 55 | - performs automatic pass dependency analysis and generates a seq_pass_list 56 | - automatically runs all passes according to the generated seq_pass_list 57 | 58 | ## Generated main.cpp 59 | - malloc all used tensors 60 | ```cpp 61 | //data 62 | float* _0= (float*)malloc(sizeof(float)*784); //Input3 63 | float* _1= (float*)malloc(sizeof(float)*200); //Parameter5 64 | float* _2= (float*)malloc(sizeof(float)*8); //Parameter6 65 | float* _3= (float*)malloc(sizeof(float)*6272); //Plus30_Output_0 66 | float* _4= (float*)malloc(sizeof(float)*6272); //ReLU32_Output_0 67 | float* _5= (float*)malloc(sizeof(float)*1568); //Pooling66_Output_0 68 | float* _6= (float*)malloc(sizeof(float)*3200); //Parameter87 69 | float* _7= (float*)malloc(sizeof(float)*16); //Parameter88 70 | float* _8= (float*)malloc(sizeof(float)*3136); //Plus112_Output_0 71 | float* _9= (float*)malloc(sizeof(float)*3136); //ReLU114_Output_0 72 | float* _10= (float*)malloc(sizeof(float)*256); //Pooling160_Output_0 73 | float* _11= (float*)malloc(sizeof(float)*2560); //Parameter193_reshape1 74 | float* _12= (float*)malloc(sizeof(float)*10); //Parameter194 75 | float* _13= (float*)malloc(sizeof(float)*10); //Plus214_Output_0 76 | ``` 77 | - load weights 78 | ```cpp 79 | //load_weight 80 | FILE* fp = fopen(weight_name, "rb"); 81 | if (!fp) printf("data can not be open"); 82 | fread(_1, sizeof(float), 200, fp); 83 | fread(_2, sizeof(float), 8, fp); 84 | fread(_6, sizeof(float), 3200, fp); 85 | fread(_7, sizeof(float), 16, fp); 86 | fread(_11, sizeof(float), 2560, fp); 87 | fread(_12, sizeof(float), 10, fp); 88 | fclose(fp); 89 | ``` 90 | - inference code (a sketch of one generated wrapper follows this section) 91 | ```cpp 92 | //code_inference 93 | Conv_Add_fused(_3,_0,_1,_2,&param_0); 94 | Relu(_4,_3,&param_1); 95 | MaxPool(_5,_4,&param_2); 96 | Conv_Add_fused(_8,_5,_6,_7,&param_3); 97 | Relu(_9,_8,&param_4); 98 | MaxPool(_10,_9,&param_5); 99 | MatMul_Add_fused(_13,_10,_11,_12,&param_6); 100 | ``` 101 | 
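The fused wrappers invoked above are produced by the op codegen step together with the AOT-compiled Halide kernels. As a minimal sketch of their shape (assuming the `buf_init` helper and `Param` struct from `data/main_head`, and a `halide_conv` signature matching the generator's declaration order; the actual generated code may differ), one wrapper might look like:

```cpp
// Hypothetical sketch, not the generated source: wrap raw float* tensors in
// halide_buffer_t descriptors and forward to the AOT-compiled fused kernel.
void Conv_Add_fused(float* out, float* in, float* w, float* b, Param* p)
{
    halide_buffer_t inp, wgt, bias, outp;
    buf_init(&inp,  in,  4, p->inp0_dims);  // NCHW activation
    buf_init(&wgt,  w,   4, p->inp1_dims);  // convolution weights
    buf_init(&bias, b,   1, p->inp2_dims);  // bias, folded into the fused conv
    buf_init(&outp, out, 4, p->out0_dims);
    halide_conv(&inp, &wgt, &bias, p->stride, p->pad, &outp);
}
```

Because the wrapper only builds buffer descriptors around memory the caller already owns, the generated `main.cpp` can chain these calls directly on the pre-allocated tensors shown above.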
102 | ## Auto-deploy vs. inference framework 103 | | | inference framework | auto-deploy | 104 | | -------- | ---------- | ---------- | 105 | | op fusion implements | manual implementation of op fusion; hard to reuse fusion patterns 
| good for reuse op fusion patterns| 106 | | op fusion space | limited numbers of op fusions |can extend to automatic op-fusion with model, bigger search space| 107 | |op implementations|manual fused_op implementations for multi backends| auto codegen with autokernel for multi backends| 108 | |deployment codes| light-weighted, only generated op needed in assigned neural networks| provide op library with all common op implementations| 109 | 110 | ## Release Note 111 | ### 2021/09 auto-deploy v1.0 112 | - graph core codes: tensor, node, graph ir 113 | - pass manager: op_fusion, remove reshape 114 | - nn demo: mnist [onnx models/mnist](https://github.com/onnx/models/blob/master/vision/classification/mnist/model/mnist-8.onnx) 115 | - op: conv, add, relu, matmul, reshape 116 | - deployment main.cpp codegen 117 | 118 | ### Future work 119 | - auto tensor memory scheduling 120 | - tmfile supports 121 | - more nn demo 122 | - ... -------------------------------------------------------------------------------- /auto_deploy/data/0.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/0.jpg -------------------------------------------------------------------------------- /auto_deploy/data/3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/3.jpg -------------------------------------------------------------------------------- /auto_deploy/data/6.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/6.jpg -------------------------------------------------------------------------------- /auto_deploy/data/auto-deploy.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/auto-deploy.png -------------------------------------------------------------------------------- /auto_deploy/data/input_6.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/input_6.bin -------------------------------------------------------------------------------- /auto_deploy/data/main_head: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include "halide_conv.h" 4 | #include "halide_relu.h" 5 | #include "halide_maxpool.h" 6 | #include "halide_matmul.h" 7 | #include "HalideRuntime.h" 8 | 9 | typedef struct Param { 10 | int* inp0_dims; 11 | int* inp1_dims; 12 | int* inp2_dims; 13 | int* inp3_dims; 14 | int* out0_dims; 15 | 16 | int ksize; 17 | int stride; 18 | int pad; 19 | }Param; 20 | 21 | void read_float_data(float* data, int size, char* fname) 22 | { 23 | FILE* fp = fopen(fname, "rb"); 24 | if (!fp) printf("data can not be open"); 25 | fread(data, sizeof(float), size, fp); 26 | fclose(fp); 27 | } 28 | void p(float* data,int size) 29 | { 30 | for(int i=0;idimensions=n; 39 | int step[4]={1,1,1,1}; 40 | for(int i=1;idim=(halide_dimension_t*)malloc(sizeof(halide_dimension_t)*n); 45 | 46 | for(int i=0;idim[i].min=0; 49 | buf->dim[i].extent=shape[i]; 50 | buf->dim[i].stride=step[i]; 51 
| } 52 | 53 | //type 54 | buf->type.bits=32; 55 | buf->type.lanes=1; 56 | buf->type.code=halide_type_float; 57 | 58 | 59 | buf->host=(uint8_t*)data; 60 | buf->flags=0; 61 | buf->device=0; 62 | } 63 | -------------------------------------------------------------------------------- /auto_deploy/data/mnist-8.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/mnist-8.onnx -------------------------------------------------------------------------------- /auto_deploy/data/mnist.weights: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/auto_deploy/data/mnist.weights -------------------------------------------------------------------------------- /auto_deploy/data/reg_str: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | 6 | class Register: 7 | def __init__(self, registry_name): 8 | self._dict = {} 9 | self._name = registry_name 10 | 11 | def __setitem__(self, key, value): 12 | if not callable(value): 13 | raise Exception("Value of a Registry must be a callable") 14 | if key is None: 15 | key = value.__name__ 16 | if key in self._dict: 17 | logging.warning("Key %s already in registry %s." % (key, self._name)) 18 | self._dict[key] = value 19 | 20 | def register(self, key_name): 21 | """Decorator to register a function or class.""" 22 | def add(key, value): 23 | self[key] = value 24 | return value 25 | # @reg.register('alias') 26 | return lambda func: add(key_name, func) 27 | 28 | def __getitem__(self, key): 29 | return self._dict[key] 30 | 31 | def __contains__(self, key): 32 | return key in self._dict 33 | 34 | def keys(self): 35 | """key""" 36 | return self._dict.keys() 37 | 38 | op_reg = Register("op_register") 39 | -------------------------------------------------------------------------------- /auto_deploy/generated_op.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import torch 3 | from torch import nn 4 | import numpy as np 5 | 6 | class Register: 7 | def __init__(self, registry_name): 8 | self._dict = {} 9 | self._name = registry_name 10 | 11 | def __setitem__(self, key, value): 12 | if not callable(value): 13 | raise Exception("Value of a Registry must be a callable") 14 | if key is None: 15 | key = value.__name__ 16 | if key in self._dict: 17 | logging.warning("Key %s already in registry %s." 
% (key, self._name)) 18 | self._dict[key] = value 19 | 20 | def register(self, key_name): 21 | """Decorator to register a function or class.""" 22 | def add(key, value): 23 | self[key] = value 24 | return value 25 | # @reg.register('alias') 26 | return lambda func: add(key_name, func) 27 | 28 | def __getitem__(self, key): 29 | return self._dict[key] 30 | 31 | def __contains__(self, key): 32 | return key in self._dict 33 | 34 | def keys(self): 35 | """key""" 36 | return self._dict.keys() 37 | 38 | op_reg = Register("op_register") 39 | @op_reg.register("Conv_Add_fused") 40 | def run_Conv_Add_fused(node): 41 | inp_0 = node.input[0] 42 | inp_1 = node.input[1] 43 | inp_2 = node.input[2] 44 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 45 | inp_1_tensor = torch.tensor(np.array(inp_1.value,dtype=np.float32).reshape(inp_1.dims)) 46 | inp_2_tensor = torch.tensor(np.array(inp_2.value,dtype=np.float32).reshape(inp_2.dims)) 47 | param_0 = node.attr 48 | conv_0 = nn.Conv2d(param_0.c_in, param_0.c_out, param_0.ksize, param_0.stride, param_0.pad,1,1,True) 49 | conv_0.weight.data = inp_1_tensor 50 | conv_0.bias.data = inp_2_tensor 51 | tmp_0 = conv_0(inp_0_tensor) 52 | out = tmp_0.detach().numpy() 53 | out_0 = node.output[0] 54 | out_0.value=out 55 | if out_0.reshaped==0: 56 | out_0.dims=out.shape 57 | @op_reg.register("Relu") 58 | def run_Relu(node): 59 | inp_0 = node.input[0] 60 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 61 | relu_0 = nn.ReLU() 62 | tmp_0 = relu_0(inp_0_tensor) 63 | out = np.array(tmp_0) 64 | out_0 = node.output[0] 65 | out_0.value=out 66 | if out_0.reshaped==0: 67 | out_0.dims=out.shape 68 | @op_reg.register("MaxPool") 69 | def run_MaxPool(node): 70 | inp_0 = node.input[0] 71 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 72 | param_0 = node.attr 73 | maxpool_0 = nn.MaxPool2d(param_0.ksize, param_0.stride, param_0.pad) 74 | tmp_0 = maxpool_0(inp_0_tensor) 75 | out = np.array(tmp_0) 76 | out_0 = node.output[0] 77 | out_0.value=out 78 | if out_0.reshaped==0: 79 | out_0.dims=out.shape 80 | @op_reg.register("MatMul_Add_fused") 81 | def run_MatMul_Add_fused(node): 82 | inp_0 = node.input[0] 83 | inp_1 = node.input[1] 84 | inp_2 = node.input[2] 85 | inp_0_tensor = torch.tensor(np.array(inp_0.value,dtype=np.float32).reshape(inp_0.dims)) 86 | inp_1_tensor = torch.tensor(np.array(inp_1.value,dtype=np.float32).reshape(inp_1.dims)) 87 | inp_2_tensor = torch.tensor(np.array(inp_2.value,dtype=np.float32).reshape(inp_2.dims)) 88 | tmp_0 = torch.matmul(inp_0_tensor,inp_1_tensor) 89 | tmp_1 = torch.add(tmp_0,inp_2_tensor) 90 | out = np.array(tmp_1) 91 | out_0 = node.output[0] 92 | out_0.value=out 93 | if out_0.reshaped==0: 94 | out_0.dims=out.shape 95 | -------------------------------------------------------------------------------- /auto_deploy/mnist.py: -------------------------------------------------------------------------------- 1 | from graph import Graph 2 | from pass_manager import pass_m 3 | import cv2 4 | import numpy as np 5 | import array 6 | 7 | inp_dim = [1, 1, 28, 28] 8 | 9 | def gen_main_cpp(): 10 | graph = Graph('mnist', './data/mnist-8.onnx', inp_dim) 11 | graph = pass_m.run_all_pass(graph) 12 | graph.infershape() 13 | graph.gen_main_cpp('c_source/main.cpp') 14 | 15 | gen_main_cpp() -------------------------------------------------------------------------------- /auto_deploy/op_build.sh: 
-------------------------------------------------------------------------------- 1 | 2 | HALIDE_SOURCE_DIR=/workspace/Halide/ 3 | HALIDE_BUILD_DIR=/workspace/Halide/halide-build 4 | 5 | g++ op_gen.cpp \ 6 | ${HALIDE_SOURCE_DIR}/tools/GenGen.cpp \ 7 | -o op.gen \ 8 | -I ${HALIDE_BUILD_DIR}/include \ 9 | -Wl,-rpath,${HALIDE_BUILD_DIR}/src \ 10 | ${HALIDE_BUILD_DIR}/src/libHalide.so \ 11 | -pthread -std=c++14 -ldl 12 | 13 | OUT_DIR=c_source 14 | 15 | gen() 16 | { 17 | ./op.gen \ 18 | -g $1 \ 19 | -o ${OUT_DIR} \ 20 | -e c_header,c_source \ 21 | target=x86-64-linux-no_runtime-no_bounds_query-no_asserts 22 | } 23 | 24 | gen halide_conv 25 | gen halide_matmul 26 | gen halide_relu 27 | gen halide_maxpool 28 | -------------------------------------------------------------------------------- /auto_deploy/op_codegen.py: -------------------------------------------------------------------------------- 1 | 2 | def get_conv_body(idx,start_idx,tmp,flag): 3 | input_0 = 'inp_0_tensor' 4 | if start_idx!=0: 5 | input_0='tmp_{}'.format(tmp-1) 6 | param_name = 'param_{}'.format(idx) 7 | if flag==0: 8 | line = ''' {} = node.attr 9 | conv_{} = nn.Conv2d({}.c_in, {}.c_out, {}.ksize, {}.stride, {}.pad,1,1,False) 10 | conv_{}.weight.data = inp_{}_tensor 11 | tmp_{} = conv_{}({}) 12 | '''.format(param_name,idx,param_name,param_name,param_name,param_name,param_name,idx,start_idx+1,tmp,idx,input_0) 13 | if flag==1: 14 | line = ''' {} = node.attr 15 | conv_{} = nn.Conv2d({}.c_in, {}.c_out, {}.ksize, {}.stride, {}.pad,1,1,True) 16 | conv_{}.weight.data = inp_{}_tensor 17 | conv_{}.bias.data = inp_{}_tensor 18 | tmp_{} = conv_{}({}) 19 | '''.format(param_name,idx,param_name,param_name,param_name,param_name,param_name, 20 | idx,start_idx+1, 21 | idx,start_idx+2, 22 | tmp,idx,input_0) 23 | return line 24 | def get_maxpool_body(idx,start_idx,tmp): 25 | input_0 = 'inp_0_tensor' 26 | if start_idx!=0: 27 | input_0='tmp_{}'.format(tmp-1) 28 | param_name = 'param_{}'.format(idx) 29 | line = ''' {} = node.attr 30 | maxpool_{} = nn.MaxPool2d({}.ksize, {}.stride, {}.pad) 31 | tmp_{} = maxpool_{}({}) 32 | '''.format(param_name,idx,param_name,param_name,param_name,tmp,idx,input_0) 33 | return line 34 | def get_relu_body(idx,start_idx,tmp): 35 | input_0 = 'inp_0_tensor' 36 | if start_idx !=0: 37 | input_0 = 'tmp_{}'.format(tmp-1) 38 | line = ''' relu_{} = nn.ReLU() 39 | tmp_{} = relu_{}({}) 40 | '''.format(idx,tmp,idx,input_0) 41 | return line 42 | def get_matmul_body(idx,start_idx,tmp): 43 | input_0 = 'inp_0_tensor' 44 | input_1 = 'inp_1_tensor' 45 | if start_idx!=0: 46 | input_0 = 'tmp_{}'.format(tmp-1) 47 | input_1 = 'inp_{}_tensor'.format(start_idx) 48 | line = ''' tmp_{} = torch.matmul({},{})\n'''.format(tmp,input_0,input_1) 49 | return line 50 | def get_add_body(idx,start_idx,tmp): 51 | input_0 = 'inp_0_tensor' 52 | input_1 = 'inp_1_tensor' 53 | if start_idx!=0: 54 | input_0 = 'tmp_{}'.format(tmp-1) 55 | input_1 = 'inp_{}_tensor'.format(start_idx) 56 | line = ''' tmp_{} = torch.add({},{})\n'''.format(tmp,input_0,input_1) 57 | return line 58 | 59 | def codegen_node(node): 60 | 61 | # func parameter node 62 | op_type = node.op_type 63 | op_list = op_type.split("_") 64 | op_list = [i for i in op_list if i!='fused'] 65 | num_op = len(op_list) 66 | 67 | num_input = len(node.input) 68 | num_output = len(node.output) 69 | 70 | ############### head 71 | head = "@op_reg.register(\"{}\")\ndef run_{}(node):\n".format(op_type,op_type) 72 | 73 | ############### input 74 | input_declare = "" 75 | for i in range(num_input): 76 | 
input_declare+=" inp_{} = node.input[{}]\n".format(i,i) 77 | input_tensor = "" 78 | for i in range(num_input): 79 | input_tensor += " inp_{}_tensor = torch.tensor(np.array(inp_{}.value,dtype=np.float32).reshape(inp_{}.dims))\n".format(i,i,i) 80 | input = input_declare + input_tensor 81 | 82 | ############## body 83 | start_idx = 0 84 | body_idx = 0 85 | body ="" 86 | for body_idx in range(num_op): 87 | op = op_list[body_idx] 88 | if op=="Conv": 89 | flag = 0 90 | if op_list[body_idx+1]=='Add': 91 | flag = 1 92 | body += get_conv_body(body_idx,start_idx,body_idx,flag) 93 | start_idx+=2 94 | if op=="MaxPool": 95 | body += get_maxpool_body(body_idx,start_idx,body_idx) 96 | start_idx+=1 97 | if op=="Relu": 98 | body += get_relu_body(body_idx,start_idx,body_idx) 99 | if op=="MatMul": 100 | body += get_matmul_body(body_idx,start_idx,body_idx) 101 | start_idx+=2 102 | if op=="Add": 103 | if op_list[body_idx-1]=='Conv': 104 | continue 105 | body += get_add_body(body_idx,start_idx,body_idx) 106 | start_idx+=2 107 | body_idx+=1 108 | ############## output 109 | output = "" 110 | assert(num_output==1) 111 | i=0 112 | line = " out = np.array(tmp_{})\n".format(body_idx-1) 113 | if 'Conv' in op_type: 114 | line = " out = tmp_{}.detach().numpy()\n".format(body_idx-1) 115 | line1 = " out_{} = node.output[{}]\n".format(i,i) 116 | line2 = " out_{}.value=out\n".format(i) 117 | line3 = " if out_{}.reshaped==0:\n".format(i) 118 | line4 = " out_{}.dims=out.shape\n".format(i) 119 | output=(line+line1+line2+line3+line4) 120 | 121 | op_string = head + input + body + output 122 | # print(op_string) 123 | return op_string 124 | 125 | 126 | cg = codegen_node -------------------------------------------------------------------------------- /auto_deploy/op_gen.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | 4 | using namespace Halide; 5 | using Halide::BoundaryConditions::constant_exterior; 6 | using Halide::Expr; 7 | 8 | Var x("x"), y("y"), c("c"), n("n"); 9 | 10 | //conv with bias 11 | class ConvGenerator : public Generator { 12 | public: 13 | Input> input{"input", 4}; //[w,h,c,n] 14 | Input> weight{"weight", 4}; //[kw,kh,cin,cout] 15 | Input> bias{"bias", 1}; //[cout] 16 | Input stride{"stride"}; 17 | Input pad{"pad"}; 18 | Output> output{"output", 4}; 19 | 20 | void generate() { 21 | RDom r(0, weight.dim(0).extent(), 0, weight.dim(1).extent(), 0, weight.dim(2).extent()); 22 | output(x, y, c, n) = bias(c); 23 | 24 | Func inp_bounded =constant_exterior(input, //source 25 | 0, //value 26 | {{0, input.dim(0).extent()}, //boundary-dim0 w 27 | {0, input.dim(1).extent()}, //boundary-dim1 h 28 | {Expr(), Expr()}, //boundary-dim2 c 29 | {Expr(), Expr()}}); //boundary-dim3 n 30 | Func inp_padded("inp_padded"); 31 | inp_padded(x, y, c, n) = inp_bounded(x - pad, y - pad, c, n); 32 | 33 | output(x, y, c, n) += weight(r[0], r[1], r[2], c) * 34 | inp_padded(x * stride + r[0], y * stride + r[1],r[2],n); 35 | } 36 | }; 37 | 38 | //matmul 39 | class MatMulGenerator : public Generator { 40 | public: 41 | Input> input{"input", 2}; 42 | Input> weight{"weight", 2}; 43 | Input> bias{"bias", 1}; 44 | Output> output{"output", 2}; 45 | 46 | void generate() { 47 | RDom k(0, input.dim(0).extent()); 48 | output(x, y) = bias(x); 49 | output(x, y) += input(k, y) * weight(x, k); 50 | } 51 | }; 52 | 53 | // maxpool 54 | class MaxPoolGenerator : public Generator { 55 | public: 56 | Input> input_a{"input_a", 4}; 57 | Input ksize{"ksize"}; 58 | Input 
stride{"stride"}; 59 | Output> output{"output", 4}; 60 | 61 | void generate() { 62 | RDom r(0, ksize, 0, ksize); 63 | int pad = 0; 64 | output(x, y, c, n) = maximum(input_a(stride*x+ r.x -pad , stride*y+r.y - pad, c, n)); 65 | } 66 | }; 67 | 68 | // relu 69 | class ReluGenerator : public Generator { 70 | public: 71 | 72 | Input> input_a{"input_a", 4}; 73 | Output> output{"output", 4}; 74 | 75 | void generate() { 76 | output(x, y, c, n) = max(0.0f,input_a(x,y,c,n)); 77 | } 78 | }; 79 | HALIDE_REGISTER_GENERATOR(MatMulGenerator, halide_matmul) 80 | HALIDE_REGISTER_GENERATOR(MaxPoolGenerator, halide_maxpool) 81 | HALIDE_REGISTER_GENERATOR(ReluGenerator, halide_relu) 82 | HALIDE_REGISTER_GENERATOR(ConvGenerator, halide_conv) 83 | -------------------------------------------------------------------------------- /auto_deploy/op_generator.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def gen_code(op_name, head=0): 5 | 6 | f = open("c_source/%s.halide_generated.cpp" % op_name, "r") 7 | x = f.readlines() 8 | if head == 1: 9 | with open("op/generated.h", "w") as fp: 10 | for line in x[:2363]: 11 | fp.write(line) 12 | 13 | with open("c_source/%s.cpp" % op_name, "w") as fp: 14 | headline = "#include \"generated.h\"\n" 15 | fp.write(headline) 16 | for line in x[2364:]: 17 | fp.write(line) 18 | f.close() 19 | 20 | 21 | os.system("bash op_build.sh") 22 | 23 | gen_code("halide_relu", 1) 24 | gen_code("halide_conv") 25 | gen_code("halide_maxpool") 26 | gen_code("halide_matmul") 27 | 28 | os.system("rm c_source/*.halide_generated.cpp") 29 | -------------------------------------------------------------------------------- /autokernel_plugin/.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | .vscode/ 3 | halide_* 4 | *_gen 5 | 6 | -------------------------------------------------------------------------------- /autokernel_plugin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(AutoKernel) 2 | 3 | set(TENGINE_ROOT /workspace/Tengine/) 4 | 5 | if(NOT DEFINED TENGINE_ROOT) 6 | message(FATAL_ERROR "please set TENGINE_ROOT for tengine directory") 7 | endif() 8 | 9 | set(TENGINE_DIR /workspace/Tengine/build/install) 10 | set(TENGINE_LIBRARY ${TENGINE_DIR}/lib/libtengine-lite.so) 11 | 12 | cmake_minimum_required(VERSION 3.0) 13 | 14 | add_definitions(-Wall) 15 | add_definitions(-fPIC) 16 | #add_definitions(-g) 17 | add_definitions(-O3) 18 | add_definitions(-funroll-loops) 19 | 20 | include_directories(include/) 21 | 22 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-overloaded-virtual") 23 | set(CMAKE_CXX_STANDARD 11) 24 | 25 | # sub directories 26 | add_subdirectory(src) 27 | add_subdirectory(tests) 28 | -------------------------------------------------------------------------------- /autokernel_plugin/common/GenGen.cpp: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | 3 | int main(int argc, char **argv) { 4 | return Halide::Internal::generate_filter_main(argc, argv, std::cerr); 5 | } -------------------------------------------------------------------------------- /autokernel_plugin/images/cat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/autokernel_plugin/images/cat.jpg -------------------------------------------------------------------------------- 
/autokernel_plugin/models/squeezenet.tmfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/autokernel_plugin/models/squeezenet.tmfile -------------------------------------------------------------------------------- /autokernel_plugin/scripts/clean.sh: -------------------------------------------------------------------------------- 1 | for dir in `ls src` 2 | do 3 | if [ -d src/$dir ] 4 | then 5 | echo src/$dir 6 | cd src/$dir 7 | rm *gen 8 | rm halide* 9 | cd ../../ 10 | fi 11 | done 12 | -------------------------------------------------------------------------------- /autokernel_plugin/scripts/generate.sh: -------------------------------------------------------------------------------- 1 | export HALIDE_DIR=/workspace/Halide/halide-build 2 | export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${HALIDE_DIR}/lib 3 | for dir in `ls src` 4 | do 5 | if [ -d src/$dir ] 6 | then 7 | echo src/$dir 8 | cd src/$dir 9 | chmod +x build.sh 10 | ./build.sh 11 | cd ../../ 12 | fi 13 | done 14 | -------------------------------------------------------------------------------- /autokernel_plugin/scripts/register_op.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #./register_op.sh op_name op_type 3 | op_name="" 4 | op_define_name="" 5 | op_src_file="" 6 | op_header_file="" 7 | op_func_name="" 8 | 9 | if [ ! -n "$1" ];then 10 | echo "please input op_name" 11 | read op_name 12 | echo "op_name:"$op_name 13 | echo "please input op_type, [eg.:OP_CONV, OP_POOL, ref @ tengine_op.h]" 14 | read op_define_name 15 | echo "op_type:"$op_define_name 16 | op_dir=src/$op_name 17 | op_src_file=$op_dir/$op_name.cpp 18 | op_header_file=$op_dir/$op_name.h 19 | op_define_name=${op_define_name^^} 20 | op_func_name=halide_${op_name} 21 | else 22 | op_name=$1 23 | op_dir=src/$op_name 24 | op_define_name=${2^^} 25 | op_src_file=$op_dir/$op_name.cpp 26 | op_header_file=$op_dir/$op_name.h 27 | op_func_name=halide_${op_name} 28 | fi 29 | 30 | 31 | echo "op name is $op_name" 32 | if [ !
-d $op_dir ];then 33 | mkdir $op_dir 34 | else 35 | rm -rf $op_dir 36 | mkdir $op_dir 37 | fi 38 | 39 | cp template/template.cpp $op_src_file 40 | cp template/template.h $op_header_file 41 | # cp generator/$op_name/$op_func_name.h $op_dir 42 | # cp generator/$op_name/$op_func_name.s $op_dir 43 | 44 | sed -i s/'template'/$op_name/g $op_src_file 45 | 46 | sed -i s/'AutoKernel_Func'/$op_func_name/g $op_header_file 47 | sed -i s/'AutoKernel_Func'/$op_func_name/g $op_src_file 48 | 49 | sed -i s/'OP_CONV'/$op_define_name/g $op_src_file 50 | 51 | sed -i s/'RegisterAutoKernelOP'/'RegisterAutoKernel'${op_name^}/g $op_header_file 52 | sed -i s/'RegisterAutoKernelOP'/'RegisterAutoKernel'${op_name^}/g $op_src_file 53 | 54 | # plugin_init.cpp 55 | if [ `grep -c 'RegisterAutoKernel'${op_name^} src/plugin_init.cpp` -eq '0' ]; then 56 | line=`grep -n "autokernel_plugin_init" src/plugin_init.cpp | cut -d ":" -f 1` 57 | sed -i '/register halide operator/a\ RegisterAutoKernel'${op_name^}'();' src/plugin_init.cpp 58 | sed -i '1a\#include "'${op_name}'/'${op_name}'.h"' src/plugin_init.cpp 59 | else 60 | echo "found" 61 | fi 62 | 63 | 64 | # op_name_gen.cpp 65 | op_gen_file=$op_dir/${op_name}_gen.cc 66 | cp template/generator.cc $op_gen_file 67 | 68 | sed -i s/'Halide_Func_Name'/$op_func_name/g $op_gen_file 69 | 70 | # build.sh 71 | cp template/build.sh $op_dir 72 | sed -i s/'OP_NAME'/$op_name/g $op_dir/build.sh 73 | chmod +x $op_dir/build.sh 74 | -------------------------------------------------------------------------------- /autokernel_plugin/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # include directory list 3 | include_directories(${TENGINE_ROOT}/build/source) 4 | include_directories(${TENGINE_ROOT}/source) 5 | 6 | ENABLE_LANGUAGE(ASM) 7 | 8 | set(DRIVER_TARGET autokernel) 9 | FILE(GLOB_RECURSE DRIVER_SRCS "*.cpp" "*.c" "*.s") 10 | ADD_LIBRARY(${DRIVER_TARGET} SHARED ${DRIVER_SRCS}) 11 | target_link_libraries(${DRIVER_TARGET} ${TENGINE_LIBRARY}) 12 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/build.sh: -------------------------------------------------------------------------------- 1 | g++ depthwise_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o depthwise_gen 6 | 7 | ./depthwise_gen -g halide_depthwise -e c_header,assembly -o . 
target=host-no_runtime-no_asserts-no_bounds_query 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/depthwise.cpp: -------------------------------------------------------------------------------- 1 | #include "depthwise.h" 2 | 3 | // add helper data struct and functions here 4 | 5 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 6 | { 7 | return 0; 8 | } 9 | 10 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 11 | { 12 | bool info_autokernel = false; 13 | const char* debug_env = std::getenv("DEBUG_INFO"); 14 | if((debug_env) && (debug_env[0] == '1')) 15 | { 16 | info_autokernel = true; 17 | } 18 | struct node* ir_node = exec_node->ir_node; 19 | struct graph* ir_graph = ir_node->graph; 20 | struct tensor* input_tensor; 21 | struct tensor* weight_tensor; 22 | struct tensor* output_tensor; 23 | struct tensor* bias_tensor = NULL; 24 | int num_thread = exec_graph->num_thread; 25 | int cpu_affinity = exec_graph->cpu_affinity; 26 | 27 | // set the input data and shape again, in case of reshape or dynamic shape 28 | input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 29 | weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 30 | output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 31 | if (ir_node->input_num > 2) 32 | bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); 33 | 34 | struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; 35 | 36 | float* input_buf = (float*)(input_tensor->data); 37 | float* weight_buf = (float*)(weight_tensor->data); 38 | float* output_buf = (float*)(output_tensor->data); 39 | float* bias = NULL; 40 | if (ir_node->input_num > 2) 41 | bias = (float*)(bias_tensor->data); 42 | 43 | if (exec_graph->mode == TENGINE_MODE_FP32) 44 | { 45 | int stride = conv_param->stride_h; 46 | int pad_width = conv_param->pad_w0; 47 | int pad_height = conv_param->pad_h0; 48 | int act = conv_param->activation; 49 | int group = conv_param->group; 50 | 51 | Halide::Runtime::Buffer input(input_buf, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 52 | Halide::Runtime::Buffer filter(weight_buf, weight_tensor->dims[3], weight_tensor->dims[2], weight_tensor->dims[1], weight_tensor->dims[0]); 53 | Halide::Runtime::Buffer output(output_buf, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 54 | Halide::Runtime::Buffer bias1(bias, output_tensor->dims[1]); 55 | 56 | if(info_autokernel)printf("[INFO]: runing AutoKernel im2col_conv ...\n"); 57 | 58 | halide_depthwise(input, filter, bias1, stride, pad_width, pad_height, act, output); 59 | } 60 | else 61 | { 62 | printf("Tengine work node not support %d\n", exec_graph->mode); 63 | return -1; 64 | } 65 | return 0; 66 | } 67 | 68 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 69 | { 70 | return 0; 71 | } 72 | 73 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 74 | { 75 | /* 76 | release the helper memory you 77 | */ 78 | return 0; 79 | } 80 | 81 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 82 | { 83 | /* 84 | init the private info data for your op: 85 | void ops_priv; 86 | int shared_mem_size; 87 | int 
shared_pack4_mem_size; 88 | */ 89 | return 0; 90 | } 91 | 92 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 93 | { 94 | /* 95 | release the private info data for your op: 96 | void ops_priv; 97 | int shared_mem_size; 98 | int shared_pack4_mem_size; 99 | */ 100 | return 0; 101 | } 102 | 103 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 104 | { 105 | 106 | struct conv_param* param = ( struct conv_param* )exec_node->op.param_mem; 107 | struct node* ir_node = exec_node; 108 | struct graph* ir_graph = ir_node->graph; 109 | 110 | struct tensor* input_tensor; 111 | struct tensor* output_tensor; 112 | 113 | int group = param->group; 114 | int kernel_h = param->kernel_h; 115 | int kernel_w = param->kernel_w; 116 | int stride_h = param->stride_h; 117 | int stride_w = param->stride_w; 118 | int dilation_h = param->dilation_h; 119 | int dilation_w = param->dilation_w; 120 | int pad_h0 = param->pad_h0; 121 | int pad_w0 = param->pad_h0; 122 | int pad_h1 = param->pad_h1; 123 | int pad_w1 = param->pad_w1; 124 | 125 | input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 126 | output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 127 | 128 | int in_c = input_tensor->dims[1] / group; 129 | int out_c = output_tensor->dims[1] / group; 130 | 131 | if (input_tensor->data_type != TENGINE_DT_FP32) 132 | return 0; 133 | if (kernel_h != kernel_w || input_tensor->dims[0] > 1) 134 | return 0; 135 | 136 | if (param->group > 1 && in_c == 1 && out_c == 1 && pad_h0 == pad_h1 && pad_w0 == pad_w1 137 | && dilation_h == 1 && dilation_w == 1 && kernel_h == 3 && kernel_w == 3 138 | && ((stride_h == 1 && stride_w == 1) || (stride_w == 2 && stride_h == 2))) 139 | { 140 | return OPS_SCORE_STATIC; 141 | } 142 | else 143 | { 144 | return 0; 145 | } 146 | } 147 | 148 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 149 | .run = run, 150 | .reshape = reshape, 151 | .postrun = postrun, 152 | .init_node = init_node, 153 | .release_node = release_node, 154 | .score = score}; 155 | 156 | int RegisterAutoKernelDepthwise() 157 | { 158 | return register_builtin_node_ops(OP_CONV, &autokernel_node_ops); 159 | } 160 | 161 | //static int unreg_autokernel_ops(void* arg) 162 | //{ 163 | // unregister_builtin_node_ops(OP_DEPTHWISE, &autokernel_node_ops); 164 | // return 0; 165 | //} 166 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/depthwise.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_depthwise.h" 33 | 34 | int 
RegisterAutoKernelDepthwise(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/depthwise/depthwise_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_depthwise:public Halide::Generator{ 11 | public: 12 | Input> input{"input", 4}; 13 | Input> kernel{"kernel", 4}; 14 | Input> bias{"bias", 1}; 15 | 16 | Input stride{"stride"}; 17 | Input pad_width{"pad_width"}; 18 | Input pad_height{"pad_height"}; 19 | Input act{"act"}; 20 | 21 | Output> output{"output", 4}; 22 | 23 | void generate() { 24 | // The algorithm. 25 | Var x("x"), y("y"), depth("depth"), n("n"); 26 | 27 | Func input_bounded = 28 | constant_exterior(input, 0, 29 | {{0, input.dim(0).extent()}, //boundary-dim0 w 30 | {0, input.dim(1).extent()}, //boundary-dim1 h 31 | {Expr(), Expr()}, //boundary-dim2 c 32 | {Expr(), Expr()}}); //boundary-dim3 n 33 | 34 | Func inp_padded("inp_padded"); 35 | inp_padded(x, y, depth, n) = input_bounded(x - pad_width, y - pad_height, depth, n); 36 | 37 | Func conv_nchw("conv_nchw"); 38 | RDom filter_dom(0, kernel.dim(0).extent(), 0, kernel.dim(1).extent()); 39 | 40 | conv_nchw(x, y, depth, n) = bias(depth); 41 | conv_nchw(x, y, depth, n) += kernel(filter_dom.x, filter_dom.y, 0, depth) * 42 | inp_padded(x * stride + filter_dom.x, y * stride + filter_dom.y, depth, n); 43 | output(x, y, depth, n) = select(act >= 0, max(act, conv_nchw(x, y, depth, n)), conv_nchw(x, y, depth, n)); 44 | } 45 | 46 | void schedule() 47 | { 48 | if(auto_schedule) 49 | { 50 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 51 | kernel.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 52 | bias.set_estimates({{0, 512}}); 53 | 54 | stride.set_estimate(1); 55 | pad_width.set_estimate(1); 56 | pad_height.set_estimate(1); 57 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 58 | } 59 | } 60 | }; 61 | 62 | HALIDE_REGISTER_GENERATOR(halide_depthwise, halide_depthwise) 63 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/build.sh: -------------------------------------------------------------------------------- 1 | g++ direct_conv_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o direct_conv_gen 6 | 7 | ./direct_conv_gen -g halide_direct_conv -e c_header,assembly -o . 
8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/direct_conv.cpp: -------------------------------------------------------------------------------- 1 | #include "direct_conv.h" 2 | 3 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 4 | { 5 | return 0; 6 | } 7 | 8 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 9 | { 10 | struct node* ir_node = exec_node->ir_node; 11 | struct graph* ir_graph = ir_node->graph; 12 | struct tensor* input_tensor; 13 | struct tensor* weight_tensor; 14 | struct tensor* output_tensor; 15 | struct tensor* bias_tensor = NULL; 16 | int num_thread = exec_graph->num_thread; 17 | int cpu_affinity = exec_graph->cpu_affinity; 18 | 19 | /* set the input data and shape again, in case of reshape or dynamic shape */ 20 | input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 21 | weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 22 | output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 23 | if (ir_node->input_num > 2) 24 | bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); 25 | 26 | struct conv_param* conv_param = ( struct conv_param* )ir_node->op.param_mem; 27 | 28 | float* input_buf = (float*)(input_tensor->data); 29 | float* weight_buf = (float*)(weight_tensor->data); 30 | float* output_buf = (float*)(output_tensor->data); 31 | float* bias = NULL; 32 | if (ir_node->input_num > 2) 33 | bias = (float*)(bias_tensor->data); 34 | 35 | if (exec_graph->mode == TENGINE_MODE_FP32) 36 | { 37 | int stride = conv_param->stride_h; 38 | int pad_width = conv_param->pad_w0; 39 | int pad_height = conv_param->pad_h0; 40 | int input_c = input_tensor->dims[1]; 41 | int act = conv_param->activation; 42 | 43 | Halide::Runtime::Buffer<float> input(input_buf, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 44 | Halide::Runtime::Buffer<float> filter(weight_buf, weight_tensor->dims[3], weight_tensor->dims[2], weight_tensor->dims[1], weight_tensor->dims[0]); 45 | Halide::Runtime::Buffer<float> output(output_buf, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 46 | Halide::Runtime::Buffer<float> bias1(bias, output_tensor->dims[1]); 47 | 48 | halide_direct_conv(input, filter, bias1, input_c, stride, pad_width, pad_height, act, output); 49 | } 50 | else 51 | { 52 | printf("Tengine work node not support %d\n", exec_graph->mode); 53 | return -1; 54 | } 55 | 56 | return 0; 57 | } 58 | 59 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 60 | { 61 | return 0; 62 | } 63 | 64 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 65 | { 66 | //printf("run halide postrun\n"); 67 | return 0; 68 | } 69 | 70 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 71 | { 72 | return 0; 73 | } 74 | 75 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 76 | { 77 | return 0; 78 | } 79 | 80 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 81 | { 82 | 83 | return 5003; 84 | } 85 | 86 | static struct node_ops hcl_node_ops = {.prerun = prerun, 87 | .run = run, 88 | .reshape = reshape, 89 | .postrun = 
postrun, 90 | .init_node = init_node, 91 | .release_node = release_node, 92 | .score = score}; 93 | 94 | int RegisterAutoKernelDirect_conv() 95 | { 96 | return register_builtin_node_ops(OP_CONV, &hcl_node_ops); 97 | } 98 | 99 | // static int unreg_conv_hcl_ops(void* arg) 100 | // { 101 | // unregister_builtin_node_ops(OP_CONV, &hcl_node_ops); 102 | // return 0; 103 | // } 104 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/direct_conv.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | 3 | extern "C" 4 | { 5 | #include "device/cpu/cpu_define.h" 6 | #include "device/cpu/cpu_node.h" 7 | #include "device/cpu/cpu_module.h" 8 | #include "device/cpu/cpu_graph.h" 9 | 10 | #include "api/c_api.h" 11 | #include "device/device.h" 12 | #include "graph/tensor.h" 13 | #include "graph/node.h" 14 | #include "graph/graph.h" 15 | #include "graph/subgraph.h" 16 | #include "executer/executer.h" 17 | #include "optimizer/split.h" 18 | #include "module/module.h" 19 | #include "utility/vector.h" 20 | #include "utility/log.h" 21 | #include "utility/sys_port.h" 22 | #include "defines.h" 23 | 24 | #include "operator/prototype/convolution_param.h" 25 | } 26 | #include "HalideBuffer.h" 27 | #include "halide_direct_conv.h" 28 | 29 | int RegisterAutoKernelDirect_conv(); 30 | -------------------------------------------------------------------------------- /autokernel_plugin/src/direct_conv/direct_conv_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_direct_conv : public Halide::Generator<halide_direct_conv> { 11 | public: 12 | Input<Buffer<float>> input{"input", 4}; 13 | Input<Buffer<float>> kernel{"kernel", 4}; 14 | Input<Buffer<float>> bias{"bias", 1}; 15 | 16 | Input<int> input_c{"input_depth"}; 17 | Input<int> stride{"stride"}; 18 | Input<int> pad_width{"pad_width"}; 19 | Input<int> pad_height{"pad_height"}; 20 | Input<int> act{"act"}; 21 | 22 | Output<Buffer<float>> relu{"relu", 4}; 23 | 24 | void generate() { 25 | /* THE ALGORITHM */ 26 | 27 | Var x("x"), y("y"), ci("ci"), n("n"), co("co"); 28 | 29 | Func inp_bounded = constant_exterior(input, //source 30 | 0, //value 31 | {{0, input.dim(0).extent()}, //boundary-dim0 w 32 | {0, input.dim(1).extent()}, //boundary-dim1 h 33 | {Expr(), Expr()}, //boundary-dim2 c 34 | {Expr(), Expr()}}); //boundary-dim3 n 35 | Func inp_padded("inp_padded"); 36 | inp_padded(x, y, ci, n) = inp_bounded(x - pad_width, y - pad_height, ci, n); 37 | 38 | Func conv_nchw("conv_nchw"); 39 | 40 | RDom r(0, kernel.dim(0).extent(), 0, kernel.dim(1).extent(), 0, input_c); 41 | 42 | conv_nchw(x, y, co, n) = bias(co); 43 | conv_nchw(x, y, co, n) += kernel(r[0], r[1], r[2], co) * 44 | inp_padded(x * stride + r[0], y * stride + r[1], r[2], n); 45 | 46 | relu(x, y, co, n) = select(act >= 0, max(act, conv_nchw(x, y, co, n)), conv_nchw(x, y, co, n)); 47 | /* 48 | if(act == 0) 49 | relu(x, y, co, n) = max(act, conv_nchw(x, y, co, n)); 50 | else 51 | relu(x, y, co, n) = conv_nchw(x, y, co, n); 52 | */ 53 | } 54 | 55 | void schedule() 56 | { 57 | if(auto_schedule) 58 | { 59 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 60 | kernel.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 512}}); 61 | bias.set_estimates({{0, 512}}); 62 | // input_c.set_estimate(64); 63 | stride.set_estimate(1); 
64 | pad_width.set_estimate(1); 65 | pad_height.set_estimate(1); 66 | relu.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 67 | } 68 | } 69 | 70 | 71 | }; 72 | HALIDE_REGISTER_GENERATOR(halide_direct_conv, halide_direct_conv) 73 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/build.sh: -------------------------------------------------------------------------------- 1 | g++ fc_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o fc_gen 6 | 7 | ./fc_gen -g halide_fc -e c_header,assembly -o . target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/fc.cpp: -------------------------------------------------------------------------------- 1 | #include "fc.h" 2 | 3 | // add helper data struct and functions here 4 | /* 5 | struct op_priv_info 6 | { 7 | 8 | }; 9 | */ 10 | 11 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 12 | { 13 | /* 14 | allocate helper memory for your op 15 | */ 16 | return 0; 17 | } 18 | 19 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 20 | { 21 | bool info_autokernel = false; 22 | const char* debug_env = std::getenv("DEBUG_INFO"); 23 | if((debug_env) && (debug_env[0] == '1')) 24 | { 25 | info_autokernel = true; 26 | } 27 | // step 1: get input and output 28 | struct node* ir_node = exec_node->ir_node; 29 | struct graph* ir_graph = ir_node->graph; 30 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 31 | struct tensor* weight_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 32 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 33 | struct tensor* bias_tensor = NULL; 34 | if (ir_node->input_num > 2) 35 | bias_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[2]); 36 | struct fc_data* fc_param = ( struct fc_data* )ir_node->op.param_mem; 37 | 38 | float* input_buf = (float*)(input_tensor->data); 39 | float* weight_buf = (float*)(weight_tensor->data); 40 | float* output_buf = (float*)(output_tensor->data); 41 | float* bias = NULL; 42 | if(ir_node->input_num > 2) 43 | bias = (float*)(bias_tensor->data); 44 | 45 | if(exec_graph->mode == TENGINE_MODE_FP32) 46 | { 47 | Halide::Runtime::Buffer<float> input(input_buf, input_tensor->dims[1], input_tensor->dims[0]); 48 | Halide::Runtime::Buffer<float> weight(weight_buf, weight_tensor->dims[1], weight_tensor->dims[0]); 49 | Halide::Runtime::Buffer<float> output(output_buf, output_tensor->dims[1], output_tensor->dims[0]); 50 | Halide::Runtime::Buffer<float> bias1(bias, output_tensor->dims[1]); 51 | 52 | if(info_autokernel) printf("[INFO]:using halide fc...\n"); 53 | 54 | halide_fc(input, weight, bias1, output); 55 | } 56 | else 57 | { 58 | printf("Tengine work node not support %d\n", exec_graph->mode); 59 | return -1; 60 | } 61 | 62 | return 0; 63 | } 64 | 65 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 66 | { 67 | return 0; 68 | } 69 | 70 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 71 | { 72 | return 0; 73 | } 74 | 75 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 76 | { 77 | return 0; 78 | } 79 | 80 | static int release_node(struct node_ops* node_ops, struct 
exec_node* exec_node, struct exec_graph* exec_graph) 81 | { 82 | /* 83 | release the private info data for your op: 84 | void ops_priv; 85 | int shared_mem_size; 86 | int shared_pack4_mem_size; 87 | */ 88 | return 0; 89 | } 90 | 91 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 92 | { 93 | /* 94 | OPS_SCORE_STATIC 10000 95 | OPS_SCORE_BEST 8000 96 | OPS_SCORE_PREFER 6000 97 | OPS_SCORE_CANDO 4000 98 | OPS_SCORE_NOTSUP 2000 99 | */ 100 | return OPS_SCORE_STATIC; 101 | } 102 | 103 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 104 | .run = run, 105 | .reshape = reshape, 106 | .postrun = postrun, 107 | .init_node = init_node, 108 | .release_node = release_node, 109 | .score = score}; 110 | 111 | int RegisterAutoKernelFc() 112 | { 113 | return register_builtin_node_ops(OP_FC, &autokernel_node_ops); 114 | } 115 | 116 | //static int unreg_autokernel_ops(void* arg) 117 | //{ 118 | // unregister_builtin_node_ops(OP_FC, &autokernel_node_ops); 119 | // return 0; 120 | //} 121 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/fc.h: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_fc.h" 33 | 34 | int RegisterAutoKernelFc(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/fc/fc_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_fc : public Halide::Generator<halide_fc> { 11 | public: 12 | // args 13 | Input<Buffer<float>> input{"input", 2}; 14 | Input<Buffer<float>> filter{"filter", 2}; 15 | Input<Buffer<float>> bias{"bias", 1}; 16 | Output<Buffer<float>> output{"output", 2}; 17 | 18 | void generate() 19 | { 20 | /* THE ALGORITHM */ 21 | const Expr hidden = input.width(); 22 | 23 | Var b("b"), co("co"); 24 | Func halide_fc("halide_fc"); 25 | RDom hi(0, hidden); 26 | halide_fc(co, b) = bias(co); 27 | halide_fc(co, b) += input(hi, b) * filter(hi, co); 28 | 29 | output(co, b) = halide_fc(co, b); 30 | } 31 | 32 | void schedule() 33 | { 34 | /* THE SCHEDULE */ 35 | input.set_estimates({{0, 512}, {0, 512}}); 36 | filter.set_estimates({{0, 512}, {0, 512}}); 37 | bias.set_estimates({{0, 512}}); 38 | output.set_estimates({{0, 512}, {0, 512}}); 39 | 40 | } 41 | }; 42 | 43 | HALIDE_REGISTER_GENERATOR(halide_fc, halide_fc) 44 | --------------------------------------------------------------------------------
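The generated halide_fc computes output(co, b) = bias(co) + sum over hi of input(hi, b) * filter(hi, co), i.e. one matrix-vector product per batch row. A minimal standalone driver, with hypothetical sizes chosen so the result is easy to check by hand:

#include "HalideBuffer.h"
#include "halide_fc.h"   // generated c_header

int main() {
    const int batch = 2, hidden = 16, out = 8;   // hypothetical shapes
    Halide::Runtime::Buffer<float> input(hidden, batch);
    Halide::Runtime::Buffer<float> filter(hidden, out);
    Halide::Runtime::Buffer<float> bias(out);
    Halide::Runtime::Buffer<float> output(out, batch);
    input.fill(1.0f);
    filter.fill(0.5f);
    bias.fill(0.0f);
    halide_fc(input, filter, bias, output);
    // every element of output should equal hidden * 1.0f * 0.5f = 8.0f
    return 0;
}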
/autokernel_plugin/src/im2col_conv/build.sh: -------------------------------------------------------------------------------- 1 | g++ im2col_conv_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o im2col_conv_gen 6 | 7 | ./im2col_conv_gen -g halide_im2col_conv -e c_header,assembly -o . target=host-no_runtime-no_asserts-no_bounds_query 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/im2col_conv/im2col_conv.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | #include "HalideBuffer.h" 29 | #include "halide_im2col_conv.h" 30 | 31 | int RegisterAutoKernelIm2col_conv(); 32 | -------------------------------------------------------------------------------- /autokernel_plugin/src/im2col_conv/im2col_conv_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include 3 | 4 | using namespace Halide; 5 | namespace { 6 | 7 | // Generator class for BLAS gemm operations. 
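// Background: an im2col convolution is lowered to exactly this kind of GEMM.
// With a conv of (in_c, out_c, k_h, k_w) over an out_h x out_w output, the
// matrices are (shape bookkeeping only, names illustrative):
//   A: M x K, M = out_c, K = in_c * k_h * k_w   (reshaped filters)
//   B: K x N, N = out_h * out_w                 (unfolded input patches)
// and C = A * B yields the out_c x (out_h * out_w) convolution output.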
8 | template<typename T> 9 | class GEMMGenerator : public Generator<GEMMGenerator<T>> { 10 | public: 11 | typedef Generator<GEMMGenerator<T>> Base; 12 | using Base::get_target; 13 | using Base::natural_vector_size; 14 | using Base::target; 15 | template<typename T2> 16 | using Input = typename Base::template Input<T2>; 17 | template<typename T2> 18 | using Output = typename Base::template Output<T2>; 19 | 20 | Input<Buffer<T>> A_ = {"A_", 2}; 21 | Input<Buffer<T>> B_ = {"B_", 2}; 22 | Output<Buffer<T>> result_ = {"result", 2}; 23 | 24 | void generate() { 25 | 26 | const Expr num_rows = A_.height(); //M A(K,M) 27 | const Expr num_cols = B_.width(); //N B(N,K) 28 | const Expr sum_size = A_.width(); //K 29 | 30 | const int vec = 8; 31 | const int s = vec * 2; 32 | 33 | Input<Buffer<T>> *A_in = &A_; 34 | Input<Buffer<T>> *B_in = &B_; 35 | 36 | 37 | Var i, j, ii, ji, jii, iii, io, jo, t; 38 | Var ti[3], tj[3]; 39 | 40 | Func A("A"), B("B"), Btmp("Btmp"), As("As"), Atmp("Atmp"), Bs("Bs"); 41 | Btmp(i, j) = BoundaryConditions::constant_exterior(*B_in, cast<T>(0))(i, j); 42 | 43 | Bs(i, j, io) = Btmp(io * s + i, j); 44 | B(i, j) = Bs(i % s, j, i / s); 45 | 46 | Atmp(i, j) = (*A_in)(i, j); 47 | A(i, j) = Atmp(i, j); 48 | 49 | Var k("k"); 50 | Func prod; 51 | prod(k, j, i) = A(k, i) * B(j, k); 52 | 53 | Func AB("AB"); 54 | RDom rv(0, sum_size); 55 | AB(i, j) += prod(rv, i, j); 56 | 57 | 58 | result_(i, j) = AB(i, j); 59 | 60 | //schedule 61 | result_.tile(i, j, ti[1], tj[1], i, j, 2 * s, 2 * s, TailStrategy::GuardWithIf); 62 | result_ 63 | .tile(i, j, ii, ji, s, 4) 64 | .tile(i, j, ti[0], tj[0], i, j, 1, s / 4); 65 | 66 | result_.specialize(num_rows >= 512 && num_cols >= 512) 67 | .fuse(tj[1], ti[1], t) 68 | .parallel(t); 69 | 70 | result_.specialize(num_rows >= 128 && num_cols >= 128) 71 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 2, 2) 72 | .fuse(tj[2], ti[2], t) 73 | .parallel(t); 74 | 75 | //long N 76 | result_.specialize(num_rows >= 64 && num_cols >= 8000) 77 | .parallel(ti[1], 4); 78 | result_.specialize(num_rows >= 64 && num_cols >= 256) 79 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 4, 2) 80 | .parallel(ti[2]); 81 | result_.specialize(num_rows >= 64 && num_cols >= 128) 82 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 2, 2) 83 | .fuse(tj[2], ti[2], t) 84 | .parallel(t); 85 | //long M 86 | result_.specialize(num_rows >= 512 && num_cols >= 32) 87 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 1, 4) 88 | .parallel(tj[2]); 89 | // long N 90 | result_.specialize(num_rows >= 32 && num_cols >= 8000) 91 | .parallel(ti[1], 8); 92 | result_.specialize(num_rows >= 16 && num_cols >= 256) 93 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 4, 1) 94 | .parallel(ti[2]); 95 | result_.specialize(num_rows >= 16 && num_cols >= 128) 96 | .tile(ti[1], tj[1], ti[2], tj[2], ti[1], tj[1], 2, 1) 97 | .fuse(tj[2], ti[2], t) 98 | .parallel(t); 99 | 100 | // 101 | result_.rename(tj[0], t); 102 | result_.bound(i, 0, num_cols).bound(j, 0, num_rows); 103 | 104 | AB.compute_at(result_, i) 105 | .bound_extent(j, 4) 106 | .unroll(j) 107 | .bound_extent(i, s) 108 | .vectorize(i) 109 | .update() 110 | .reorder(i, j, rv) 111 | .unroll(j) 112 | .unroll(rv, 2) 113 | .vectorize(i); 114 | 115 | Bs.compute_root() 116 | .split(j, jo, ji, s) 117 | .reorder(i, ji, io, jo) 118 | .unroll(i) 119 | .vectorize(ji); 120 | Bs.specialize(B_.width() >= 256 && B_.height() >= 64) 121 | .parallel(jo, 4); 122 | 123 | Btmp.compute_at(Bs, io) 124 | .vectorize(i) 125 | .unroll(j); 126 | 127 | A_.dim(0).set_min(0).dim(1).set_min(0); 128 | B_.dim(0).set_min(0).dim(1).set_bounds(0, sum_size); 129 | result_.dim(0).set_bounds(0, num_cols).dim(1).set_bounds(0, num_rows); 
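// Note: Halide evaluates specialize() predicates in the order they were
// defined, like an if/else-if chain, so the large-shape tilings above take
// precedence and the un-specialized schedule serves as the fallback path.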
130 | } 131 | }; 132 | 133 | } // namespace 134 | 135 | HALIDE_REGISTER_GENERATOR(GEMMGenerator<float>, halide_im2col_conv) 136 | 137 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/build.sh: -------------------------------------------------------------------------------- 1 | g++ normalize_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o normalize_gen 6 | 7 | ./normalize_gen -g halide_normalize -e c_header,assembly -o . target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/normalize.cpp: -------------------------------------------------------------------------------- 1 | #include "normalize.h" 2 | 3 | // add helper data struct and functions here 4 | /* 5 | struct op_priv_info 6 | { 7 | 8 | }; 9 | */ 10 | 11 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 12 | { 13 | /* 14 | allocate helper memory for your op 15 | */ 16 | return 0; 17 | } 18 | 19 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 20 | { 21 | bool info_autokernel = false; 22 | const char* debug_env = std::getenv("DEBUG_INFO"); 23 | if((debug_env) && (debug_env[0] == '1')) 24 | { 25 | info_autokernel = true; 26 | } 27 | // step 1: get input and output 28 | struct node* ir_node = exec_node->ir_node; 29 | struct graph* ir_graph = ir_node->graph; 30 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 31 | struct tensor* scale_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[1]); 32 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 33 | 34 | float* input_buf = (float*)(input_tensor->data); 35 | float* scale_buf = (float*)(scale_tensor->data); 36 | float* output_buf = (float*)(output_tensor->data); 37 | 38 | if(exec_graph->mode == TENGINE_MODE_FP32) 39 | { 40 | Halide::Runtime::Buffer<float> input(input_buf, 41 | input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 42 | Halide::Runtime::Buffer<float> scale(scale_buf, scale_tensor->dims[0]); 43 | Halide::Runtime::Buffer<float> output(output_buf, 44 | output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 45 | 46 | if(info_autokernel) printf("[INFO]:using halide normalize...\n"); 47 | 48 | halide_normalize(input, scale, output); 49 | } 50 | else 51 | { 52 | printf("Tengine work node not support %d\n", exec_graph->mode); 53 | return -1; 54 | } 55 | 56 | return 0; 57 | } 58 | 59 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 60 | { 61 | return 0; 62 | } 63 | 64 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 65 | { 66 | return 0; 67 | } 68 | 69 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 70 | { 71 | return 0; 72 | } 73 | 74 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 75 | { 76 | /* 77 | release the private info data for your op: 78 | void ops_priv; 79 | int shared_mem_size; 80 | int shared_pack4_mem_size; 81 | */ 82 | return 0; 83 | } 84 | 85 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 86 | { 87 | /* 88 | 
OPS_SCORE_STATIC 10000 89 | OPS_SCORE_BEST 8000 90 | OPS_SCORE_PREFER 6000 91 | OPS_SCORE_CANDO 4000 92 | OPS_SCORE_NOTSUP 2000 93 | */ 94 | return OPS_SCORE_STATIC; 95 | } 96 | 97 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 98 | .run = run, 99 | .reshape = reshape, 100 | .postrun = postrun, 101 | .init_node = init_node, 102 | .release_node = release_node, 103 | .score = score}; 104 | 105 | int RegisterAutoKernelNormalize() 106 | { 107 | return register_builtin_node_ops(OP_NORMALIZE, &autokernel_node_ops); 108 | } 109 | 110 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/normalize.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/normalize_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_normalize.h" 33 | 34 | int RegisterAutoKernelNormalize(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/normalize/normalize_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_normalize:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input> scale{"scale", 1}; 15 | Output> output{"output", 4}; 16 | 17 | void generate() 18 | { 19 | /* THE ALGORITHM */ 20 | const Expr channel_number = input.dim(2).extent(); 21 | 22 | Var n("n"), c("c"), h("h"), w("w"); 23 | RDom cn(0, channel_number); 24 | Func channel_reduce("channel_reduce"); 25 | channel_reduce(w, h, n) += input(w, h, cn, n) * input(w, h, cn, n); 26 | channel_reduce(w, h, n) = 1.f / sqrt(channel_reduce(w, h, n)); 27 | 28 | Func halide_normalize("halide_normalize"); 29 | halide_normalize(w, h, c, n) = channel_reduce(w, h, n) * scale(c) * input(w, h, c, n); 30 | 31 | output(w, h, c, n) = halide_normalize(w, h, c, n); 32 | } 33 | 34 | void schedule() 35 | { 36 | /* THE SCHEDULE */ 37 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 512}}); 38 | scale.set_estimates({{0, 512}}); 39 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 512}}); 40 | } 41 | }; 42 | 43 | HALIDE_REGISTER_GENERATOR(halide_normalize, halide_normalize) 44 | -------------------------------------------------------------------------------- /autokernel_plugin/src/plugin_init.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include "pool/pool.h" 3 | #include "direct_conv/direct_conv.h" 4 | #include "im2col_conv/im2col_conv.h" 5 | #include 
"fc/fc.h" 6 | #include "depthwise/depthwise.h" 7 | #include "softmax/softmax.h" 8 | #include "normalize/normalize.h" 9 | 10 | extern "C" int autokernel_plugin_init(void) 11 | { 12 | /* register halide operator */ 13 | RegisterAutoKernelDepthwise(); 14 | RegisterAutoKernelSoftmax(); 15 | RegisterAutoKernelFc(); 16 | RegisterAutoKernelPool(); 17 | RegisterAutoKernelDirect_conv(); 18 | RegisterAutoKernelIm2col_conv(); 19 | RegisterAutoKernelNormalize(); 20 | printf("AutoKernel plugin inited\n"); 21 | return 0; 22 | } 23 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/avepool_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_avepool:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input stride{"stride"}; 15 | Input pad_width{"pad_width"}; 16 | Input pad_height{"height"}; 17 | Input kernel_w{"kernel_w"}; 18 | Input kernel_h{"kernel_h"}; 19 | Output> output{"output", 4}; 20 | 21 | void generate() 22 | { 23 | /* THE ALGORITHM */ 24 | Var x("x"), y("y"), c("c"), n("n"); 25 | 26 | constexpr float kMinValue = -3.4028235e38; 27 | Func input_bounded = constant_exterior(input, kMinValue, 28 | {{0, input.dim(0).extent()}, 29 | {0, input.dim(1).extent()}, 30 | {Expr(), Expr()}, 31 | {Expr(), Expr()}, 32 | }); 33 | Func input_padded("input_padded"); 34 | input_padded(x, y, c, n) = input_bounded(x - pad_width, y - pad_height, c, n); 35 | 36 | Func sum("sum"); 37 | RDom filter_dom(0, kernel_w, 0, kernel_h); 38 | sum(x, y, c, n) += select( 39 | stride == 1, 40 | input_padded(x + filter_dom.x, y + filter_dom.y, c, n), 41 | input_padded(x * stride + filter_dom.x, y * stride + filter_dom.y, c, n) ); 42 | Expr in_x_origin = x * stride - pad_width; 43 | Expr x_start = max(0, -in_x_origin); 44 | Expr x_end = min(kernel_w, input.dim(0).extent() - in_x_origin); 45 | 46 | Expr in_y_origin = y * stride - pad_height; 47 | Expr y_start = max(0, -in_y_origin); 48 | Expr y_end = min(kernel_h, input.dim(1).extent() - in_y_origin); 49 | 50 | Expr filter_count = (x_end - x_start) * (y_end - y_start); 51 | 52 | output(x, y, c, n) = sum(x, y, c, n) / filter_count; 53 | } 54 | 55 | void schedule() 56 | { 57 | if(auto_schedule) 58 | { 59 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 60 | stride.set_estimate(1); 61 | pad_width.set_estimate(1); 62 | pad_height.set_estimate(1); 63 | kernel_w.set_estimate(1); 64 | kernel_h.set_estimate(1); 65 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 66 | } 67 | } 68 | }; 69 | 70 | HALIDE_REGISTER_GENERATOR(halide_avepool, halide_avepool) 71 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/build.sh: -------------------------------------------------------------------------------- 1 | g++ maxpool_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o maxpool_gen 6 | 7 | ./maxpool_gen -g halide_maxpool -e c_header,assembly -o . 
target=host 8 | 9 | g++ avepool_gen.cc ../../common/GenGen.cpp \ 10 | -I ${HALIDE_DIR}/include \ 11 | -L ${HALIDE_DIR}/lib \ 12 | -lHalide -std=c++11 -fno-rtti \ 13 | -o avepool_gen 14 | 15 | ./avepool_gen -g halide_avepool -e c_header,assembly -o . target=host 16 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/maxpool_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_maxpool:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input stride{"stride"}; 15 | Input pad_width{"pad_width"}; 16 | Input pad_height{"height"}; 17 | Input kernel_w{"kernel_w"}; 18 | Input kernel_h{"kernel_h"}; 19 | Output> output{"output", 4}; 20 | 21 | void generate() 22 | { 23 | /* THE ALGORITHM */ 24 | Var x("x"), y("y"), c("c"), n("n"); 25 | 26 | constexpr float kMinValue = -3.4028235e38; 27 | Func input_bounded = constant_exterior(input, kMinValue, 28 | {{0, input.dim(0).extent()}, 29 | {0, input.dim(1).extent()}, 30 | {Expr(), Expr()}, 31 | {Expr(), Expr()}, 32 | }); 33 | Func input_padded("input_padded"); 34 | input_padded(x, y, c, n) = input_bounded(x - pad_width, y - pad_height, c, n); 35 | 36 | Func local_max("local_max"); 37 | RDom filter_dom(0, kernel_w, 0, kernel_h); 38 | local_max(x, y, c, n) = maximum(select( 39 | stride == 1, 40 | input_padded(x + filter_dom.x, y + filter_dom.y, c, n), 41 | input_padded(x * stride + filter_dom.x, y * stride + filter_dom.y, c, n) )); 42 | output(x, y, c, n) = local_max(x, y, c, n); 43 | 44 | } 45 | 46 | void schedule() 47 | { 48 | if(auto_schedule) 49 | { 50 | input.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 51 | stride.set_estimate(1); 52 | pad_width.set_estimate(1); 53 | pad_height.set_estimate(1); 54 | kernel_w.set_estimate(1); 55 | kernel_h.set_estimate(1); 56 | output.set_estimates({{0, 512}, {0, 512}, {0, 512}, {0, 1}}); 57 | } 58 | } 59 | }; 60 | 61 | HALIDE_REGISTER_GENERATOR(halide_maxpool, halide_maxpool) 62 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/pool.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | step 1: init_node 4 | init the private info data for your op, if no need, skip this 5 | step 2: prerun 6 | allocate helper memory for your op, if no need, skip this 7 | step 3: run 8 | complete the run function to use the function generated by autokernel 9 | step 4: postrun 10 | release helper memory you allocated for your op, if no need, skip this 11 | step 5: release_node 12 | release the private info data you allocated for your op, if no need, skip this 13 | step 6: score 14 | adjust you score priority strategy, default score value is defined in cpu_node_ops.h 15 | step 7: reshape 16 | reshape output tensor, if no need, skip this 17 | step 8: register op 18 | change register func name and called in init.cpp 19 | */ 20 | 21 | #include "pool.h" 22 | 23 | // add helper data struct and functions here 24 | /* 25 | struct op_priv_info 26 | { 27 | 28 | }; 29 | */ 30 | 31 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 32 | { 33 | /* 34 | allocate helper memory for your op 35 | */ 36 | 
return 0; 37 | } 38 | 39 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 40 | { 41 | bool info_autokernel = false; 42 | const char* debug_env = std::getenv("DEBUG_INFO"); 43 | if((debug_env) && (debug_env[0] == '1')) 44 | { 45 | info_autokernel = true; 46 | } 47 | // step 1: get input and output 48 | struct node* ir_node = exec_node->ir_node; 49 | struct graph* ir_graph = ir_node->graph; 50 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 51 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 52 | 53 | // get op private data info (if needed) 54 | // struct op_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; 55 | 56 | // step 2: get op params (if needed), the op_param struct is defined in src/op/ 57 | // struct op_param* op_param = ( struct conv_param* )ir_node->op.param_mem; 58 | // DTYPE [param_list] = conv_param->param_list; 59 | struct pool_param* pool_param = ( struct pool_param* )ir_node->op.param_mem; 60 | int stride = pool_param->stride_h; 61 | int pad_width = pool_param->pad_w0; 62 | int pad_height = pool_param->pad_h0; 63 | int kernel_w = pool_param->kernel_w; 64 | int kernel_h = pool_param->kernel_h; 65 | 66 | // step 3: call the func generated by Autokernel 67 | if (exec_graph->mode == TENGINE_MODE_FP32) 68 | { 69 | Halide::Runtime::Buffer input((float*)input_tensor->data, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 70 | Halide::Runtime::Buffer output((float*)output_tensor->data, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 71 | if (pool_param->pool_method == 0) 72 | { 73 | // maxpooling 74 | halide_maxpool(input, stride, pad_width, pad_height, kernel_w, kernel_h, output); 75 | if(info_autokernel)printf("[INFO]: runing Autokernel halide_maxpool...\n"); 76 | } 77 | else if (pool_param->pool_method == 1) 78 | { 79 | // average pooling 80 | halide_avepool(input, stride, pad_width, pad_height, kernel_w, kernel_h, output); 81 | if(info_autokernel)printf("[INFO]: runing Autokernel halide_avgpool...\n"); 82 | } 83 | 84 | } 85 | else 86 | { 87 | printf("Tengine work node with halide plugin not support %d\n", exec_graph->mode); 88 | return -1; 89 | } 90 | 91 | return 0; 92 | } 93 | 94 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 95 | { 96 | return 0; 97 | } 98 | 99 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 100 | { 101 | //printf("run halide postrun\n"); 102 | return 0; 103 | } 104 | 105 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 106 | { 107 | /* 108 | init the private info data for your op: 109 | void ops_priv; 110 | int shared_mem_size; 111 | int shared_pack4_mem_size; 112 | */ 113 | return 0; 114 | } 115 | 116 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 117 | { 118 | /* 119 | release the private info data for your op: 120 | void ops_priv; 121 | int shared_mem_size; 122 | int shared_pack4_mem_size; 123 | */ 124 | return 0; 125 | } 126 | 127 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 128 | { 129 | /* 130 | OPS_SCORE_STATIC 10000 131 | OPS_SCORE_BEST 8000 132 | OPS_SCORE_PREFER 6000 133 | OPS_SCORE_CANDO 4000 134 
| OPS_SCORE_NOTSUP 2000 135 | */ 136 | return OPS_SCORE_STATIC; 137 | } 138 | 139 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 140 | .run = run, 141 | .reshape = reshape, 142 | .postrun = postrun, 143 | .init_node = init_node, 144 | .release_node = release_node, 145 | .score = score}; 146 | 147 | int RegisterAutoKernelPool() 148 | { 149 | return register_builtin_node_ops(OP_POOL, &autokernel_node_ops); 150 | } 151 | 152 | //static int unreg_autokernel_ops(void* arg) 153 | //{ 154 | // unregister_builtin_node_ops(OP_POOL, &autokernel_node_ops); 155 | // return 0; 156 | //} 157 | -------------------------------------------------------------------------------- /autokernel_plugin/src/pool/pool.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/pooling_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_maxpool.h" 33 | #include "halide_avepool.h" 34 | 35 | int RegisterAutoKernelPool(); 36 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/build.sh: -------------------------------------------------------------------------------- 1 | g++ softmax_gen.cc ../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o softmax_gen 6 | 7 | ./softmax_gen -g halide_softmax -e c_header,assembly -o . 
target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/softmax.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | step 1: init_node 4 | init the private info data for your op, if no need, skip this 5 | step 2: prerun 6 | allocate helper memory for your op, if no need, skip this 7 | step 3: run 8 | complete the run function to use the function generated by autokernel 9 | step 4: postrun 10 | release helper memory you allocated for your op, if no need, skip this 11 | step 5: release_node 12 | release the private info data you allocated for your op, if no need, skip this 13 | step 6: score 14 | adjust you score priority strategy, default score value is defined in cpu_node_ops.h 15 | step 7: reshape 16 | reshape output tensor, if no need, skip this 17 | step 8: register op 18 | change register func name and called in init.cpp 19 | */ 20 | 21 | #include "softmax.h" 22 | 23 | // add helper data struct and functions here 24 | /* 25 | struct op_priv_info 26 | { 27 | 28 | }; 29 | */ 30 | 31 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 32 | { 33 | /* 34 | allocate helper memory for your op 35 | */ 36 | return 0; 37 | } 38 | 39 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 40 | { 41 | bool info_autokernel = false; 42 | const char* debug_env = std::getenv("DEBUG_INFO"); 43 | if((debug_env) && (debug_env[0] == '1')) 44 | { 45 | info_autokernel = true; 46 | } 47 | // step 1: get input and output 48 | struct node* ir_node = exec_node->ir_node; 49 | struct graph* ir_graph = ir_node->graph; 50 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 51 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 52 | 53 | // get op private data info (if needed) 54 | // struct op_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; 55 | 56 | // struct softmax_param* softmax_param=(strcut softmax_param*)ir_node->op.param_mem; 57 | 58 | // step 2: get op params (if needed), the op_param struct is defined in src/op/ 59 | // struct op_param* op_param = ( struct conv_param* )ir_node->op.param_mem; 60 | // DTYPE [param_list] = conv_param->param_list; 61 | 62 | // step 3: call the func generated by Autokernel 63 | Halide::Runtime::Buffer input((float*)input_tensor->data, input_tensor->dims[1], input_tensor->dims[0]); 64 | Halide::Runtime::Buffer output((float*)output_tensor->data, output_tensor->dims[1], output_tensor->dims[0]); 65 | 66 | halide_softmax(input, output); 67 | 68 | if(info_autokernel)printf("[INFO]: runing Autokernel halide_softmax...\n"); 69 | return 0; 70 | } 71 | 72 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 73 | { 74 | return 0; 75 | } 76 | 77 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 78 | { 79 | /* 80 | release the helper memory you 81 | */ 82 | return 0; 83 | } 84 | 85 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 86 | { 87 | /* 88 | init the private info data for your op: 89 | void ops_priv; 90 | int shared_mem_size; 91 | int shared_pack4_mem_size; 92 | */ 93 | return 0; 94 | } 95 | 96 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct 
exec_graph* exec_graph) 97 | { 98 | /* 99 | release the private info data for your op: 100 | void ops_priv; 101 | int shared_mem_size; 102 | int shared_pack4_mem_size; 103 | */ 104 | return 0; 105 | } 106 | 107 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 108 | { 109 | /* 110 | OPS_SCORE_STATIC 10000 111 | OPS_SCORE_BEST 8000 112 | OPS_SCORE_PREFER 6000 113 | OPS_SCORE_CANDO 4000 114 | OPS_SCORE_NOTSUP 2000 115 | */ 116 | return OPS_SCORE_STATIC; 117 | } 118 | 119 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 120 | .run = run, 121 | .reshape = reshape, 122 | .postrun = postrun, 123 | .init_node = init_node, 124 | .release_node = release_node, 125 | .score = score}; 126 | 127 | int RegisterAutoKernelSoftmax() 128 | { 129 | return register_builtin_node_ops(OP_SOFTMAX, &autokernel_node_ops); 130 | } 131 | 132 | //static int unreg_autokernel_ops(void* arg) 133 | //{ 134 | // unregister_builtin_node_ops(OP_SOFTMAX, &autokernel_node_ops); 135 | // return 0; 136 | //} 137 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/softmax.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in src/op/ 26 | #include "operator/prototype/softmax_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "halide_softmax.h" 33 | 34 | int RegisterAutoKernelSoftmax(); 35 | -------------------------------------------------------------------------------- /autokernel_plugin/src/softmax/softmax_gen.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class halide_softmax:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 2}; 14 | 15 | Output> output{"output", 2}; 16 | 17 | void generate() 18 | { 19 | /* THE ALGORITHM */ 20 | const Expr num_classes=input.width(); 21 | Var in("in"), n("n"); 22 | Func expInput; 23 | RDom r(0,num_classes); 24 | expInput(in, n) = exp(input(in, n)); 25 | Expr globalSum=sum(expInput(r.x,n)); 26 | 27 | 28 | output(in,n)=expInput(in,n)/globalSum; 29 | } 30 | 31 | void schedule() 32 | { 33 | /* THE SCHEDULE */ 34 | input.set_estimates({{0, 512}, {0, 512}}); 35 | output.set_estimates({{0, 512}, {0, 512}}); 36 | } 37 | }; 38 | 39 | HALIDE_REGISTER_GENERATOR(halide_softmax, halide_softmax) -------------------------------------------------------------------------------- /autokernel_plugin/template/build.sh: -------------------------------------------------------------------------------- 1 | g++ OP_NAME_gen.cc 
../../common/GenGen.cpp \ 2 | -I ${HALIDE_DIR}/include \ 3 | -L ${HALIDE_DIR}/lib \ 4 | -lHalide -std=c++11 -fno-rtti \ 5 | -o OP_NAME_gen 6 | 7 | ./OP_NAME_gen -g halide_OP_NAME -e c_header,assembly -o . target=host 8 | -------------------------------------------------------------------------------- /autokernel_plugin/template/generator.cc: -------------------------------------------------------------------------------- 1 | #include "Halide.h" 2 | #include "HalideBuffer.h" 3 | using namespace Halide; 4 | using Halide::Expr; 5 | using Halide::Func; 6 | using Halide::Generator; 7 | using Halide::Var; 8 | using Halide::BoundaryConditions::constant_exterior; 9 | 10 | class Halide_Func_Name:public Halide::Generator{ 11 | public: 12 | // args 13 | Input> input{"input", 4}; 14 | Input param{"param"}; 15 | 16 | Output> output{"output", 4}; 17 | 18 | void generate() 19 | { 20 | /* THE ALGORITHM */ 21 | Var x("x"), y("y"), c("c"), n("n"); 22 | Func Halide_Func_Name("Halide_Func_Name"); 23 | Halide_Func_Name(c, x, y, n) = input(c, x, y, n); 24 | 25 | output(c, x, y, n) = select(param >= 0, max(param, Halide_Func_Name(c, x, y, n)), Halide_Func_Name(c, x, y, n)); 26 | } 27 | 28 | void schedule() 29 | { 30 | /* THE SCHEDULE */ 31 | } 32 | }; 33 | 34 | HALIDE_REGISTER_GENERATOR(Halide_Func_Name, Halide_Func_Name) 35 | -------------------------------------------------------------------------------- /autokernel_plugin/template/template.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | 3 | step 1: init_node 4 | init the private info data for your op, if no need, skip this 5 | step 2: prerun 6 | allocate helper memory for your op, if no need, skip this 7 | step 3: run 8 | complete the run function to use the function generated by autokernel 9 | step 4: postrun 10 | release helper memory you allocated for your op, if no need, skip this 11 | step 5: release_node 12 | release the private info data you allocated for your op, if no need, skip this 13 | step 6: score 14 | adjust you score priority strategy, default score value is defined in cpu_node_ops.h 15 | step 7: reshape 16 | reshape output tensor, if no need, skip this 17 | step 8: register op 18 | change register func name and called in init.cpp 19 | */ 20 | 21 | #include "template.h" 22 | 23 | // add helper data struct and functions here 24 | /* 25 | struct op_priv_info 26 | { 27 | 28 | }; 29 | */ 30 | 31 | static int prerun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 32 | { 33 | /* 34 | allocate helper memory for your op 35 | */ 36 | return 0; 37 | } 38 | 39 | static int run(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 40 | { 41 | bool info_autokernel = false; 42 | const char* debug_env = std::getenv("DEBUG_INFO"); 43 | if((debug_env) && (debug_env[0] == '1')) 44 | { 45 | info_autokernel = true; 46 | } 47 | // step 1: get input and output 48 | struct node* ir_node = exec_node->ir_node; 49 | struct graph* ir_graph = ir_node->graph; 50 | struct tensor* input_tensor = get_ir_graph_tensor(ir_graph, ir_node->input_tensors[0]); 51 | struct tensor* output_tensor = get_ir_graph_tensor(ir_graph, ir_node->output_tensors[0]); 52 | 53 | // get op private data info (if needed) 54 | // struct op_priv_info* conv_priv_info = ( struct conv_priv_info* )exec_node->ops_priv; 55 | 56 | // step 2: get op params (if needed), the op_param struct is defined in src/op/ 57 | // struct op_param* op_param = ( struct conv_param* )ir_node->op.param_mem; 
58 | // DTYPE [param_list] = conv_param->param_list; 59 | 60 | // step 3: call the func generated by Autokernel 61 | Halide::Runtime::Buffer input((float*)input_tensor->data, input_tensor->dims[3], input_tensor->dims[2], input_tensor->dims[1], input_tensor->dims[0]); 62 | Halide::Runtime::Buffer output((float*)output_tensor->data, output_tensor->dims[3], output_tensor->dims[2], output_tensor->dims[1], output_tensor->dims[0]); 63 | 64 | int param = 0; 65 | AutoKernel_Func(input, param, output); 66 | 67 | if(info_autokernel)printf("[INFO]: runing Autokernel AutoKernel_Func...\n"); 68 | return 0; 69 | } 70 | 71 | static int reshape(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 72 | { 73 | return 0; 74 | } 75 | 76 | static int postrun(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 77 | { 78 | /* 79 | release the helper memory you 80 | */ 81 | return 0; 82 | } 83 | 84 | static int init_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 85 | { 86 | /* 87 | init the private info data for your op: 88 | void ops_priv; 89 | int shared_mem_size; 90 | int shared_pack4_mem_size; 91 | */ 92 | return 0; 93 | } 94 | 95 | static int release_node(struct node_ops* node_ops, struct exec_node* exec_node, struct exec_graph* exec_graph) 96 | { 97 | /* 98 | release the private info data for your op: 99 | void ops_priv; 100 | int shared_mem_size; 101 | int shared_pack4_mem_size; 102 | */ 103 | return 0; 104 | } 105 | 106 | static int score(struct node_ops* node_ops, struct exec_graph* exec_graph, struct node* exec_node) 107 | { 108 | /* 109 | OPS_SCORE_STATIC 10000 110 | OPS_SCORE_BEST 8000 111 | OPS_SCORE_PREFER 6000 112 | OPS_SCORE_CANDO 4000 113 | OPS_SCORE_NOTSUP 2000 114 | */ 115 | return OPS_SCORE_STATIC; 116 | } 117 | 118 | static struct node_ops autokernel_node_ops = {.prerun = prerun, 119 | .run = run, 120 | .reshape = reshape, 121 | .postrun = postrun, 122 | .init_node = init_node, 123 | .release_node = release_node, 124 | .score = score}; 125 | 126 | int RegisterAutoKernelOP() 127 | { 128 | return register_builtin_node_ops(OP_CONV, &autokernel_node_ops); 129 | } 130 | 131 | // int unreg_autokernel_ops(void* arg) 132 | // { 133 | // unregister_builtin_node_ops(OP_CONV, &autokernel_node_ops); 134 | // return 0; 135 | // } 136 | -------------------------------------------------------------------------------- /autokernel_plugin/template/template.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | extern "C" 5 | { 6 | #include "device/cpu/cpu_define.h" 7 | #include "device/cpu/cpu_node.h" 8 | #include "device/cpu/cpu_module.h" 9 | #include "device/cpu/cpu_graph.h" 10 | 11 | #include "api/c_api.h" 12 | #include "device/device.h" 13 | #include "graph/tensor.h" 14 | #include "graph/node.h" 15 | #include "graph/graph.h" 16 | #include "graph/subgraph.h" 17 | #include "executer/executer.h" 18 | #include "optimizer/split.h" 19 | #include "module/module.h" 20 | #include "utility/vector.h" 21 | #include "utility/log.h" 22 | #include "utility/sys_port.h" 23 | #include "defines.h" 24 | 25 | // include op param header file here, locate in operator/prototype/ 26 | #include "operator/prototype/convolution_param.h" 27 | } 28 | 29 | #include "HalideBuffer.h" 30 | 31 | // include the c_header file here 32 | #include "AutoKernel_Func.h" 33 | 34 | int RegisterAutoKernelOP(); 35 | 
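Once the template above is filled in, the remaining integration step is registering the new op alongside the others in src/plugin_init.cpp. A hypothetical sketch for an op named relu (the header path and function name are assumptions, mirroring the existing registrations):

#include "relu/relu.h"           // hypothetical new op header

extern "C" int autokernel_plugin_init(void)
{
    /* ... existing RegisterAutoKernel*() calls ... */
    RegisterAutoKernelRelu();    // hypothetical, added for the new op
    printf("AutoKernel plugin inited\n");
    return 0;
}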
-------------------------------------------------------------------------------- /autokernel_plugin/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | project(test) 2 | 3 | include_directories(${TENGINE_DIR}/include) 4 | include_directories(./common) 5 | link_directories(${TENGINE_DIR}/lib) 6 | 7 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 8 | set(LINK_LIBS tengine-lite) 9 | set(CMAKE_EXE_LINKER_FLAGS "-rdynamic -ldl") 10 | 11 | add_executable(test_conv test_conv.cpp) 12 | target_link_libraries(test_conv ${LINK_LIBS}) 13 | 14 | add_executable(test_depthwise test_depthwise.cpp) 15 | target_link_libraries(test_depthwise ${LINK_LIBS}) 16 | 17 | add_executable(test_fc test_fc.cpp) 18 | target_link_libraries(test_fc ${LINK_LIBS}) 19 | 20 | add_executable(test_pool test_pool.cpp) 21 | target_link_libraries(test_pool ${LINK_LIBS}) 22 | 23 | add_executable(test_softmax test_softmax.cpp) 24 | target_link_libraries(test_softmax ${LINK_LIBS}) 25 | 26 | add_executable(test_normalize test_normalize.cpp) 27 | target_link_libraries(test_normalize ${LINK_LIBS}) 28 | 29 | add_executable(tm_classification tm_classification.cpp common/tengine_operations.cpp) 30 | target_link_libraries (tm_classification ${LINK_LIBS}) 31 | -------------------------------------------------------------------------------- /autokernel_plugin/tests/common/utils.hpp: -------------------------------------------------------------------------------- 1 | #ifndef __UTILS_HPP__ 2 | #define __UTILS_HPP__ 3 | 4 | #include <cstdio> 5 | #include <string> 6 | int is_file_exist(std::string file_name); 7 | int is_file_exist(std::string file_name) 8 | { 9 | FILE* fp = fopen(file_name.c_str(), "r"); 10 | if (!fp) 11 | { 12 | return 0; 13 | } 14 | fclose(fp); 15 | return 1; 16 | } 17 | #endif // __UTILS_HPP__ -------------------------------------------------------------------------------- /doc/add_op.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/add_op.png -------------------------------------------------------------------------------- /doc/architecture-en.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/architecture-en.png -------------------------------------------------------------------------------- /doc/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/architecture.png -------------------------------------------------------------------------------- /doc/how_to_add_op.md: -------------------------------------------------------------------------------- 1 | ## How to quickly develop a new auto-optimized operator 2 | 3 | Developing an operator that Tengine can use with AutoKernel takes two steps: 4 | 1. Generate: write the algorithm description and the schedule, and generate optimized operator code for the target backend 5 | 6 | 2. Deploy: integrate the generated operator code into Tengine as a plugin 7 | 8 | -------------------------- 9 | This tutorial takes the Relu operator as an example and shows how to quickly develop an auto-optimized operator usable by Tengine. 10 | 11 | ![add_op.png](add_op.png) 12 | ### 1. Run `register_op.sh` to generate the template files automatically 13 | We provide a script that generates, from templates, the source files and build scripts these two steps need. 14 | ``` 15 | cd AutoKernel/autokernel_plugin 16 | chmod +x -R . 17 | ./scripts/register_op.sh 18 | ``` 19 | Fill in at the prompts: 20 | ``` 21 | op_name: relu 22 | op_type: OP_RELU 23 | ``` 24 | This produces the following files: 25 | ``` 26 | src/relu/relu.cpp 27 | src/relu/relu.h 28 | src/relu/relu_gen.cc 29 | src/relu/build.sh 30 | ``` 31 | ### 2. Generate: edit the generator file `relu_gen.cc` 32 | This file generates the operator's assembly code. It describes the operator's computation and its schedule in the Halide language. 33 | In this example the schedule is left empty. 34 | 35 | ``` 36 | class halide_relu : public Halide::Generator<halide_relu> { 37 | public: 38 | // args 39 | Input<Buffer<float>> input{"input", 4}; 40 | Input<int> param{"param"}; 41 | 42 | Output<Buffer<float>> output{"output", 4}; 43 | 44 | void generate() 45 | { 46 | /* THE ALGORITHM */ 47 | Var w("w"), h("h"), c("c"), n("n"); 48 | Func halide_relu("halide_relu"); 49 | halide_relu(w, h, c, n) = input(w, h, c, n); 50 | 51 | output(w, h, c, n) = select(param >= 0, max(param, halide_relu(w, h, c, n)), halide_relu(w, h, c, n)); 52 | } 53 | 54 | void schedule() 55 | { 56 | /* THE SCHEDULE */ 57 | } 58 | }; 59 | 60 | ```
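Although the schedule above is left empty, a simple hand schedule can be dropped in later without touching the algorithm. For example (illustrative only; Halide matches `Var`s by name, so redeclaring `w` and `c` here refers to the same loop variables used in `generate()`):

```
void schedule()
{
    /* illustrative hand schedule, not part of the generated template */
    Var w("w"), c("c");
    output.vectorize(w, 4).parallel(c);
}
```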
61 | ### 3. Deploy: edit `auto_relu.cpp`, then build `AutoKernel.so` in one go 62 | 63 | ``` 64 | ./scripts/generate.sh # generate the .s/.h files needed by all operators in one step 65 | mkdir build 66 | cd build 67 | cmake .. 68 | make -j4 69 | ``` 70 | 71 | ### 4. Test 72 | 73 | The test case below is for reference only 74 | 75 | ``` 76 | #include "HalideBuffer.h" 77 | #include <iostream> 78 | #include "halide_relu.h" 79 | 80 | int main(int argc, char **argv) 81 | { 82 | int C = 1, W = 4, H = 4, N = 1; 83 | Halide::Runtime::Buffer<float> input_tensor(nullptr, W, H, C, N); 84 | Halide::Runtime::Buffer<float> output_tensor(nullptr, W, H, C, N); 85 | input_tensor.allocate(); 86 | output_tensor.allocate(); 87 | input_tensor.for_each_value([](float &x) { 88 | x = 2.0 * rand() / RAND_MAX - 1.0; 89 | }); 90 | 91 | output_tensor.for_each_value([](float &x) { 92 | x = 2.0 * rand() / RAND_MAX - 1.0; 93 | }); 94 | 95 | halide_relu(input_tensor, 0, output_tensor); 96 | 97 | printf("input:\n"); 98 | for (int c = 0; c < input_tensor.dim(3).extent(); c++) { 99 | for (int z = 0; z < input_tensor.channels(); z++) { 100 | for (int y = 0; y < input_tensor.height(); y++) { 101 | for (int x = 0; x < input_tensor.width(); x++) { 102 | std::cout << input_tensor(x, y, z, c) << " "; 103 | } 104 | std::cout << std::endl; 105 | } 106 | } 107 | } 108 | printf("output:\n"); 109 | for (int c = 0; c < output_tensor.dim(3).extent(); c++) { 110 | for (int z = 0; z < output_tensor.channels(); z++) { 111 | for (int y = 0; y < output_tensor.height(); y++) { 112 | for (int x = 0; x < output_tensor.width(); x++) { 113 | std::cout << output_tensor(x, y, z, c) << " "; 114 | } 115 | std::cout << std::endl; 116 | } 117 | } 118 | } 119 | return 0; 120 | } 121 | ``` -------------------------------------------------------------------------------- /doc/readme.md: -------------------------------------------------------------------------------- 1 | 2 | Documentation | 中文文档 (Chinese docs) 3 | 4 | -------------------------------------------------------------------------------- /doc/tutorials/01_AutoKernel开发环境快速入门.md: -------------------------------------------------------------------------------- 1 | # The AutoKernel development environment 2 | 3 | This tutorial covers how to install and configure the AutoKernel development environment and introduces the two components the project depends on: Tengine and Halide. To cut down on environment-setup problems, we currently provide a Docker image with the required base environment; more setup options will follow. 4 | 5 | - The AutoKernel development environment 6 | - [AutoKernel installation guide](#autokernel-installation-guide) 7 | - [Halide](#halide) 8 | - [Tengine](#tengine) 9 | ------------------- 10 | 11 | ## AutoKernel installation guide 12 | AutoKernel provides a Docker image that ships the whole development environment. 13 | 14 | - If you have not installed Docker yet, see the [official Docker install docs](https://docs.docker.com/engine/install/debian/). 15 | 16 | - If Docker is new to you, this beginner tutorial may help: [runoob Docker tutorial](https://www.runoob.com/docker/docker-hello-world.html) 17 | 18 | From here on we assume Docker is installed. 19 | 20 | 1. Pull the image (this may take a while -- roughly 10-20 minutes depending on your network; please be patient) 21 | ``` 22 | docker pull openailab/autokernel 23 | ``` 24 | 2. Create a container and enter the development environment 25 | ``` 26 | docker run -ti openailab/autokernel /bin/bash 27 | ``` 28 | You are now inside the Docker container: 29 | ``` 30 | root@39bfb5ea515d:/workspace# 31 | ```
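If you plan to edit code from the host, it can also be convenient to mount a host directory into the container when creating it (standard Docker, shown with a hypothetical host path):
```
docker run -ti -v /path/to/AutoKernel:/workspace/AutoKernel openailab/autokernel /bin/bash
```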
2. Create a container and enter the development environment:
```
docker run -ti openailab/autokernel /bin/bash
```
You are now inside the Docker container:
```
root@39bfb5ea515d:/workspace#
```
* Note: if you have already created a container, just start and attach to that container; changes you made earlier will not carry over into a newly created one.

List previously created containers. You can rename one with `docker container rename <old_name> <new_name>`; here ours is named `autokernel`:
```
$ docker container ls -a
CONTAINER ID   IMAGE                  COMMAND       CREATED        STATUS                      PORTS   NAMES
ff8b59212784   openailab/autokernel   "/bin/bash"   21 hours ago   Exited (255) 2 minutes ago          autokernel
```

Start the container:
```
docker start autokernel
```
Attach to it:
```
docker exec -ti autokernel /bin/bash
```
3. Halide and Tengine are pre-installed in the Docker image:
```
/workspace/Halide    # Halide
/workspace/Tengine   # Tengine
```

4. Clone the AutoKernel project:
```
git clone https://github.com/OAID/AutoKernel.git
```

At this point, all the environment files we need later are in place.

## Halide
Halide is a domain-specific language (DSL) that decouples the algorithm from the hardware backend. This project uses Halide's DSL and IR. Halide is pre-installed in the Docker image, with its Python API configured.

Halide's files live under `/workspace/Halide/`; the build output lives under `/workspace/Halide/halide-build`:

```
cd /workspace/Halide/halide-build
```
* The Halide headers are in `/workspace/Halide/halide-build/include`:
```
root@bd3faab0f079:/workspace/Halide/halide-build/include# ls

Halide.h                     HalideRuntimeHexagonDma.h
HalideBuffer.h               HalideRuntimeHexagonHost.h
HalidePyTorchCudaHelpers.h   HalideRuntimeMetal.h
HalidePyTorchHelpers.h       HalideRuntimeOpenCL.h
HalideRuntime.h              HalideRuntimeOpenGL.h
HalideRuntimeCuda.h          HalideRuntimeOpenGLCompute.h
HalideRuntimeD3D12Compute.h  HalideRuntimeQurt.h
```
* The compiled Halide library is under `/workspace/Halide/halide-build/src`; you can see `libHalide.so`:
```
root@bd3faab0f079:/workspace/Halide/halide-build/src# ls
CMakeFiles           autoschedulers       libHalide.so.10
CTestTestfile.cmake  cmake_install.cmake  libHalide.so.10.0.0
Makefile             libHalide.so         runtime
```
* Run a small Halide program:
```
cd /workspace/Halide/halide-build
./tutorial/lesson_01_basics
```
Output:
```
Success!
```
* Use Halide's Python API. First inspect Python's module search path:
```
python
>>> import sys
>>> sys.path
['', '/root', '/workspace/Halide/halide-build/python_bindings/src', '/usr/lib/python36.zip', '/usr/lib/python3.6', '/usr/lib/python3.6/lib-dynload', '/usr/local/lib/python3.6/dist-packages', '/usr/lib/python3/dist-packages']
```
The search path already contains the directory of Halide's compiled Python package, `/workspace/Halide/halide-build/python_bindings/src`:
```
python
>>> import halide
```
`import halide` works out of the box!
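To check the C++ side of the installation as well, you can compile a minimal pipeline against the headers and library listed above. The program below is a small sketch written for this tutorial, not a file from the repo, and the exact compile flags may need adjusting for your setup:
```
// check_halide.cpp -- a minimal JIT pipeline to verify libHalide works.
// A plausible build line inside the container:
//   g++ check_halide.cpp -std=c++11 -I/workspace/Halide/halide-build/include \
//       -L/workspace/Halide/halide-build/src -lHalide -lpthread -ldl -o check_halide
#include "Halide.h"
#include <cstdio>
using namespace Halide;

int main() {
    Var x("x"), y("y");
    Func f("f");
    f(x, y) = x + 10 * y;               // the same toy function used in tutorial 03
    Buffer<int> out = f.realize(3, 4);  // JIT-compile and evaluate over a 3x4 domain
    printf("f(2,3) = %d (expected 32)\n", out(2, 3));
    return 0;
}
```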
## Tengine
Tengine is a lightweight, high-performance deep neural network inference engine. This project builds its operator development and optimization work on Tengine.

Tengine is pre-installed in the Docker image; the files are under `/workspace/Tengine/`:
```
cd /workspace/Tengine/build
```
* The Tengine headers are in `/workspace/Tengine/build/install/include`:
```
root@bd3faab0f079:/workspace/Tengine/build/install/include# ls

tengine_c_api.h
tengine_cpp_api.h
```
* The compiled Tengine library is under `/workspace/Tengine/build/install/lib`; you can see `libtengine-lite.so`:
```
root@bd3faab0f079:/workspace/Tengine/build/install/lib# ls

libtengine-lite.so
```
* Run a small Tengine program.

This example benchmarks the performance of various network models with Tengine on the target machine:
```
cd /workspace/Tengine/benchmark
../build/benchmark/tm_benchmark
```
Output:
```
start to run register cpu allocator
loop_counts = 1
num_threads = 1
power = 0
tengine-lite library version: 1.0-dev
    squeezenet_v1.1  min =   32.74 ms   max =   32.74 ms   avg =   32.74 ms
        mobilenetv1  min =   31.33 ms   max =   31.33 ms   avg =   31.33 ms
        mobilenetv2  min =   35.55 ms   max =   35.55 ms   avg =   35.55 ms
        mobilenetv3  min =   37.65 ms   max =   37.65 ms   avg =   37.65 ms
       shufflenetv2  min =   10.93 ms   max =   10.93 ms   avg =   10.93 ms
           resnet18  min =   74.53 ms   max =   74.53 ms   avg =   74.53 ms
           resnet50  min =  175.55 ms   max =  175.55 ms   avg =  175.55 ms
          googlenet  min =  133.23 ms   max =  133.23 ms   avg =  133.23 ms
        inceptionv3  min =  298.22 ms   max =  298.22 ms   avg =  298.22 ms
              vgg16  min =  555.60 ms   max =  555.60 ms   avg =  555.60 ms
               mssd  min =   69.41 ms   max =   69.41 ms   avg =   69.41 ms
         retinaface  min =   13.14 ms   max =   13.14 ms   avg =   13.14 ms
        yolov3_tiny  min =  132.67 ms   max =  132.67 ms   avg =  132.67 ms
     mobilefacenets  min =   14.95 ms   max =   14.95 ms   avg =   14.95 ms
ALL TEST DONE
```
--------------------------------------------------------------------------------
/doc/tutorials/02_Tengine快速入门.md:
--------------------------------------------------------------------------------
# Tengine Quick Start

Tengine is a lightweight deep neural network inference engine. This document walks you through Tengine on an x86 Linux platform, using a classification model (SqueezeNet) as the example.

## The deep-learning inference workflow

**Concepts**

- `Neural network`: a neural network can be understood as a computation graph composed of operator nodes, such as Convolution, Pooling, and fully-connected (Fc) operators.

- `Neural network model`: a model produced by a deep-learning training framework (TensorFlow, Caffe, PyTorch, MXNet, etc.). A model carries two kinds of information:
  - the structure of the computation graph
  - the operators' weight data

**Workflow**

![inference](data/inference.png)

1. Load the model: obtain the network structure and the weights

2. Prepare the input data and feed it in

3. Run model inference

4. Fetch the output data

## Tengine SqueezeNet example
Following the inference workflow above, this example shows how to run SqueezeNet classification inference in Tengine.

1. Load the model
```cpp
/* load model */
graph_t graph = create_graph(NULL, "tengine", model_file);
```
`model_file` is a model file in Tengine format: "squeezenet.tmfile"

2. Prepare and feed the input data
```cpp
/* prepare input data */
tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
set_tensor_shape(input_tensor, dims, 4);
set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float));
```

3. Run model inference
```cpp
/* forward */
run_graph(graph, 1);
```
4. Fetch the output data
```cpp
/* get result */
tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
float* output_data = ( float* )get_tensor_buffer(output_tensor);
```
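Putting the four steps together: in a real program the calls above are bracketed by engine initialization, a one-time pre-run, and teardown, which the snippets omit. Condensed from the complete source file referenced below, the minimal call sequence is:
```cpp
init_tengine();                                             // engine init, before any graph call
graph_t graph = create_graph(NULL, "tengine", model_file);  // 1. load model

tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);    // 2. feed input
set_tensor_shape(input_tensor, dims, 4);
set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float));

prerun_graph_multithread(graph, opt);                       // allocate resources once
run_graph(graph, 1);                                        // 3. run inference

tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);  // 4. get result
float* output_data = ( float* )get_tensor_buffer(output_tensor);

postrun_graph(graph);                                       // teardown, in reverse order
destroy_graph(graph);
release_tengine();
```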
* Code:
  - The complete source file is [data/02_tengine_tutorial.cpp](data/02_tengine_tutorial.cpp); the code is short and clear.
  - It uses a few utility functions from [tengine_operations.h](https://github.com/OAID/Tengine/blob/tengine-lite/examples/common/tengine_operations.h).

* Build:
```
cd tutorials/data
cp -r /workspace/Tengine/examples/common .
mkdir build
cd build
cmake ..
make
```
* Run:
```
cd tutorials/data/build

# download the model and the test image
wget https://github.com/OAID/TengineModels/raw/main/images/cat.jpg
wget https://github.com/OAID/TengineModels/raw/main/tmfiles/squeezenet.tmfile
./02_tengine_tutorial
```
Result:
```
0.273198, 281
0.267550, 282
0.181006, 278
0.081798, 285
0.072406, 151
--------------------------------------
ALL TEST DONE
```
This is a classification network with 1000 classes (indices 0 to 999), each with a probability score; the run prints the top-5 scores and their indices.
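The top-5 printout comes from the `print_topk` helper used in the full source. For reference, a minimal top-k routine with the same behavior might look like the sketch below; the actual helper lives in Tengine's `tengine_operations.cpp` and may differ in detail:
```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// print the k highest scores and their class indices, highest first
void print_topk(const float* data, int size, int k)
{
    std::vector<std::pair<float, int>> scored;  // (score, class index)
    scored.reserve(size);
    for (int i = 0; i < size; i++)
        scored.push_back(std::make_pair(data[i], i));

    std::partial_sort(scored.begin(), scored.begin() + k, scored.end(),
                      [](const std::pair<float, int>& a, const std::pair<float, int>& b) {
                          return a.first > b.first;
                      });

    for (int i = 0; i < k; i++)
        printf("%f, %d\n", scored[i].first, scored[i].second);
}
```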
## More Tengine examples
More Tengine application examples live in [Tengine/examples](https://github.com/OAID/Tengine/tree/tengine-lite/examples):
- image classification
- facial landmark detection
- SSD object detection
- RetinaFace face detection
- YOLACT instance segmentation
- YOLOv3 object detection
- YOLOv4-tiny object detection
- OpenPose human pose estimation
- CRNN Chinese text recognition
--------------------------------------------------------------------------------
/doc/tutorials/03_Halide初体验.md:
--------------------------------------------------------------------------------
# A First Taste of Halide

Before digging into Halide, let's first experience a bit of its black magic.

Enter the AutoKernel Docker container (Halide's Python environment is already set up there) and run:
```
python data/03_halide_magic.py
```
which prints:
```
func_origin__ cost 0.510215 second
func_parallel cost 0.122265 second
```
The script evaluates a simple function, `func[x,y] = x + 10*y`, and compares the runtime of two versions:
- func_origin: the plain function
- func_parallel: the same function plus one Halide scheduling directive, `func.parallel(y,4)`, which parallelizes the y dimension with a parallelism of 4

As the result shows, the second version takes about a quarter of the time of the first.

This is the magic of Halide!

No low-level assembly expertise is needed: add one line of code and you get a respectable speedup.


## Halide language basics
To use Halide's scheduling directives, you first need enough of the Halide language to describe an operator's computation. The simple functions below demonstrate Halide's basic building blocks.

- `Var` (variable): think of it as a function's independent variable. To describe an image's pixels, for example, you need two variables, x and y, for the coordinates along the w and h dimensions.
- `Func` (function): like a mathematical function, it defines a computation. A complex computation can be decomposed into several smaller Funcs.

### Example 1
The function computed here is `func(x,y) = 10*y + x`. In Halide it reads:
* Python:
```python
import halide as hl

x, y = hl.Var("x"), hl.Var("y")
func = hl.Func("func")
func[x,y] = x + 10*y
```
* C++
```c++
#include "Halide.h"
using namespace Halide;

Var x("x"), y("y");
Func func("func");

func(x, y) = x + 10 * y;
```
`Func::realize` evaluates the function over the given domain and returns the numeric result. A Func is just-in-time compiled only when `realize` is called; before that, only the computation has been defined.

Inspect the result:

* Python:
```python
out = func.realize(3, 4) # width, height = 3,4

for j in range(out.height()):
    for i in range(out.width()):
        print("out[x=%i,y=%i]=%i"%(i,j,out[i,j]))
```
* C++
```c++
Buffer<int> out = func.realize(3, 4);

for (int j = 0; j < out.height(); j++) {
    for (int i = 0; i < out.width(); i++) {
        printf("out[x=%d,y=%d]=%d\n", i, j, out(i, j));
    }
}
```
The computed values are:
```
               width = 3
              x=0  x=1  x=2
             --------------
         y=0 |  0    1    2
height=4 y=1 | 10   11   12
         y=2 | 20   21   22
         y=3 | 30   31   32
```

The complete code is in [data/03_halide_basic.py](data/03_halide_basic.py) and can be run directly:
```
python data/03_halide_basic.py
```
You can also call `func.trace_stores()` to trace every value the function stores.
### Example 2
This example shows how to feed input data in and read output data out.
The complete code is in [data/03_halide_feed_data.py](data/03_halide_feed_data.py).

The function in this example:
```
B(x,y) = A(x,y) + 1
```
A is the input. You can define a Halide Buffer and feed a numpy array into it:
```python
# feed input
input_data = np.ones((4,4),dtype=np.uint8)
A = hl.Buffer(input_data)
```
Define the Func B:
```python
i,j = hl.Var("i"), hl.Var("j")
B = hl.Func("B")
B[i,j] = A[i,j] + 1
```
There are several ways to fetch the output:
```python
# 1
output = B.realize(4,4)
print("out: \n",np.asanyarray(output))
# 2
output = hl.Buffer(hl.UInt(8),[4,4])
B.realize(output)
print("out: \n",np.asanyarray(output))
# 3
output_data = np.empty(input_data.shape, dtype=input_data.dtype,order="F")
output = hl.Buffer(output_data)
B.realize(output)
print("out: \n",output_data)
```
You can run the complete code directly:
```
python data/03_halide_feed_data.py
```
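Example 1 showed both the Python and the C++ API; for completeness, the same feed-data flow in C++ looks roughly like the sketch below (written for this tutorial, not a file from the repo):
```c++
#include "Halide.h"
#include <cstdio>
using namespace Halide;

int main() {
    // feed input: allocate a buffer and fill it, like np.ones((4,4))
    Buffer<uint8_t> A(4, 4);
    A.fill(1);

    Var i("i"), j("j");
    Func B("B");
    B(i, j) = A(i, j) + 1;   // B(x,y) = A(x,y) + 1

    // realize into a pre-allocated buffer, like option 2 above
    Buffer<uint8_t> output(4, 4);
    B.realize(output);

    printf("out[0,0] = %d\n", output(0, 0));  // prints 2
    return 0;
}
```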
--------------------------------------------------------------------------------
/doc/tutorials/data/02_tengine_tutorial.cpp:
--------------------------------------------------------------------------------
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <string>

#include "tengine_c_api.h"
#include "tengine_operations.h"

const char* model_file = "squeezenet.tmfile";
const char* image_file = "cat.jpg";

using namespace std;

int main()
{
    // check files
    if(!check_file_exist(model_file) || !check_file_exist(image_file))
    {
        return -1;
    }

    int img_h = 227;
    int img_w = 227;
    float mean[3] = {104.007, 116.669, 122.679};
    float scale[3] = {1.f, 1.f, 1.f};

    /* set runtime options of Net */
    struct options opt;
    opt.num_thread = 1;
    opt.precision = TENGINE_MODE_FP32;
    opt.cluster = TENGINE_CLUSTER_ALL;

    /* load model */
    init_tengine();
    graph_t graph = create_graph(NULL, "tengine", model_file);

    /* prepare input data */
    int img_size = img_h * img_w * 3;
    int dims[] = {1, 3, img_h, img_w};
    float* input_data = ( float* )malloc(img_size * sizeof(float));
    get_input_data(image_file, input_data, img_h, img_w, mean, scale);
    tensor_t input_tensor = get_graph_input_tensor(graph, 0, 0);
    set_tensor_shape(input_tensor, dims, 4);
    set_tensor_buffer(input_tensor, input_data, img_size * sizeof(float));

    /* forward */
    prerun_graph_multithread(graph, opt);
    run_graph(graph, 1);

    /* get result */
    tensor_t output_tensor = get_graph_output_tensor(graph, 0, 0);
    float* output_data = ( float* )get_tensor_buffer(output_tensor);
    int output_size = get_tensor_buffer_size(output_tensor) / sizeof(float);

    /* after process */
    print_topk(output_data, output_size, 5);
    std::cout << "--------------------------------------\n";
    std::cout << "ALL TEST DONE\n";


    free(input_data);
    postrun_graph(graph);
    destroy_graph(graph);
    release_tengine();
    return 0;
}
--------------------------------------------------------------------------------
/doc/tutorials/data/03_halide_basic.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import halide as hl

x, y = hl.Var("x"), hl.Var("y")
func = hl.Func("func")

func[x,y] = x + 10*y
#func.trace_stores()

out = func.realize(3, 4) # width, height = 3,4

print("=============================")
for j in range(out.height()):
    for i in range(out.width()):
        print("out[x=%i,y=%i]=%i"%(i,j,out[i,j]))

print("Success!")
--------------------------------------------------------------------------------
/doc/tutorials/data/03_halide_feed_data.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import halide as hl
import numpy as np

w,h = 8,8

def addone():
    # feed input
    input_data = np.ones((4,4),dtype=np.uint8)
    A = hl.Buffer(input_data)

    i,j = hl.Var("i"), hl.Var("j")
    B = hl.Func("B")
    B[i,j] = A[i,j] + 1

    # output
    if 0:
        output = B.realize(4,4)
        print("out: \n",np.asanyarray(output))
    if 0:
        output = hl.Buffer(hl.UInt(8),[4,4])
        B.realize(output)
        print("out: \n",np.asanyarray(output))
    if 1:
        output_data = np.empty(input_data.shape, dtype=input_data.dtype,order="F")
        output = hl.Buffer(output_data)
        B.realize(output)
        print("out: \n",output_data)

addone()
--------------------------------------------------------------------------------
/doc/tutorials/data/03_halide_magic.py:
--------------------------------------------------------------------------------
#!/usr/bin/python3

import halide as hl
import time

def time_me(fn):
    def _wrapper(*args, **kwargs):
        start = time.time()  # wall-clock seconds (time.clock() was removed in Python 3.8)
        fn(*args, **kwargs)
        print("%s cost %.6f second"%(fn.__name__, float(time.time() - start)))
    return _wrapper


x, y = hl.Var("x"), hl.Var("y")

@time_me
def func_origin__(w,h):
    func = hl.Func("func")
    func[x,y] = x + 10*y
    out = func.realize(w, h)

@time_me
def func_parallel(w,h):
    func = hl.Func("func")
    func[x,y] = x + 10*y
    func.parallel(y,4)
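    # the single schedule line above splits y into chunks of 4 and runs the
    # chunks on Halide's thread pool; wall-clock time drops roughly 4x here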
    func.realize(w,h)


func_origin__(400,400)
func_parallel(400,400)

'''
Results:
func_origin__ cost 0.510215 second
func_parallel cost 0.122265 second
'''

print("Success!")
--------------------------------------------------------------------------------
/doc/tutorials/data/04_test_relu.cpp:
--------------------------------------------------------------------------------
#include "HalideBuffer.h"
#include <iostream>
#include "halide_relu.h"

int main(int argc, char **argv)
{
    int C = 1, W = 4, H = 4, N = 1;
    Halide::Runtime::Buffer<float> input_tensor(nullptr, W, H, C, N);
    Halide::Runtime::Buffer<float> output_tensor(nullptr, W, H, C, N);
    input_tensor.allocate();
    output_tensor.allocate();
    input_tensor.for_each_value([](float &x) {
        x = 2.0 * rand() / RAND_MAX - 1.0;
    });

    output_tensor.for_each_value([](float &x) {
        x = 2.0 * rand() / RAND_MAX - 1.0;
    });

    halide_relu(input_tensor, 0, output_tensor);

    printf("input:\n");
    for (int c = 0; c < input_tensor.dim(3).extent(); c++) {
        for (int z = 0; z < input_tensor.channels(); z++) {
            for (int y = 0; y < input_tensor.height(); y++) {
                for (int x = 0; x < input_tensor.width(); x++) {
                    std::cout << input_tensor(x, y, z, c) << " ";
                }
                std::cout << "\n";
            }
        }
    }
    return 0;
}
--------------------------------------------------------------------------------
/doc/tutorials/data/06_build.sh:
--------------------------------------------------------------------------------
if [ $# -lt 1 ]; then
    echo "[usage]: ./build.sh <step> (step=1,2,..,7)"
    echo "e.g. execute step3: ./build.sh 3"
    exit
fi
STEP=$1
echo "step = " ${STEP}
./${EXE_FILE} ${STEP}
--------------------------------------------------------------------------------
/doc/tutorials/data/06_gemm_optimization.cpp:
--------------------------------------------------------------------------------
#include "Halide.h"
#include <cblas.h>
#include <cstdio>
#include <cstdlib>
#include <sys/time.h>
using namespace Halide;
unsigned long get_cur_time(void)
{
    struct timeval tv;

    gettimeofday(&tv, NULL);

    return (tv.tv_sec * 1000000 + tv.tv_usec);
}
#ifndef MAX
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
#endif
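// Benchmark harness: init() fills the test matrices, maxerr() checks the
// Halide result against an OpenBLAS reference (cblas_sgemm), and main()
// times repeat_count runs of whichever schedule step argv[1] selects.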
static inline float *init(int size, int mode)
{
    srand(0); //set rand_seed
    int i;
    float *m = (float *)malloc(size * sizeof(float));
    for (i = 0; i < size; ++i) {
        if (mode == 0)
            m[i] = 0;
        else if (mode == 1)
            m[i] = 1;
        else if (mode == 2)
            m[i] = i % 8;
        else if (mode == 3)
            m[i] = (float)(rand()%4);
        else
            m[i] = (float)rand() / RAND_MAX;
    }
    return m;
}
void maxerr(float* pred, float* gt, int h,int w)
{
    float maxError = 0.f;

    for(int i=0; i< (h*w); i++){
        maxError = MAX(( float )fabs(gt[i] - pred[i]), maxError);
    }
    // printf("====================================\n");
    printf("err %.2f\t", maxError);
    // printf("====================================\n");
}

int main(int argc, char **argv) {
    if(argc<2)
    {
        printf("[usage] exe [step] \n");
        return 1;
    }
    int M= 640;
    int N= 640;
    int K= 640;
    printf("M N K = %3d %3d %3d\t",M,N,K);
    int debug=0;
    int repeat_count=50;

    int step = atoi(argv[1]);

    float* a = init(M*K,4);
    float* b = init(N*K,4);
    float* c = init(M*N,1);
    float* ct = init(M*N,2);

    Buffer<float> A(a,K,M);
    Buffer<float> B(b,N,K);
    Buffer<float> C(c,N,M);

    Var x,y,xy;
    Var xi,yi,xo,yo,yii;
    RDom k(0, K);
    Func gemm("gemm");

    //1: default
    if (step==1)
    {
        gemm(x, y) += A(k, y) * B(x, k);
    }
    //2: tile
    if(step==2)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo);
    }
    //3: tile + vectorize
    if(step==3)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo)
            .vectorize(xi, 8);
    }
    //4: tile + vectorize + parallel
    if(step==4)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo)
            .vectorize(xi, 8)
            .parallel(yo);
    }
    //5: tile + vectorize + parallel + unroll
    if(step==5)
    {
        gemm(x, y) += A(k, y) * B(x, k);
        gemm.update()
            .tile(x, y, xo, yo, xi, yi, 16, 8)
            .reorder(xi, yi, k, xo, yo)
            .vectorize(xi, 8)
            .parallel(yo)
            .unroll(xi)
            .unroll(yi,2);
    }
    //6: micro_kernel 4x16
    if(step==6)
    {
        Func prod;
        prod(x, y) += A(k, y) * B(x, k);
        gemm(x, y) = prod(x, y);

        gemm.tile(x, y, xi, yi, 16, 32)
            .fuse(x, y, xy).parallel(xy)
            .split(yi, yi, yii, 4)
            .vectorize(xi, 8)
            .unroll(xi)
            .unroll(yii);

        prod.compute_at(gemm, yi)
            .vectorize(x, 8).unroll(y);

        prod.update()
            .reorder(x, y, k)
            .vectorize(x, 8)
            .unroll(x)
            .unroll(y)
            .unroll(k, 2);
    }
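    // Step 7 below adds data packing on top of step 6: B is repacked into
    // 16-wide panels (Bs) so the micro-kernel reads it contiguously instead of
    // striding across full rows, improving cache-line reuse (see interleave.png).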
    // 7. interleave B
    if(step==7)
    {
        Func B_interleave("B"), Bs("Bs");
        Bs(x, y, xo) = B(xo * 16 + x, y);
        B_interleave(x, y) = Bs(x % 16, y, x / 16);

        Func prod;
        prod(x, y) += A(k, y) * B_interleave(x, k);
        gemm(x, y) = prod(x, y);

        gemm.tile(x, y, xi, yi, 16, 32)
            .fuse(x, y, xy).parallel(xy)
            .split(yi, yi, yii, 4)
            .vectorize(xi, 8)
            .unroll(xi)
            .unroll(yii);

        prod.compute_at(gemm, yi)
            .vectorize(x, 8).unroll(y);

        prod.update()
            .reorder(x, y, k)
            .vectorize(x, 8)
            .unroll(x)
            .unroll(y)
            .unroll(k, 2);

        Bs.compute_root()
            .split(y, yo, yi, 16)
            .reorder(x, yi, xo, yo)
            .unroll(x)
            .vectorize(yi).parallel(yo, 4);
    }
    gemm.output_buffer().dim(0).set_bounds(0, N).dim(1).set_bounds(0, M);
    gemm.realize(C);

    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, a, K, b, N, 0, ct, N);
    maxerr(c, ct, M,N);
    if(debug)
    {
        for (int j = 0; j < C.height(); j++) {
            for (int i = 0; i < C.width(); i++) {
                printf("%.1f ",C(i,j));
            }
            printf("\n");
        }
    }


    unsigned long t0, t1;
    float totalTime = 0;
    for (int i = 0; i < repeat_count; i++)
    {
        t0 = get_cur_time();
        gemm.realize(C);
        t1 = get_cur_time();
        totalTime += ((float)(t1 - t0) / 1000.);
    }
    printf("[rep %d] autokernel | blas \t%.4f ms \t",repeat_count, totalTime / repeat_count);


    totalTime = 0;
    for (int i = 0; i < repeat_count; i++)
    {
        t0 = get_cur_time();
        cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, 1, a, K, b, N, 0, ct, N);
        t1 = get_cur_time();
        totalTime += ((float)(t1 - t0) / 1000.);
    }
    printf("%.4f ms\n",totalTime / repeat_count);
    return 0;
}
--------------------------------------------------------------------------------
/doc/tutorials/data/CMakeLists.txt:
--------------------------------------------------------------------------------
project(tengine_tutorial)
cmake_minimum_required(VERSION 3.10)

# Tengine path
set(TENGINE_ROOT /workspace/Tengine)

include_directories(./common)
include_directories(${TENGINE_ROOT}/include)
link_directories(${TENGINE_ROOT}/build/install/lib)

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
set(LINK_LIBS tengine-lite)
set(CMAKE_EXE_LINKER_FLAGS "-rdynamic -ldl")

add_executable(02_tengine_tutorial 02_tengine_tutorial.cpp common/tengine_operations.c)
target_link_libraries(02_tengine_tutorial ${LINK_LIBS})
--------------------------------------------------------------------------------
/doc/tutorials/data/gemm.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/gemm.jpg
--------------------------------------------------------------------------------
/doc/tutorials/data/inference.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/inference.png
--------------------------------------------------------------------------------
/doc/tutorials/data/interleave.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/interleave.png
--------------------------------------------------------------------------------
/doc/tutorials/data/memory.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/memory.png
--------------------------------------------------------------------------------
/doc/tutorials/data/plugin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/plugin.png
--------------------------------------------------------------------------------
/doc/tutorials/data/step6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OAID/AutoKernel/6ada93a743c26e3aae0372f7518433889dd0b437/doc/tutorials/data/step6.png
--------------------------------------------------------------------------------
/doc/tutorials/readme.md:
--------------------------------------------------------------------------------
# AutoKernel Tutorials

As AI technology advances, work that consumes large amounts of manual effort keeps getting automated, and developers' optimization work is no exception. Automation is the clear direction of travel, which makes mastering automation tools all the more important. Follow the AutoKernel tutorials and step into the field of automated optimization.

The AutoKernel tutorials are split into a beginner course and an advanced course; the beginner course is available now.

## Beginner course
After this course you will be able to:
- work comfortably in the AutoKernel development environment
- write basic Halide and use it to describe operators
- apply common scheduling strategies (Schedule)
- quickly integrate auto-optimized operators into the Tengine framework


Syllabus:
- 01: [AutoKernel development environment quick start](01_AutoKernel开发环境快速入门.md)
- 02: [Tengine quick start](02_Tengine快速入门.md)
- 03: [A first taste of Halide](03_Halide初体验.md)
- 04: [AutoKernel Plugin quick start](04_AutoKernel插件指南.md)
- 05: [Halide scheduling strategies (Schedule)](05_Halide调度策略Schedule.md)
- 06: [GEMM schedule optimization guide](06_GEMM调度策略优化指南.md)


Companion code:
- [02_tengine_tutorial.cpp](data/02_tengine_tutorial.cpp)
- [03_halide_basic.py](data/03_halide_basic.py)
- [03_halide_feed_data.py](data/03_halide_feed_data.py)
- [03_halide_magic.py](data/03_halide_magic.py)
- [04_test_relu.cpp](data/04_test_relu.cpp)
- [05_loop_schedule.py](data/05_loop_schedule.py)
- [06_gemm_optimization.cpp](data/06_gemm_optimization.cpp)

## Advanced course
(coming soon...)
--------------------------------------------------------------------------------