├── README.md ├── src ├── dajnorm.h ├── dajdef.h ├── dajfunc.h ├── dajmodel.h ├── dajnorm.cpp ├── dajdense.h ├── dajgemm.h ├── dajutil.h ├── dajconv.h ├── dajmodel.cpp ├── dajnn.cpp ├── dajnn.h ├── dajdense.cpp ├── dajtensor.h ├── dajutil.cpp ├── dajfunc.cpp ├── dajconv.cpp ├── dajgemm.cpp └── dajtensor.cpp ├── vsproj ├── main.cpp ├── dajnn.vcxproj.user ├── dajnn.sln ├── dajnn.vcxproj.filters └── dajnn.vcxproj └── paddle ├── paddle_lite_factory_helper.h ├── paddle_use_ops.h ├── paddle_use_passes.h ├── paddle_use_kernels.h ├── paddle_image_preprocess.h ├── paddle_place.h ├── paddle_api_2.h └── paddle_api.h /README.md: -------------------------------------------------------------------------------- 1 | 2 | # dajnn 3 | Customized C++ Deep Learning Framework (Multiplatform, Inference Only) 4 | 5 | Fast, optimized, portable, easy C++ inference framework for Deep Learning. 6 | -------------------------------------------------------------------------------- /src/dajnorm.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace norm { 8 | 9 | void batch_norm_with_precomputed(FTensor* tensor, FTensor* pc_gamma, FTensor* pc_beta); 10 | 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /vsproj/main.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajmodel.h" 3 | 4 | using namespace dajnn; 5 | 6 | int main(int argc, const char** argv) { 7 | Model* model = new Model("../../pymaster/duel/export/koni_p2_d5.daj"); 8 | delete model; 9 | return 0; 10 | } 11 | -------------------------------------------------------------------------------- /src/dajdef.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #ifdef _WIN32 5 | 6 | #ifdef _DEBUG 7 | #define TRACE_MEMORY_LEAK 8 | #endif 9 | 10 | #else // _WIN32 11 | 12 | #define LITE_WITH_ARM 13 | #define PADDLE 14 | #define PADDLE_THREADS 2 15 | #define PADDLE_CLS 1 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /src/dajfunc.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace func { 8 | 9 | void relu(FTensor* tensor); 10 | void tanh(FTensor* tensor); 11 | 12 | void scale(FTensor* tensor, FTensor* weight, FTensor* bias, bool is_first_batch_dim); 13 | void add(FTensor* dst, FTensor* oprd); 14 | 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/dajmodel.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajtensor.h" 5 | 6 | namespace dajnn { 7 | 8 | class Model { 9 | public: 10 | Model(); 11 | Model(ByteStream* stream); 12 | virtual ~Model(); 13 | 14 | public: 15 | uint length(); 16 | 17 | FTensor* get_f(uint idx); 18 | ITensor* get_i(uint idx); 19 | 20 | protected: 21 | Tensor** weights; 22 | uint weights_len; 23 | }; 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/dajnorm.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajnorm.h" 3 | #include "dajtensor.h" 4 | #include "dajfunc.h" 5 | 6 | #ifdef PADDLE 7 | #include "paddle_api_2.h" 8 | using namespace paddle::lite_api; 9 | #endif 10 | 
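// Note: batch_norm_with_precomputed() below reduces batch normalization to a
// per-channel affine transform via func::scale(). The "precomputed" tensors are
// presumably folded at export time from the usual BN statistics, i.e.
// pc_gamma = gamma / sqrt(var + eps) and pc_beta = beta - mean * pc_gamma,
// so no mean/variance arithmetic is left to do at inference time.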
11 | namespace dajnn { 12 | namespace norm { 13 | 14 | void batch_norm_with_precomputed(FTensor* tensor, FTensor* pc_gamma, FTensor* pc_beta) { 15 | func::scale(tensor, pc_gamma, pc_beta, true); 16 | } 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/dajdense.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace dense { 8 | 9 | /* 10 | full-connected layer 11 | @param input: 2-d tensor with shape (n, m) 12 | @param kernel: 2-d tensor with shape (m, p) 13 | @param bias: null or 1-d tensor with shape (p) 14 | @return: 2-d tensor with shape (n, p) 15 | */ 16 | FTensor* dense(FTensor* input, FTensor* kernel, FTensor* bias); 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/dajgemm.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | namespace dajnn { 5 | 6 | void gemm_bin(int M, int N, int K, float ALPHA, char* A, int lda, float* B, int ldb, float* C, int ldc); 7 | void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc); 8 | void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc); 9 | 10 | } 11 | -------------------------------------------------------------------------------- /vsproj/dajnn.vcxproj.user: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | $(OutDir) 5 | WindowsLocalDebugger 6 | 7 | 8 | $(OutDir) 9 | WindowsLocalDebugger 10 | 11 | -------------------------------------------------------------------------------- /src/dajutil.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajtensor.h" 5 | 6 | namespace dajnn { 7 | 8 | void log_i(const char* format, ...); 9 | void log_w(const char* format, ...); 10 | void log_d(const char* format, ...); 11 | void log_e(const char* format, ...); 12 | void log_x(const char* type_str, const char* format, va_list ap); 13 | 14 | void exit_if(bool condition, const char* format = nullptr, ...); 15 | 16 | float get_max(float* arr, uint len); 17 | float get_min(float* arr, uint len); 18 | 19 | int get_max(int* arr, uint len); 20 | int get_min(int* arr, uint len); 21 | 22 | uint get_span(vector* shape); 23 | string get_shape_str(vector* shape); 24 | string format_str(const char* format, ...); 25 | 26 | } 27 | -------------------------------------------------------------------------------- /src/dajconv.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | namespace conv { 8 | 9 | /* 10 | 2-d convolutional layer 11 | @param input: 4-d tensor with shape (n, c, h, w) 12 | @param kernel: 4-d tensor with shape (f, c, k_h, k_w) 13 | @param bias: null or 1-d tensor with shape (f) 14 | @param padding_x: padding sizes (-1 for auto, 0 for no padding) 15 | @param stride_x: strides 16 | @param dilation_x: dilations 17 | @return: 4-d tensor with shape (n, f, _h_, _w_) 18 | 19 | CAUTION: 20 | f (# of filters) must be >1 for mobile forward (paddle-lite's bug) 21 | dilation_x must be =1 for win32 forward (darknet's limitance) 22 | */ 23 | FTensor* conv2d(FTensor* input, FTensor* kernel, FTensor* bias = nullptr, 
24 | int padding_h = -1, int padding_w = -1, int stride_h = 1, int stride_w = 1, 25 | int dilation_h = 1, int dilation_w = 1); 26 | 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /vsproj/dajnn.sln: -------------------------------------------------------------------------------- 1 | 2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2013 4 | VisualStudioVersion = 12.0.21005.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dajnn", "dajnn.vcxproj", "{A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Win32 = Debug|Win32 11 | Release|Win32 = Release|Win32 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Debug|Win32.ActiveCfg = Debug|Win32 15 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Debug|Win32.Build.0 = Debug|Win32 16 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Release|Win32.ActiveCfg = Release|Win32 17 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Release|Win32.Build.0 = Release|Win32 18 | EndGlobalSection 19 | GlobalSection(SolutionProperties) = preSolution 20 | HideSolutionNode = FALSE 21 | EndGlobalSection 22 | EndGlobal 23 | -------------------------------------------------------------------------------- /src/dajmodel.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajmodel.h" 3 | #include "dajutil.h" 4 | 5 | namespace dajnn { 6 | 7 | Model::Model() { 8 | weights = nullptr; 9 | weights_len = 0; 10 | } 11 | 12 | Model::~Model() { 13 | if (weights) { 14 | for (uint i = 0; i < weights_len; ++i) { 15 | delete weights[i]; 16 | } 17 | delete[] weights; 18 | } 19 | } 20 | 21 | Model::Model(ByteStream* stream) : Model() { 22 | string header = stream->read_str(); 23 | 24 | if (header.compare(MODEL_HEADER)) { 25 | log_w("invalid model header : %s", header); 26 | return; 27 | } 28 | vector tensors; 29 | 30 | while (true) { 31 | string mode = stream->read_str(); 32 | 33 | if (!mode.compare("f")) { 34 | tensors.push_back(new FTensor(stream)); 35 | } else if (!mode.compare("i")) { 36 | tensors.push_back(new ITensor(stream)); 37 | } else if (!mode.compare(MODEL_FOOTER)) { 38 | break; 39 | } else { 40 | log_w("invalid tensor mode (%s) from model", mode.c_str()); 41 | } 42 | } 43 | weights_len = tensors.size(); 44 | weights = new Tensor*[weights_len]; 45 | 46 | for (uint i = 0; i < weights_len; ++i) { 47 | weights[i] = tensors[i]; 48 | } 49 | } 50 | 51 | uint Model::length() { 52 | return weights_len; 53 | } 54 | 55 | FTensor* Model::get_f(uint idx) { 56 | return (FTensor*) weights[idx]; 57 | } 58 | 59 | ITensor* Model::get_i(uint idx) { 60 | return (ITensor*) weights[idx]; 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/dajnn.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajnn.h" 3 | #include "dajutil.h" 4 | 5 | #ifdef PADDLE 6 | #include "paddle_api_2.h" 7 | using namespace paddle::lite_api; 8 | #endif 9 | 10 | namespace dajnn { 11 | 12 | #ifdef TRACE_MEMORY_LEAK 13 | vector _tensor_trace_pool_; 14 | vector _tensor_unique_indice_; 15 | uint _tensor_trace_len_ = 0; 16 | 17 | void push_tensor_trace(Tensor* tensor) { 18 | if (_tensor_trace_pool_.empty()) _tensor_trace_len_ = 0; 19 | 20 | _tensor_trace_pool_.push_back(tensor); 21 | 
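// each traced tensor is paired with a monotonically increasing unique index,
// so finish_dajnn() can report any leaked tensors by their creation order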
_tensor_unique_indice_.push_back(_tensor_trace_len_++); 22 | } 23 | 24 | uint pop_tensor_trace(Tensor* tensor) { 25 | uint i = 0; 26 | 27 | for (vector::iterator ti = _tensor_trace_pool_.begin(); 28 | ti != _tensor_trace_pool_.end(); ++ti, ++i) { 29 | if (tensor == *ti) break; 30 | } 31 | exit_if(i == _tensor_trace_pool_.size(), "cannot find tensor to pop from trace"); 32 | 33 | uint idx = _tensor_unique_indice_[i]; 34 | _tensor_unique_indice_.erase(_tensor_unique_indice_.begin() + i); 35 | _tensor_trace_pool_.erase(_tensor_trace_pool_.begin() + i); 36 | return idx; 37 | } 38 | 39 | vector get_leaked_tensor_indice() { 40 | return _tensor_unique_indice_; 41 | } 42 | #endif 43 | 44 | void init_dajnn() { 45 | #ifdef PADDLE 46 | paddle_DeviceInit(); 47 | #endif 48 | } 49 | 50 | void finish_dajnn() { 51 | #ifdef TRACE_MEMORY_LEAK 52 | if (!_tensor_unique_indice_.empty()) { 53 | string msg = "leaked tensors : "; 54 | 55 | for (vector::iterator idx = _tensor_unique_indice_.begin(); idx != _tensor_unique_indice_.end(); ++idx) { 56 | msg += format_str("%d ", *idx); 57 | } 58 | log_e("%s", msg); 59 | } 60 | #endif 61 | } 62 | 63 | } 64 | -------------------------------------------------------------------------------- /src/dajnn.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "dajdef.h" 16 | 17 | #ifdef PADDLE 18 | #include 19 | #include 20 | #endif 21 | 22 | using namespace std; 23 | 24 | namespace dajnn { 25 | 26 | #ifndef uchar 27 | #define uchar unsigned char 28 | #endif 29 | 30 | #ifndef ushort 31 | #define ushort unsigned short 32 | #endif 33 | 34 | #ifndef uint 35 | #define uint unsigned int 36 | #endif 37 | 38 | #if defined(_MSC_VER) && _MSC_VER < 1900 39 | #define inline __inline 40 | #endif 41 | 42 | #ifndef INT_MIN 43 | #define INT_MIN -2147483648 44 | #define INT_MAX 2147483647 45 | #endif 46 | 47 | #ifndef FLOAT_MIN 48 | #define FLOAT_MIN -1e10f 49 | #define FLOAT_MAX 1e10f 50 | #endif 51 | 52 | #ifndef SHRT_MIN 53 | #define SHRT_MIN -32768 54 | #define SHRT_MAX 32767 55 | #endif 56 | 57 | #ifndef MIN 58 | #define MIN(x, y) (((x) < (y)) ? (x) : (y)) 59 | #endif 60 | 61 | #ifndef MAX 62 | #define MAX(x, y) (((x) > (y)) ? (x) : (y)) 63 | #endif 64 | 65 | #define END_DIM 0xFFFFFFFF 66 | #define MAX_TENSOR_DIM 16 67 | #define MAX_MODEL_STR 256 68 | 69 | #define MODEL_HEADER "MRB_NN_DAJ_MODEL_V1_BEGIN" 70 | #define MODEL_FOOTER "MRB_NN_DAJ_MODEL_END" 71 | 72 | class Tensor; 73 | class ITensor; 74 | class FTensor; 75 | 76 | void init_dajnn(); 77 | void finish_dajnn(); 78 | 79 | #ifdef TRACE_MEMORY_LEAK 80 | void push_tensor_trace(Tensor* tensor); 81 | uint pop_tensor_trace(Tensor* tensor); 82 | 83 | vector get_leaked_tensor_indice(); 84 | #endif 85 | 86 | } 87 | -------------------------------------------------------------------------------- /paddle/paddle_lite_factory_helper.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /* 16 | * This file defines some MACROS that explicitly determine the op, kernel, mir 17 | * passes used in the inference lib. 18 | */ 19 | #pragma once 20 | 21 | // some platform-independent defintion 22 | 23 | #if defined(_WIN32) 24 | #define UNUSED 25 | #define __builtin_expect(EXP, C) (EXP) 26 | #else 27 | #define UNUSED __attribute__((unused)) 28 | #endif 29 | 30 | #define USE_LITE_OP(op_type__) \ 31 | extern int touch_op_##op_type__(); \ 32 | int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__(); 33 | 34 | #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \ 35 | extern int touch_##op_type__##target__##precision__##layout__##alias__(); \ 36 | int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \ 37 | UNUSED = touch_##op_type__##target__##precision__##layout__##alias__(); 38 | 39 | #define USE_MIR_PASS(name__) \ 40 | extern bool mir_pass_registry##name__##_fake(); \ 41 | static bool mir_pass_usage##name__ UNUSED = \ 42 | mir_pass_registry##name__##_fake(); 43 | 44 | #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__ 45 | -------------------------------------------------------------------------------- /src/dajdense.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajdense.h" 3 | #include "dajtensor.h" 4 | #include "dajutil.h" 5 | #include "dajgemm.h" 6 | 7 | #ifdef PADDLE 8 | #include "paddle_api_2.h" 9 | using namespace paddle::lite_api; 10 | #endif 11 | 12 | namespace dajnn { 13 | namespace dense { 14 | 15 | FTensor* dense(FTensor* input, FTensor* kernel, FTensor* bias) { 16 | exit_if(input->shape.size() != 2, "input dim of dense expects to be 2, but got %d", input->shape.size()); 17 | exit_if(kernel->shape.size() != 2, "kernel dim of dense expects to be 2, but got %d", kernel->shape.size()); 18 | exit_if(bias && (bias->shape.size() != 1), "bias dim of dense expects to be null or 1, but got %d", bias->shape.size()); 19 | 20 | uint n = input->shape[0]; 21 | uint m = input->shape[1]; 22 | #ifdef PADDLE 23 | uint p = kernel->shape[1]; 24 | bool shape_ok = kernel->shape[0] == m; 25 | #else 26 | uint p = kernel->shape[0]; 27 | bool shape_ok = kernel->shape[1] == m; 28 | #endif 29 | 30 | exit_if(!shape_ok, "dense input and kernel shapes mismatch : %s and %s", 31 | get_shape_str(&input->shape).c_str(), 32 | get_shape_str(&kernel->shape).c_str()); 33 | exit_if(bias && (bias->span != p), "dense kernel and bias shapes mismatch : %s and %s", 34 | get_shape_str(&kernel->shape).c_str(), 35 | get_shape_str(&bias->shape).c_str()); 36 | 37 | FTensor* output = new FTensor(n, p, END_DIM); 38 | #ifdef PADDLE 39 | paddle_matmul(n, p, m, input->val, kernel->val, output->val); 40 | 41 | if (bias) { 42 | float* op = output->val; 43 | 44 | for (uint i = 0; i < n; ++i, op += p) { 45 | paddle_elementwise_add(op, bias->val, op, p); 46 | } 47 | } 48 | #else 49 | memset(output->val, 0, 4 * output->span); 50 | gemm(0, 1, n, p, m, 1, input->val, m, kernel->val, m, 1, output->val, p); 51 | 52 | if (bias) { 53 | float* op = 
output->val; 54 | 55 | for (uint i = 0; i < n; ++i) { 56 | for (float* bp = bias->val; bp < bias->val + p; ++bp, ++op) { 57 | *op += *bp; 58 | } 59 | } 60 | } 61 | #endif 62 | return output; 63 | } 64 | 65 | } 66 | } -------------------------------------------------------------------------------- /src/dajtensor.h: -------------------------------------------------------------------------------- 1 | 2 | #pragma once 3 | 4 | #include "dajnn.h" 5 | 6 | namespace dajnn { 7 | 8 | class ByteStream { 9 | public: 10 | ByteStream(); 11 | ByteStream(const void* buff); 12 | ByteStream(FILE* fp); 13 | 14 | string read_str(); 15 | uint read(void* dst, int ele_size, int ele_count); 16 | void write(void* src, int ele_size, int ele_count); 17 | int seek(); 18 | 19 | private: 20 | const char* buff; 21 | FILE* fp; 22 | int pointer; 23 | }; 24 | 25 | class Tensor { 26 | public: 27 | Tensor(); 28 | virtual ~Tensor(); 29 | 30 | void reshape(vector* shape); 31 | void reshape(uint dim1, ...); 32 | bool is_shape(vector* shape); 33 | bool is_shape(uint dim1, ...); 34 | void set_releasable(bool releasable); 35 | 36 | public: 37 | vector shape; 38 | uint span; 39 | bool releasable; 40 | 41 | protected: 42 | void* _init_(Tensor* tensor, bool copy_val = true); 43 | void* _init_(vector* shape, void* val = nullptr, bool copy_val = true); 44 | void* _init_(void* val, bool copy_val, uint dim1, va_list ap); 45 | void* _init_(ByteStream* stream); 46 | 47 | void _read_meta_(ByteStream* stream); 48 | void _write_meta_(ByteStream* stream); 49 | void _write_val_(ByteStream* stream); 50 | void _save_(ByteStream* stream); 51 | 52 | protected: 53 | void* _val_; 54 | }; 55 | 56 | class ITensor : public Tensor { 57 | public: 58 | ITensor(); 59 | ITensor(ITensor* tensor, bool copy_val = true); 60 | ITensor(vector* shape, int* val = nullptr, bool copy_val = true); 61 | ITensor(int* val, bool copy_val, uint dim1, ...); 62 | ITensor(uint dim1, ...); 63 | ITensor(ByteStream* stream); 64 | 65 | void save(ByteStream* stream, bool compressed = false); 66 | int compare(ITensor* tensor); 67 | int compare(int* val, uint len); 68 | 69 | int get_max(); 70 | int get_min(); 71 | 72 | public: 73 | int* val; 74 | }; 75 | 76 | class FTensor : public Tensor { 77 | public: 78 | FTensor(); 79 | FTensor(FTensor* tensor, bool copy_val = true); 80 | FTensor(ITensor* tensor); 81 | FTensor(vector* shape, float* val = nullptr, bool copy_val = true); 82 | FTensor(float* val, bool copy_val, uint dim1, ...); 83 | FTensor(uint dim1, ...); 84 | FTensor(ByteStream* stream); 85 | 86 | void save(ByteStream* stream, bool compressed = false); 87 | void print(uint start = END_DIM, uint end = END_DIM); 88 | 89 | float compare(FTensor* tensor); 90 | float compare(float* val, uint len); 91 | 92 | float get_max(); 93 | float get_min(); 94 | 95 | public: 96 | float* val; 97 | }; 98 | 99 | } 100 | -------------------------------------------------------------------------------- /paddle/paddle_use_ops.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "paddle_lite_factory_helper.h" 3 | 4 | USE_LITE_OP(conv2d); 5 | USE_LITE_OP(depthwise_conv2d); 6 | USE_LITE_OP(unsqueeze); 7 | USE_LITE_OP(unsqueeze2); 8 | USE_LITE_OP(pool2d); 9 | USE_LITE_OP(fc); 10 | USE_LITE_OP(nearest_interp); 11 | USE_LITE_OP(bilinear_interp); 12 | USE_LITE_OP(batch_norm); 13 | USE_LITE_OP(sync_batch_norm); 14 | USE_LITE_OP(reduce_mean); 15 | USE_LITE_OP(layout); 16 | USE_LITE_OP(assign_value); 17 | USE_LITE_OP(scale); 18 | 
USE_LITE_OP(fusion_elementwise_sub_activation); 19 | USE_LITE_OP(fusion_elementwise_add_activation); 20 | USE_LITE_OP(fusion_elementwise_mul_activation); 21 | USE_LITE_OP(fusion_elementwise_max_activation); 22 | USE_LITE_OP(fusion_elementwise_min_activation); 23 | USE_LITE_OP(fusion_elementwise_div_activation); 24 | USE_LITE_OP(io_copy_once); 25 | USE_LITE_OP(concat); 26 | USE_LITE_OP(layout_once); 27 | USE_LITE_OP(multiclass_nms); 28 | USE_LITE_OP(multiclass_nms2); 29 | USE_LITE_OP(multiclass_nms3); 30 | USE_LITE_OP(density_prior_box); 31 | USE_LITE_OP(io_copy); 32 | USE_LITE_OP(shuffle_channel); 33 | USE_LITE_OP(elementwise_sub); 34 | USE_LITE_OP(elementwise_add); 35 | USE_LITE_OP(elementwise_mul); 36 | USE_LITE_OP(elementwise_max); 37 | USE_LITE_OP(elementwise_min); 38 | USE_LITE_OP(elementwise_div); 39 | USE_LITE_OP(elementwise_mod); 40 | USE_LITE_OP(elementwise_pow); 41 | USE_LITE_OP(grid_sampler); 42 | USE_LITE_OP(expand_as); 43 | USE_LITE_OP(instance_norm); 44 | USE_LITE_OP(pad2d); 45 | USE_LITE_OP(box_coder); 46 | USE_LITE_OP(sigmoid); 47 | USE_LITE_OP(tanh); 48 | USE_LITE_OP(relu); 49 | USE_LITE_OP(leaky_relu); 50 | USE_LITE_OP(relu6); 51 | USE_LITE_OP(prelu); 52 | USE_LITE_OP(thresholded_relu); 53 | USE_LITE_OP(elu); 54 | USE_LITE_OP(bilinear_interp_v2); 55 | USE_LITE_OP(nearest_interp_v2); 56 | USE_LITE_OP(fill_constant); 57 | USE_LITE_OP(softmax); 58 | USE_LITE_OP(split); 59 | USE_LITE_OP(subgraph); 60 | USE_LITE_OP(slice); 61 | USE_LITE_OP(cast); 62 | USE_LITE_OP(search_fc); 63 | USE_LITE_OP(prior_box); 64 | USE_LITE_OP(conv2d_transpose); 65 | USE_LITE_OP(depthwise_conv2d_transpose); 66 | USE_LITE_OP(squeeze); 67 | USE_LITE_OP(squeeze2); 68 | USE_LITE_OP(arg_max); 69 | USE_LITE_OP(affine_channel); 70 | USE_LITE_OP(fill_constant_batch_size_like); 71 | USE_LITE_OP(affine_grid); 72 | USE_LITE_OP(expand); 73 | USE_LITE_OP(feed); 74 | USE_LITE_OP(yolo_box); 75 | USE_LITE_OP(sequence_topk_avg_pooling); 76 | USE_LITE_OP(mul); 77 | USE_LITE_OP(reshape); 78 | USE_LITE_OP(reshape2); 79 | USE_LITE_OP(fetch); 80 | USE_LITE_OP(matmul); 81 | USE_LITE_OP(calib); 82 | USE_LITE_OP(transpose); 83 | USE_LITE_OP(transpose2); 84 | USE_LITE_OP(range); 85 | USE_LITE_OP(dropout); 86 | USE_LITE_OP(flatten); 87 | USE_LITE_OP(flatten2); 88 | USE_LITE_OP(flatten_contiguous_range); 89 | USE_LITE_OP(stack); 90 | USE_LITE_OP(lod_array_length); -------------------------------------------------------------------------------- /src/dajutil.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajutil.h" 3 | 4 | namespace dajnn { 5 | 6 | void log_i(const char* format, ...) { 7 | va_list ap; 8 | va_start(ap, format); 9 | log_x("[info]", format, ap); 10 | va_end(ap); 11 | } 12 | 13 | void log_w(const char* format, ...) { 14 | va_list ap; 15 | va_start(ap, format); 16 | log_x("[warn]", format, ap); 17 | va_end(ap); 18 | } 19 | 20 | void log_d(const char* format, ...) { 21 | va_list ap; 22 | va_start(ap, format); 23 | log_x("[debug]", format, ap); 24 | va_end(ap); 25 | } 26 | 27 | void log_e(const char* format, ...) 
{ 28 | va_list ap; 29 | va_start(ap, format); 30 | log_x("[error]", format, ap); 31 | va_end(ap); 32 | } 33 | 34 | void log_x(const char* type_str, const char* format, va_list ap) { 35 | char msg[1024]; 36 | #ifdef _WIN32 37 | sprintf_s(msg, 1024, format, ap); 38 | printf_s("%s %s\n", type_str, msg); 39 | #else 40 | vsprintf(msg, format, ap); 41 | __android_log_print(ANDROID_LOG_ERROR, type_str, msg) 42 | #endif 43 | } 44 | 45 | void exit_if(bool condition, const char* format, ...) { 46 | if (condition) { 47 | if (format) { 48 | va_list ap; 49 | va_start(ap, format); 50 | log_e(format, ap); 51 | va_end(ap); 52 | } 53 | exit(-1); 54 | } 55 | } 56 | 57 | float get_max(float* arr, uint len) { 58 | float m = FLOAT_MIN; 59 | 60 | for (float* ap = arr; ap < arr + len; ++ap) { 61 | if (m < *ap) m = *ap; 62 | } 63 | return m; 64 | } 65 | 66 | float get_min(float* arr, uint len) { 67 | float m = FLOAT_MAX; 68 | 69 | for (float* ap = arr; ap < arr + len; ++ap) { 70 | if (m > *ap) m = *ap; 71 | } 72 | return m; 73 | } 74 | 75 | int get_max(int* arr, uint len) { 76 | int m = INT_MIN; 77 | 78 | for (int* ap = arr; ap < arr + len; ++ap) { 79 | if (m < *ap) m = *ap; 80 | } 81 | return m; 82 | } 83 | 84 | int get_min(int* arr, uint len) { 85 | int m = INT_MAX; 86 | 87 | for (int* ap = arr; ap < arr + len; ++ap) { 88 | if (m > *ap) m = *ap; 89 | } 90 | return m; 91 | } 92 | 93 | uint get_span(vector* shape) { 94 | uint span = 1; 95 | 96 | for (vector::iterator dim = shape->begin(); dim != shape->end(); ++dim) { 97 | span *= *dim; 98 | } 99 | return span; 100 | } 101 | 102 | string get_shape_str(vector* shape) { 103 | string str = "("; 104 | 105 | for (uint i = 0; i < shape->size(); ++i) { 106 | if (i > 0) str += ","; 107 | str += format_str("%d", shape->at(i)); 108 | } 109 | str += ")"; 110 | return str; 111 | } 112 | 113 | string format_str(const char* format, ...) { 114 | char str[1024]; 115 | va_list ap; 116 | va_start(ap, format); 117 | #ifdef _WIN32 118 | sprintf_s(str, 1024, format, ap); 119 | #else 120 | vsprintf(str, format, ap); 121 | #endif 122 | va_end(ap); 123 | return string(str); 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/dajfunc.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajfunc.h" 3 | #include "dajtensor.h" 4 | #include "dajutil.h" 5 | 6 | #ifdef PADDLE 7 | #include "paddle_api_2.h" 8 | using namespace paddle::lite_api; 9 | #endif 10 | 11 | namespace dajnn { 12 | namespace func { 13 | 14 | void relu(FTensor* tensor) { 15 | #ifdef PADDLE 16 | paddle_act_relu(tensor->val, tensor->val, tensor->span, PADDLE_THREADS); 17 | #else 18 | for (float* v = tensor->val; v < tensor->val + tensor->span; ++v) { 19 | if (*v < 0) *v = 0; 20 | } 21 | #endif 22 | } 23 | 24 | void tanh(FTensor* tensor) { 25 | #ifdef PADDLE 26 | act_tanh(tensor->val, tensor->val, tensor->span, PADDLE_THREADS); 27 | #else 28 | for (float* v = tensor->val; v < tensor->val + tensor->span; ++v) { 29 | *v = 2.f / (1.f + expf(-2.f * *v)) - 1; 30 | } 31 | #endif 32 | } 33 | 34 | void scale(FTensor* tensor, FTensor* weight, FTensor* bias, bool is_first_batch_dim) { 35 | uint span = tensor->span; 36 | int num_batches = is_first_batch_dim ? tensor->shape[0] : 1; 37 | int num_channels = tensor->shape[is_first_batch_dim ? 
1 : 0]; 38 | int num_features = span / num_batches / num_channels; 39 | 40 | exit_if((weight->shape.size() != 1) || (weight->span != num_channels), 41 | "invalid scale weight shape with tensor shape : %s and %s", 42 | get_shape_str(&weight->shape).c_str(), 43 | get_shape_str(&tensor->shape).c_str()); 44 | exit_if(bias && ((bias->shape.size() != 1) || (bias->span != num_channels)), 45 | "invalid scale bias shape with weight shape : %s and %s", 46 | get_shape_str(&bias->shape).c_str(), 47 | get_shape_str(&weight->shape).c_str()); 48 | 49 | #ifdef PADDLE 50 | paddle::lite_api::scale(tensor->val, tensor->val, num_batches, num_channels, num_features, weight->val, bias ? bias->val : nullptr); 51 | #else 52 | float* v = tensor->val; 53 | 54 | if (bias) { 55 | for (int i = 0; i < num_batches; ++i) { 56 | float* w = weight->val; 57 | float* b = bias->val; 58 | 59 | for (int j = 0; j < num_channels; ++j, ++w, ++b) { 60 | for (int k = 0; k < num_features; ++k, ++v) { 61 | *v = *v * *w + *b; 62 | } 63 | } 64 | } 65 | } else { 66 | for (int i = 0; i < num_batches; ++i) { 67 | float* w = weight->val; 68 | 69 | for (int j = 0; j < num_channels; ++w) { 70 | for (int k = 0; k < num_features; ++k, ++v) { 71 | *v = *v * *w; 72 | } 73 | } 74 | } 75 | } 76 | #endif 77 | } 78 | 79 | void add(FTensor* dst, FTensor* oprd) { 80 | int span = dst->span; 81 | 82 | exit_if(!dst->is_shape(&oprd->shape), "shapes mismatch for add operation : %s and %s", 83 | get_shape_str(&dst->shape).c_str(), 84 | get_shape_str(&oprd->shape).c_str()); 85 | 86 | #ifdef PADDLE 87 | paddle_elementwise_add(dst->val, oprd->val, dst->val, span); 88 | #else 89 | float* v = dst->val; 90 | float* o = oprd->val; 91 | 92 | for (int i = 0; i < span; ++i, ++v, ++o) { 93 | *v += *o; 94 | } 95 | #endif 96 | } 97 | 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /vsproj/dajnn.vcxproj.filters: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | {fe6241e9-2e92-445d-abaa-fd89a812ded6} 6 | 7 | 8 | {625277b8-cb1e-43da-874e-7c76e1f7fef9} 9 | 10 | 11 | 12 | 13 | paddle 14 | 15 | 16 | paddle 17 | 18 | 19 | paddle 20 | 21 | 22 | paddle 23 | 24 | 25 | paddle 26 | 27 | 28 | paddle 29 | 30 | 31 | paddle 32 | 33 | 34 | paddle 35 | 36 | 37 | dajnn 38 | 39 | 40 | dajnn 41 | 42 | 43 | dajnn 44 | 45 | 46 | dajnn 47 | 48 | 49 | dajnn 50 | 51 | 52 | dajnn 53 | 54 | 55 | dajnn 56 | 57 | 58 | dajnn 59 | 60 | 61 | dajnn 62 | 63 | 64 | dajnn 65 | 66 | 67 | 68 | 69 | 70 | dajnn 71 | 72 | 73 | dajnn 74 | 75 | 76 | dajnn 77 | 78 | 79 | dajnn 80 | 81 | 82 | dajnn 83 | 84 | 85 | dajnn 86 | 87 | 88 | dajnn 89 | 90 | 91 | dajnn 92 | 93 | 94 | dajnn 95 | 96 | 97 | -------------------------------------------------------------------------------- /paddle/paddle_use_passes.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | #include "paddle_lite_factory_helper.h" // NOLINT 17 | 18 | USE_MIR_PASS(demo); 19 | USE_MIR_PASS(static_kernel_pick_pass); 20 | USE_MIR_PASS(variable_place_inference_pass); 21 | USE_MIR_PASS(type_target_cast_pass); 22 | USE_MIR_PASS(generate_program_pass); 23 | 24 | USE_MIR_PASS(io_copy_kernel_pick_pass); 25 | USE_MIR_PASS(argument_type_display_pass); 26 | USE_MIR_PASS(runtime_context_assign_pass); 27 | USE_MIR_PASS(graph_visualize_pass); 28 | 29 | USE_MIR_PASS(adaptive_1x1_pool2d_convert_global_pass); 30 | USE_MIR_PASS(remove_tf_redundant_ops_pass); 31 | USE_MIR_PASS(lite_conv_bn_fuse_pass); 32 | USE_MIR_PASS(lite_conv_conv_fuse_pass); 33 | USE_MIR_PASS(lite_squeeze2_matmul_fuse_pass); 34 | USE_MIR_PASS(lite_reshape2_matmul_fuse_pass); 35 | USE_MIR_PASS(lite_matmul_fuse_pass); 36 | USE_MIR_PASS(lite_fc_fuse_pass); 37 | USE_MIR_PASS(lite_shuffle_channel_fuse_pass); 38 | USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass); 39 | USE_MIR_PASS(lite_interpolate_fuse_pass); 40 | USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass); 41 | USE_MIR_PASS(identity_scale_eliminate_pass); 42 | USE_MIR_PASS(identity_dropout_eliminate_pass); 43 | USE_MIR_PASS(lite_conv_elementwise_fuse_pass); 44 | USE_MIR_PASS(lite_conv_activation_fuse_pass); 45 | USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass); 46 | USE_MIR_PASS(lite_match_matrix_activation_fuse_pass); 47 | USE_MIR_PASS(lite_scales_fuse_pass); 48 | USE_MIR_PASS(lite_sequence_reverse_embedding_fuse_pass); 49 | USE_MIR_PASS(lite_elementwise_activation_fuse_pass); 50 | USE_MIR_PASS(lite_quant_dequant_fuse_pass); 51 | USE_MIR_PASS(type_precision_cast_pass); 52 | USE_MIR_PASS(type_layout_cast_pass); 53 | USE_MIR_PASS(type_layout_cast_preprocess_pass); 54 | USE_MIR_PASS(memory_optimize_pass); 55 | USE_MIR_PASS(lite_reshape_fuse_pass); 56 | USE_MIR_PASS(multi_stream_analysis_pass); 57 | USE_MIR_PASS(elementwise_mul_constant_eliminate_pass) 58 | USE_MIR_PASS(npu_subgraph_pass); 59 | USE_MIR_PASS(huawei_ascend_npu_subgraph_pass); 60 | USE_MIR_PASS(imagination_nna_subgraph_pass); 61 | USE_MIR_PASS(xpu_subgraph_pass); 62 | USE_MIR_PASS(mlu_subgraph_pass); 63 | USE_MIR_PASS(mlu_postprocess_pass); 64 | USE_MIR_PASS(weight_quantization_preprocess_pass); 65 | USE_MIR_PASS(post_quant_dynamic_pass); 66 | USE_MIR_PASS(apu_subgraph_pass); 67 | USE_MIR_PASS(quantized_op_attributes_inference_pass); 68 | USE_MIR_PASS(restrict_quantized_op_with_same_input_output_scale_pass); 69 | USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass) 70 | USE_MIR_PASS(lite_scale_activation_fuse_pass); 71 | USE_MIR_PASS(lite_instance_norm_activation_fuse_pass); 72 | USE_MIR_PASS(__xpu__resnet_fuse_pass); 73 | USE_MIR_PASS(__xpu__resnet_d_fuse_pass); 74 | USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass); 75 | USE_MIR_PASS(__xpu__multi_encoder_fuse_pass); 76 | USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass); 77 | USE_MIR_PASS(__xpu__fc_fuse_pass); 78 | USE_MIR_PASS(__xpu__mmdnn_fuse_pass); 79 | USE_MIR_PASS(__xpu__conv2d_fuse_pass); 80 | USE_MIR_PASS(__xpu__resblock_reduction_fuse_pass); 81 | USE_MIR_PASS(__xpu__resblock_normal_fuse_pass); 82 | USE_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass); 83 | USE_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass); 84 | USE_MIR_PASS(__xpu__sfa_head_moment_fuse_pass); 85 | USE_MIR_PASS(__xpu__softmax_topk_fuse_pass); 86 | USE_MIR_PASS(__xpu__multi_encoder_slice_link_fuse_pass); 87 | 
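// For clarity, a sketch of what one of these registration lines expands to,
// using the macros from paddle_lite_factory_helper.h (illustrative only):
//
//   USE_MIR_PASS(demo);
//     -> extern bool mir_pass_registrydemo_fake();
//        static bool mir_pass_usagedemo UNUSED = mir_pass_registrydemo_fake();
//
// Referencing the extern "fake" / "touch" symbol is what forces the linker to
// keep the translation unit where the pass (or op/kernel) registers itself.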
-------------------------------------------------------------------------------- /src/dajconv.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajdense.h" 3 | #include "dajtensor.h" 4 | #include "dajutil.h" 5 | #include "dajgemm.h" 6 | 7 | #ifdef PADDLE 8 | #include "paddle_api_2.h" 9 | using namespace paddle::lite_api; 10 | #endif 11 | 12 | namespace dajnn { 13 | namespace conv { 14 | 15 | #ifndef PADDLE 16 | float im2col_get_pixel(float* im, int height, int width, int channels, 17 | int row, int col, int channel, int pad_h, int pad_w) { 18 | 19 | row -= pad_h; 20 | col -= pad_w; 21 | 22 | if (row < 0 || col < 0 || row >= height || col >= width) return 0; 23 | return im[col + width * (row + height * channel)]; 24 | } 25 | 26 | void im2col_cpu(float* data_im, int channels, int height, int width, 27 | int kernel_h, int kernel_w, int stride_h, int stride_w, 28 | int pad_h, int pad_w, float* data_col) { 29 | 30 | int c, h, w; 31 | int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1; 32 | int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1; 33 | 34 | int ksize = kernel_h * kernel_w; 35 | int channels_col = channels * ksize; 36 | 37 | for (c = 0; c < channels_col; ++c) { 38 | int w_offset = c % kernel_w; 39 | int h_offset = (c / kernel_w) % kernel_h; 40 | int c_im = c / ksize; 41 | 42 | for (h = 0; h < height_col; ++h) { 43 | for (w = 0; w < width_col; ++w) { 44 | int im_row = h_offset + h * stride_h; 45 | int im_col = w_offset + w * stride_w; 46 | int col_index = (c * height_col + h) * width_col + w; 47 | 48 | data_col[col_index] = im2col_get_pixel( 49 | data_im, height, width, channels, 50 | im_row, im_col, c_im, pad_h, pad_w); 51 | } 52 | } 53 | } 54 | } 55 | #endif 56 | 57 | FTensor* conv2d(FTensor* input, FTensor* kernel, FTensor* bias, 58 | int padding_h, int padding_w, int stride_h, int stride_w, 59 | int dilation_h, int dilation_w) { 60 | 61 | exit_if(input->shape.size() != 4, "input dim of conv2d expects to be 4, but got %d", input->shape.size()); 62 | exit_if(kernel->shape.size() != 4, "kernel dim of conv2d expects to be 4, but got %d", kernel->shape.size()); 63 | exit_if(bias && (bias->shape.size() != 1), "bias dim of conv2d expects to be null or 1, but got %d", bias->shape.size()); 64 | 65 | int num_batches = input->shape[0]; 66 | int num_channels = input->shape[1]; 67 | int h = input->shape[2]; 68 | int w = input->shape[3]; 69 | int num_filters = kernel->shape[0]; 70 | int kernel_h = kernel->shape[2]; 71 | int kernel_w = kernel->shape[3]; 72 | 73 | exit_if(kernel->shape[1] != num_channels, "second dim of conv2d kernel (# of channels) expects to be %d, but got %d", num_channels, kernel->shape[1]); 74 | exit_if(bias->span != num_filters, "span of conv2d bias (# of filters) expects to be %d, but got %d", num_filters, bias->span); 75 | 76 | if (padding_h < 0) padding_h = (kernel_h - 1) * dilation_h / 2; 77 | if (padding_w < 0) padding_w = (kernel_w - 1) * dilation_w / 2; 78 | 79 | int _h_ = (h + 2 * padding_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1; 80 | int _w_ = (w + 2 * padding_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1; 81 | 82 | FTensor* output = new FTensor(num_batches, num_filters, _h_, _w_, END_DIM); 83 | 84 | int chw = input->span / num_batches; 85 | int _chw_ = output->span / num_batches; 86 | 87 | float* ip = input->val; 88 | float* op = output->val; 89 | 90 | #ifdef PADDLE 91 | exit_if(num_filters == 1, "single conv2d filter is not available for mobile forward"); 92 | 93 | 
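// mobile path: hand the NCHW buffers to the Paddle-Lite ARM convolution wrapped
// by paddle_api_2.h; PADDLE_CLS and PADDLE_THREADS come from dajdef.h, and the
// data layout matches the im2col + gemm fallback below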
paddle_conv2d(num_batches, h, w, num_channels, ip, num_filters, kernel_h, kernel_w, kernel->val, 94 | op, bias ? bias->val : nullptr, padding_h, padding_w, dilation_h, dilation_w, 95 | stride_h, stride_w, 0, 0, PADDLE_CLS, PADDLE_THREADS); 96 | #else 97 | exit_if((dilation_h != 1) || (dilation_w != 1), "only single dilation is available for win32 forward"); 98 | memset(output->val, 0, 4 * output->span); 99 | 100 | int _hw_ = _h_ * _w_; 101 | int _kkc_ = kernel_h * kernel_w * num_channels; 102 | float* workspace = (float*) malloc(4 * _kkc_ * _hw_); 103 | 104 | for (int i = 0; i < num_batches; ++i, ip += chw, op += _chw_) { 105 | im2col_cpu(ip, num_channels, h, w, kernel_h, kernel_w, stride_h, stride_w, 106 | padding_h, padding_w, workspace); 107 | gemm(0, 0, num_filters, _hw_, _kkc_, 1, kernel->val, _kkc_, workspace, _hw_, 1, op, _hw_); 108 | } 109 | if (bias) { 110 | op = output->val; 111 | 112 | for (int i = 0; i < num_batches; ++i) { 113 | for (int j = 0; j < num_filters; ++j) { 114 | float b = bias->val[j]; 115 | 116 | for (int k = 0; k < _hw_; ++k) { 117 | op[i * _chw_ + j * _hw_ + k] += b; 118 | } 119 | } 120 | } 121 | } 122 | free(workspace); 123 | #endif 124 | return output; 125 | } 126 | 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /vsproj/dajnn.vcxproj: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Debug 6 | Win32 7 | 8 | 9 | Release 10 | Win32 11 | 12 | 13 | 14 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F} 15 | Win32Proj 16 | dajnn 17 | 18 | 19 | 20 | Application 21 | true 22 | v120_xp 23 | Unicode 24 | 25 | 26 | Application 27 | false 28 | v120_xp 29 | true 30 | Unicode 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | true 44 | $(SolutionDir)_bin\ 45 | $(SolutionDir)_obj\ 46 | 47 | 48 | false 49 | $(SolutionDir)_bin\ 50 | $(SolutionDir)_obj\ 51 | 52 | 53 | 54 | 55 | 56 | Level3 57 | Disabled 58 | WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) 59 | $(SolutionDir)..\src\dajnn;$(SolutionDir)paddle; 60 | /wd4996 %(AdditionalOptions) 61 | 62 | 63 | Console 64 | true 65 | 66 | 67 | 68 | 69 | Level3 70 | 71 | 72 | MaxSpeed 73 | true 74 | true 75 | WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions) 76 | $(SolutionDir)..\src\dajnn;$(SolutionDir)paddle; 77 | /wd4996 %(AdditionalOptions) 78 | 79 | 80 | Console 81 | true 82 | true 83 | true 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /src/dajgemm.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajgemm.h" 3 | #include "dajutil.h" 4 | 5 | namespace dajnn { 6 | 7 | void gemm_bin(int M, int N, int K, float ALPHA, char* A, int lda, float* B, int ldb, float* C, int ldc) { 8 | int i, j, k; 9 | 10 | for (i = 0; i < M; ++i) { 11 | for (k = 0; k < K; ++k) { 12 | char A_PART = A[i * lda + k]; 13 | 14 | if (A_PART) { 15 | for (j = 0; j < N; ++j) { 16 | C[i * ldc + j] += B[k * ldb + j]; 17 | } 18 | } else { 19 | for (j = 0; j < N; ++j) { 20 | C[i * ldc + j] -= B[k * ldb + j]; 21 | } 22 | } 23 | } 24 | } 25 | } 26 | 27 | void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc) { 28 | gemm_cpu(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc); 29 | } 
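// Usage sketch (illustrative only), following the row-major convention used
// elsewhere in this repo: dajdense.cpp computes output(n,p) = input(n,m) * kernel(p,m)^T as
//
//   memset(C, 0, 4 * n * p);                          // gemm_cpu() never applies BETA
//   gemm(0, 1, n, p, m, 1.f, A, m, B, m, 1.f, C, p);
//
// TA/TB select transposition of A/B; lda/ldb/ldc are the leading dimensions
// (row strides) of the row-major buffers. Results are accumulated into C, so C
// must be zeroed (or hold the values to accumulate onto) before the call.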
30 | 31 | //#if (defined(__AVX__) && defined(__x86_64__)) || defined(_WIN64) 32 | #if defined(__AVX__) || defined(_WIN64) 33 | #define OSXSAVEFlag (1UL << 27) 34 | #define AVXFlag ((1UL << 28) | OSXSAVEFlag) 35 | #define FMAFlag ((1UL << 12) | AVXFlag | OSXSAVEFlag) 36 | #define CLMULFlag ((1UL << 1) | AVXFlag | OSXSAVEFlag) 37 | #define VAESFlag ((1UL << 25) | AVXFlag | OSXSAVEFlag) 38 | 39 | #include 40 | 41 | //#ifdef _WIN64 42 | #ifdef _WIN32 43 | #include 44 | #include 45 | #include 46 | #include 47 | #else // Linux GCC/Clang 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | 54 | void asm_cpuid(uint32_t* abcd, uint32_t eax) { 55 | uint32_t ebx = 0, edx = 0, ecx = 0; 56 | 57 | // EBX is saved to EDI and later restored 58 | __asm__("movl %%ebx, %%edi;" 59 | "cpuid;" 60 | "xchgl %%ebx, %%edi;" 61 | : "=D"(ebx), 62 | "+a"(eax), "+c"(ecx), "=d"(edx)); 63 | 64 | abcd[0] = eax; 65 | abcd[1] = ebx; 66 | abcd[2] = ecx; 67 | abcd[3] = edx; 68 | } 69 | #endif 70 | 71 | int simd_detect_x86(unsigned int idFeature) { 72 | uint32_t regs[4]; // EAX, EBX, ECX, EDX; 73 | #ifdef _WIN32 74 | __cpuid((int*) regs, 0); 75 | if (regs[0] > 1U) __cpuid((int*) regs, 1); 76 | #else 77 | __get_cpuid(0, ®s[0], ®s[1], ®s[2], ®s[3]); 78 | if (regs[0] > 1U) __get_cpuid(1, ®s[0], ®s[1], ®s[2], ®s[3]); 79 | #endif 80 | if ((regs[2] & idFeature) != idFeature) return 0; 81 | return 1; 82 | } 83 | 84 | int is_fma_avx() { 85 | static int result = -1; 86 | 87 | if (result == -1) { 88 | result = simd_detect_x86(AVXFlag); 89 | 90 | if (result == 1) { 91 | log_i(" used AVX"); 92 | } else { 93 | log_i(" not used AVX"); 94 | } 95 | } 96 | return result; 97 | } 98 | 99 | void gemm_nn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 100 | int i, j, k; 101 | 102 | if (is_fma_avx() == 1) { // AVX 103 | for (i = 0; i < M; ++i) { 104 | for (k = 0; k < K; ++k) { 105 | float A_PART = ALPHA * A[i * lda + k]; 106 | __m256 a256, b256, c256, result256; // AVX 107 | a256 = _mm256_set1_ps(A_PART); 108 | 109 | for (j = 0; j < N - 8; j += 8) { 110 | b256 = _mm256_loadu_ps(&B[k * ldb + j]); 111 | c256 = _mm256_loadu_ps(&C[i * ldc + j]); 112 | 113 | // FMA - Intel Haswell (2013), AMD Piledriver (2012) 114 | result256 = _mm256_fmadd_ps(a256, b256, c256); 115 | //result256 = _mm256_mul_ps(a256, b256); 116 | //result256 = _mm256_add_ps(result256, c256); 117 | 118 | _mm256_storeu_ps(&C[i * ldc + j], result256); 119 | } 120 | int prev_end = (N % 8 == 0) ? (N - 8) : (N / 8) * 8; 121 | 122 | for (j = prev_end; j < N; ++j) 123 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 124 | } 125 | } 126 | } else { 127 | for (i = 0; i < M; ++i) { 128 | for (k = 0; k < K; ++k) { 129 | register float A_PART = ALPHA * A[i * lda + k]; 130 | 131 | for (j = 0; j < N; ++j) { 132 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 133 | } 134 | /* // SSE 135 | __m128 a128, b128, c128, result128; // SSE 136 | a128 = _mm_set1_ps(A_PART); 137 | for (j = 0; j < N - 4; j += 4) { 138 | b128 = _mm_loadu_ps(&B[k*ldb + j]); 139 | c128 = _mm_loadu_ps(&C[i*ldc + j]); 140 | //result128 = _mm_fmadd_ps(a128, b128, c128); 141 | result128 = _mm_mul_ps(a128, b128); 142 | result128 = _mm_add_ps(result128, c128); 143 | _mm_storeu_ps(&C[i*ldc + j], result128); 144 | } 145 | 146 | int prev_end = (N % 4 == 0) ? 
(N - 4) : (N / 4) * 4; 147 | for (j = prev_end; j < N; ++j){ 148 | C[i*ldc + j] += A_PART*B[k*ldb + j]; 149 | } 150 | */ 151 | } 152 | } 153 | } 154 | } 155 | #else 156 | 157 | void gemm_nn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 158 | int i, j, k; 159 | 160 | for (i = 0; i < M; ++i) { 161 | for (k = 0; k < K; ++k) { 162 | register float A_PART = ALPHA * A[i * lda + k]; 163 | 164 | for (j = 0; j < N; ++j) { 165 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 166 | } 167 | } 168 | } 169 | } 170 | 171 | #endif // __x86_64 172 | 173 | void gemm_nt(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 174 | int i, j, k; 175 | 176 | for (i = 0; i < M; ++i) { 177 | for (j = 0; j < N; ++j) { 178 | register float sum = 0; 179 | 180 | for (k = 0; k < K; ++k) { 181 | sum += ALPHA * A[i * lda + k] * B[j * ldb + k]; 182 | } 183 | C[i * ldc + j] += sum; 184 | } 185 | } 186 | } 187 | 188 | void gemm_tn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 189 | int i, j, k; 190 | 191 | for (i = 0; i < M; ++i) { 192 | for (k = 0; k < K; ++k) { 193 | register float A_PART = ALPHA * A[k * lda + i]; 194 | 195 | for (j = 0; j < N; ++j) { 196 | C[i * ldc + j] += A_PART * B[k * ldb + j]; 197 | } 198 | } 199 | } 200 | } 201 | 202 | void gemm_tt(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) { 203 | int i, j, k; 204 | 205 | for (i = 0; i < M; ++i) { 206 | for (j = 0; j < N; ++j) { 207 | register float sum = 0; 208 | 209 | for (k = 0; k < K; ++k) { 210 | sum += ALPHA * A[i + k * lda] * B[k + j * ldb]; 211 | } 212 | C[i * ldc + j] += sum; 213 | } 214 | } 215 | } 216 | 217 | void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc) { 218 | int i, j; 219 | 220 | /*for (i = 0; i < M; ++i) { 221 | for (j = 0; j < N; ++j) { 222 | C[i * ldc + j] *= BETA; 223 | } 224 | }*/ 225 | int t; 226 | 227 | #pragma omp parallel for 228 | for (t = 0; t < M; ++t) { 229 | if (!TA && !TB) { 230 | gemm_nn(1, N, K, ALPHA, A + t * lda, lda, B, ldb, C + t * ldc, ldc); 231 | } else if (TA && !TB) { 232 | gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t * ldc, ldc); 233 | } else if (!TA && TB) { 234 | gemm_nt(1, N, K, ALPHA, A + t * lda, lda, B, ldb, C + t * ldc, ldc); 235 | } else { 236 | gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t * ldc, ldc); 237 | } 238 | } 239 | } 240 | 241 | } 242 | -------------------------------------------------------------------------------- /paddle/paddle_use_kernels.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include "paddle_lite_factory_helper.h" 3 | 4 | USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def); 5 | USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def); 6 | USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def); 7 | USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def); 8 | USE_LITE_KERNEL(unsqueeze, kHost, kAny, kAny, def); 9 | USE_LITE_KERNEL(unsqueeze2, kHost, kAny, kAny, def); 10 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def); 11 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, bool_slice); 12 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, int32_slice); 13 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def_int64); 14 | USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def); 15 | USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); 16 | USE_LITE_KERNEL(fill_constant, kHost, kAny, kNCHW, def); 17 | 
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); 18 | USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def); 19 | USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); 20 | USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def); 21 | USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def); 22 | USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def); 23 | USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def); 24 | USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def); 25 | USE_LITE_KERNEL(thresholded_relu, kARM, kFloat, kNCHW, def); 26 | USE_LITE_KERNEL(elu, kARM, kFloat, kNCHW, def); 27 | USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc); 28 | USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nhwc2nchw); 29 | USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nchw2nhwc); 30 | USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nhwc2nchw); 31 | USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc); 32 | USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw); 33 | USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc); 34 | USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw); 35 | USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); 36 | USE_LITE_KERNEL(split, kARM, kInt64, kNCHW, def); 37 | USE_LITE_KERNEL(concat, kARM, kAny, kNCHW, def); 38 | USE_LITE_KERNEL(expand, kHost, kFloat, kAny, def); 39 | USE_LITE_KERNEL(expand, kHost, kInt32, kAny, def); 40 | USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def); 41 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int32int32); 42 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int64int64); 43 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int64int32); 44 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int32int64); 45 | USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8); 46 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, int32_to_fp32); 47 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, int32_to_int64); 48 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, fp32_to_int32); 49 | USE_LITE_KERNEL(calib, kARM, kInt64, kNCHW, int64_to_fp32); 50 | USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32); 51 | USE_LITE_KERNEL(calib, kARM, kInt8, kNHWC, fp32_to_int8); 52 | USE_LITE_KERNEL(calib, kARM, kInt8, kNHWC, int8_to_fp32); 53 | USE_LITE_KERNEL(calib, kARM, kInt64, kNCHW, int64_to_int32); 54 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8); 55 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32); 56 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNHWC, fp32_to_int8); 57 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNHWC, int8_to_fp32); 58 | USE_LITE_KERNEL(calib_once, kARM, kInt64, kNCHW, int64_to_int32); 59 | USE_LITE_KERNEL(arg_max, kHost, kAny, kNCHW, fp32); 60 | USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); 61 | USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def); 62 | USE_LITE_KERNEL(multiclass_nms2, kHost, kFloat, kNCHW, def); 63 | USE_LITE_KERNEL(multiclass_nms3, kHost, kFloat, kNCHW, def); 64 | USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); 65 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); 66 | USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out); 67 | USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out); 68 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out); 69 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out); 70 | USE_LITE_KERNEL(box_coder, kHost, kFloat, kNCHW, def); 71 | USE_LITE_KERNEL(assign_value, kARM, kAny, kNCHW, def); 72 | USE_LITE_KERNEL(squeeze, kHost, kAny, kAny, def); 73 | USE_LITE_KERNEL(squeeze2, kHost, kAny, kAny, def); 74 | USE_LITE_KERNEL(relu_clipped, 
kARM, kFloat, kNCHW, def); 75 | USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def); 76 | USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def); 77 | USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def); 78 | USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def); 79 | USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def); 80 | USE_LITE_KERNEL(sqrt, kARM, kFloat, kNCHW, def); 81 | USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def); 82 | USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def); 83 | USE_LITE_KERNEL(hard_swish, kARM, kFloat, kNCHW, def); 84 | USE_LITE_KERNEL(reciprocal, kARM, kFloat, kNCHW, def); 85 | USE_LITE_KERNEL(abs, kARM, kFloat, kNCHW, def); 86 | USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def); 87 | USE_LITE_KERNEL(range, kARM, kInt32, kNCHW, def); 88 | USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); 89 | USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out); 90 | USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out); 91 | USE_LITE_KERNEL(grid_sampler, kARM, kFloat, kNCHW, def); 92 | USE_LITE_KERNEL(instance_norm, kARM, kFloat, kNCHW, def); 93 | USE_LITE_KERNEL(stack, kHost, kFloat, kAny, def); 94 | USE_LITE_KERNEL(stack, kHost, kInt32, kAny, def); 95 | USE_LITE_KERNEL(lod_array_length, kHost, kAny, kAny, def); 96 | USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); 97 | USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); 98 | USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def); 99 | USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); 100 | USE_LITE_KERNEL(scale, kARM, kInt32, kNCHW, def); 101 | USE_LITE_KERNEL(scale, kARM, kInt64, kNCHW, def); 102 | USE_LITE_KERNEL(arg_max, kARM, kAny, kNCHW, fp32); 103 | USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def); 104 | USE_LITE_KERNEL(range, kHost, kFloat, kAny, def); 105 | USE_LITE_KERNEL(range, kHost, kInt32, kAny, def); 106 | USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); 107 | USE_LITE_KERNEL(deformable_conv, kHost, kFloat, kNCHW, def); 108 | USE_LITE_KERNEL(affine_grid, kARM, kFloat, kNCHW, def); 109 | USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def); 110 | USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); 111 | USE_LITE_KERNEL(elementwise_add, kARM, kInt32, kNCHW, def); 112 | USE_LITE_KERNEL(elementwise_add, kARM, kInt64, kNCHW, def); 113 | USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def); 114 | USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def); 115 | USE_LITE_KERNEL(elementwise_sub, kARM, kInt32, kNCHW, def); 116 | USE_LITE_KERNEL(fusion_elementwise_sub_activation, kARM, kFloat, kNCHW, def); 117 | USE_LITE_KERNEL(elementwise_mul, kARM, kInt64, kNCHW, def); 118 | USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def); 119 | USE_LITE_KERNEL(elementwise_mul, kARM, kInt32, kNCHW, def); 120 | USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def); 121 | USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kInt64, kNCHW, def); 122 | USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def); 123 | USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def); 124 | USE_LITE_KERNEL(elementwise_min, kARM, kFloat, kNCHW, def); 125 | USE_LITE_KERNEL(fusion_elementwise_min_activation, kARM, kFloat, kNCHW, def); 126 | USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def); 127 | USE_LITE_KERNEL(elementwise_div, kARM, kInt32, kNCHW, def); 128 | USE_LITE_KERNEL(elementwise_div, kARM, kInt64, kNCHW, def); 129 | USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def); 130 | USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, 
def); 131 | USE_LITE_KERNEL(elementwise_pow, kARM, kFloat, kNCHW, def); 132 | USE_LITE_KERNEL(elementwise_pow, kARM, kInt32, kNCHW, def); 133 | USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def); 134 | USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def); 135 | USE_LITE_KERNEL(bilinear_interp_v2, kARM, kFloat, kNCHW, def); 136 | USE_LITE_KERNEL(nearest_interp_v2, kARM, kFloat, kNCHW, def); 137 | USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); 138 | USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def); 139 | USE_LITE_KERNEL(expand_as, kHost, kFloat, kAny, def); 140 | USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def); 141 | USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def); 142 | USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); 143 | USE_LITE_KERNEL(sync_batch_norm, kARM, kFloat, kNCHW, def); 144 | USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); 145 | USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def); 146 | USE_LITE_KERNEL(cast, kARM, kAny, kNCHW, def); 147 | USE_LITE_KERNEL(fill_constant_batch_size_like, kHost, kAny, kNCHW, def); 148 | USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def); 149 | USE_LITE_KERNEL(stack, kARM, kInt32, kNCHW, def); -------------------------------------------------------------------------------- /paddle/paddle_image_preprocess.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include "lite/api/paddle_api.h" 21 | #include "lite/api/paddle_place.h" 22 | 23 | namespace paddle { 24 | namespace lite { 25 | namespace utils { 26 | namespace cv { 27 | typedef paddle::lite_api::Tensor Tensor; 28 | typedef paddle::lite_api::DataLayoutType LayoutType; 29 | // color enum 30 | enum ImageFormat { 31 | RGBA = 0, 32 | BGRA, 33 | RGB, 34 | BGR, 35 | GRAY, 36 | NV21 = 11, 37 | NV12, 38 | }; 39 | // flip enum 40 | enum FlipParam { 41 | XY = -1, // flip along the XY axis 42 | X = 0, // flip along the X axis 43 | Y // flip along the Y axis 44 | }; 45 | // transform param 46 | typedef struct { 47 | int ih; // input height 48 | int iw; // input width 49 | int oh; // outpu theight 50 | int ow; // output width 51 | FlipParam flip_param; // flip, support x, y, xy 52 | float rotate_param; // rotate, support 90, 180, 270 53 | } TransParam; 54 | 55 | class ImagePreprocess { 56 | public: 57 | /* 58 | * init 59 | * param srcFormat: input image color 60 | * param dstFormat: output image color 61 | * param param: input image parameter, egs: input size 62 | */ 63 | ImagePreprocess(ImageFormat srcFormat, 64 | ImageFormat dstFormat, 65 | TransParam param); 66 | 67 | /* 68 | * image color convert 69 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), 70 | * BGR(RGB)and BGRA(RGBA) transform, 71 | * BGR(RGB)and RGB(BGR) transform, 72 | * BGR(RGB)and RGBA(BGRA) transform, 73 | * BGR(RGB) and GRAY transform, 74 | * BGRA(RGBA) and GRAY transform, 75 | * param src: input image data 76 | * param dst: output image data 77 | */ 78 | void image_convert(const uint8_t* src, uint8_t* dst); 79 | 80 | /* 81 | * image color convert 82 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), 83 | * BGR(RGB)and BGRA(RGBA) transform, 84 | * BGR(RGB)and RGB(BGR) transform, 85 | * BGR(RGB)and RGBA(BGRA) transform, 86 | * BGR(RGB)and GRAY transform, 87 | * BGRA(RGBA) and GRAY transform, 88 | * param src: input image data 89 | * param dst: output image data 90 | * param srcFormat: input image image format support: GRAY, NV12(NV21), 91 | * BGR(RGB) and BGRA(RGBA) 92 | * param dstFormat: output image image format, support GRAY, BGR(RGB) and 93 | * BGRA(RGBA) 94 | */ 95 | void image_convert(const uint8_t* src, 96 | uint8_t* dst, 97 | ImageFormat srcFormat, 98 | ImageFormat dstFormat); 99 | 100 | /* 101 | * image color convert 102 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA), 103 | * BGR(RGB)and BGRA(RGBA) transform, 104 | * BGR(RGB)and RGB(BGR) transform, 105 | * BGR(RGB)and RGBA(BGRA) transform, 106 | * BGR(RGB)and GRAY transform, 107 | * BGRA(RGBA) and GRAY transform, 108 | * param src: input image data 109 | * param dst: output image data 110 | * param srcFormat: input image image format support: GRAY, NV12(NV21), 111 | * BGR(RGB) and BGRA(RGBA) 112 | * param dstFormat: output image image format, support GRAY, BGR(RGB) and 113 | * BGRA(RGBA) 114 | * param srcw: input image width 115 | * param srch: input image height 116 | */ 117 | void image_convert(const uint8_t* src, 118 | uint8_t* dst, 119 | ImageFormat srcFormat, 120 | ImageFormat dstFormat, 121 | int srcw, 122 | int srch); 123 | 124 | /* 125 | * image resize, use bilinear method 126 | * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: 127 | * NV12, NV21), 3-channel(egs: BGR), 4-channel(egs: BGRA) 128 | * param src: input image data 129 | * param dst: output image data 130 | */ 131 | void image_resize(const uint8_t* src, uint8_t* dst); 132 | 
133 | /* 134 | image resize, use bilinear method 135 | * support image format: 1-channel image (egs: GRAY, 2-channel image (egs: 136 | NV12, NV21), 3-channel image(egs: BGR), 4-channel image(egs: BGRA) 137 | * param src: input image data 138 | * param dst: output image data 139 | * param srcw: input image width 140 | * param srch: input image height 141 | * param dstw: output image width 142 | * param dsth: output image height 143 | */ 144 | void image_resize(const uint8_t* src, 145 | uint8_t* dst, 146 | ImageFormat srcFormat, 147 | int srcw, 148 | int srch, 149 | int dstw, 150 | int dsth); 151 | 152 | /* 153 | * image Rotate 154 | * support 90, 180 and 270 Rotate process 155 | * color format support 1-channel image, 3-channel image and 4-channel image 156 | * param src: input image data 157 | * param dst: output image data 158 | */ 159 | void image_rotate(const uint8_t* src, uint8_t* dst); 160 | 161 | /* 162 | * image Rotate 163 | * support 90, 180 and 270 Rotate process 164 | * color format support 1-channel image, 3-channel image and 4-channel image 165 | * param src: input image data 166 | * param dst: output image data 167 | * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) 168 | * param srcw: input image width 169 | * param srch: input image height 170 | * param degree: Rotate degree, support 90, 180 and 270 171 | */ 172 | void image_rotate(const uint8_t* src, 173 | uint8_t* dst, 174 | ImageFormat srcFormat, 175 | int srcw, 176 | int srch, 177 | float degree); 178 | 179 | /* 180 | * image Flip 181 | * support X, Y and XY flip process 182 | * color format support 1-channel image, 3-channel image and 4-channel image 183 | * param src: input image data 184 | * param dst: output image data 185 | */ 186 | void image_flip(const uint8_t* src, uint8_t* dst); 187 | 188 | /* 189 | * image Flip 190 | * support X, Y and XY flip process 191 | * color format support 1-channel image, 3-channel image and 4-channel image 192 | * param src: input image data 193 | * param dst: output image data 194 | * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA) 195 | * param srcw: input image width 196 | * param srch: input image height 197 | * param flip_param: flip parameter, support X, Y and XY 198 | */ 199 | void image_flip(const uint8_t* src, 200 | uint8_t* dst, 201 | ImageFormat srcFormat, 202 | int srcw, 203 | int srch, 204 | FlipParam flip_param); 205 | 206 | /* 207 | * change image data to tensor data 208 | * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC 209 | * and 210 | * NCHW 211 | * param src: input image data 212 | * param dstTensor: output tensor data 213 | * param layout: output tensor layout,support NHWC and NCHW 214 | * param means: means of image 215 | * param scales: scales of image 216 | */ 217 | void image_to_tensor(const uint8_t* src, 218 | Tensor* dstTensor, 219 | LayoutType layout, 220 | float* means, 221 | float* scales); 222 | 223 | /* 224 | * change image data to tensor data 225 | * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC 226 | * and 227 | * NCHW 228 | * param src: input image data 229 | * param dstTensor: output tensor data 230 | * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA) 231 | * param srcw: input image width 232 | * param srch: input image height 233 | * param layout: output tensor layout,support NHWC and NCHW 234 | * param means: means of image 235 | * param scales: scales of image 236 | */ 237 | void image_to_tensor(const uint8_t* src, 238 
| Tensor* dstTensor, 239 | ImageFormat srcFormat, 240 | int srcw, 241 | int srch, 242 | LayoutType layout, 243 | float* means, 244 | float* scales); 245 | 246 | /* 247 | * image crop process 248 | * color format support 1-channel image, 3-channel image and 4-channel image 249 | * param src: input image data 250 | * param dst: output image data 251 | */ 252 | void image_crop(const uint8_t* src, 253 | uint8_t* dst, 254 | ImageFormat srcFormat, 255 | int srcw, 256 | int srch, 257 | int left_x, 258 | int left_y, 259 | int dstw, 260 | int dsth); 261 | 262 | private: 263 | ImageFormat srcFormat_; 264 | ImageFormat dstFormat_; 265 | TransParam transParam_; 266 | }; 267 | } // namespace cv 268 | } // namespace utils 269 | } // namespace lite 270 | } // namespace paddle 271 | -------------------------------------------------------------------------------- /paddle/paddle_place.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #pragma once 16 | #include 17 | #include 18 | 19 | // Generic helper definitions for shared library support 20 | #if defined _WIN32 || defined __CYGWIN__ 21 | #define PADDLE_LITE_HELPER_DLL_IMPORT __declspec(dllimport) 22 | #define PADDLE_LITE_HELPER_DLL_EXPORT __declspec(dllexport) 23 | #define PADDLE_LITE_HELPER_DLL_LOCAL 24 | #else 25 | #if __GNUC__ >= 4 26 | #define PADDLE_LITE_HELPER_DLL_IMPORT __attribute__((visibility("default"))) 27 | #define PADDLE_LITE_HELPER_DLL_EXPORT __attribute__((visibility("default"))) 28 | #else 29 | #define PADDLE_LITE_HELPER_DLL_IMPORT 30 | #define PADDLE_LITE_HELPER_DLL_EXPORT 31 | #endif 32 | #endif 33 | 34 | #ifdef LITE_ON_TINY_PUBLISH 35 | #define LITE_API PADDLE_LITE_HELPER_DLL_EXPORT 36 | #define LITE_API_IMPORT PADDLE_LITE_HELPER_DLL_IMPORT 37 | #else 38 | #define LITE_API 39 | #define LITE_API_IMPORT 40 | #endif 41 | 42 | namespace paddle { 43 | namespace lite_api { 44 | 45 | enum class TargetType : int { 46 | kUnk = 0, 47 | kHost = 1, 48 | kX86 = 2, 49 | kCUDA = 3, 50 | kARM = 4, 51 | kOpenCL = 5, 52 | kAny = 6, // any target 53 | kFPGA = 7, 54 | kNPU = 8, 55 | kXPU = 9, 56 | kBM = 10, 57 | kMLU = 11, 58 | kRKNPU = 12, 59 | kAPU = 13, 60 | kHuaweiAscendNPU = 14, 61 | kImaginationNNA = 15, 62 | NUM = 16, // number of fields. 63 | }; 64 | enum class PrecisionType : int { 65 | kUnk = 0, 66 | kFloat = 1, 67 | kInt8 = 2, 68 | kInt32 = 3, 69 | kAny = 4, // any precision 70 | kFP16 = 5, 71 | kBool = 6, 72 | kInt64 = 7, 73 | kInt16 = 8, 74 | kUInt8 = 9, 75 | kFP64 = 10, 76 | NUM = 11, // number of fields. 77 | }; 78 | enum class DataLayoutType : int { 79 | kUnk = 0, 80 | kNCHW = 1, 81 | kNHWC = 3, 82 | kImageDefault = 4, // for opencl image2d 83 | kImageFolder = 5, // for opencl image2d 84 | kImageNW = 6, // for opencl image2d 85 | kAny = 2, // any data layout 86 | NUM = 7, // number of fields. 
87 | }; 88 | 89 | typedef enum { 90 | LITE_POWER_HIGH = 0, 91 | LITE_POWER_LOW = 1, 92 | LITE_POWER_FULL = 2, 93 | LITE_POWER_NO_BIND = 3, 94 | LITE_POWER_RAND_HIGH = 4, 95 | LITE_POWER_RAND_LOW = 5 96 | } PowerMode; 97 | 98 | typedef enum { 99 | CL_TUNE_NONE = 0, 100 | CL_TUNE_RAPID = 1, 101 | CL_TUNE_NORMAL = 2, 102 | CL_TUNE_EXHAUSTIVE = 3 103 | } CLTuneMode; 104 | 105 | typedef enum { 106 | CL_PRECISION_AUTO = 0, 107 | CL_PRECISION_FP32 = 1, 108 | CL_PRECISION_FP16 = 2 109 | } CLPrecisionType; 110 | 111 | typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion; 112 | 113 | enum class ActivationType : int { 114 | kIndentity = 0, 115 | kRelu = 1, 116 | kRelu6 = 2, 117 | kPRelu = 3, 118 | kLeakyRelu = 4, 119 | kSigmoid = 5, 120 | kTanh = 6, 121 | kSwish = 7, 122 | kExp = 8, 123 | kAbs = 9, 124 | kHardSwish = 10, 125 | kReciprocal = 11, 126 | kThresholdedRelu = 12, 127 | kElu = 13, 128 | kHardSigmoid = 14, 129 | kLog = 15, 130 | kSigmoid_v2 = 16, 131 | kTanh_v2 = 17, 132 | NUM = 18, 133 | }; 134 | 135 | static size_t PrecisionTypeLength(PrecisionType type) { 136 | switch (type) { 137 | case PrecisionType::kFloat: 138 | return 4; 139 | case PrecisionType::kFP64: 140 | return 8; 141 | case PrecisionType::kUInt8: 142 | return 1; 143 | case PrecisionType::kInt8: 144 | return 1; 145 | case PrecisionType::kInt32: 146 | return 4; 147 | case PrecisionType::kInt64: 148 | return 8; 149 | case PrecisionType::kFP16: 150 | return 2; 151 | case PrecisionType::kInt16: 152 | return 2; 153 | default: 154 | return 0; 155 | } 156 | } 157 | 158 | enum class QuantType : int { 159 | QUANT_INT8, 160 | QUANT_INT16, 161 | }; 162 | 163 | template 164 | struct PrecisionTypeTrait { 165 | constexpr static PrecisionType Type() { return PrecisionType::kUnk; } 166 | }; 167 | 168 | #define _ForEachPrecisionTypeHelper(callback, cpp_type, precision_type) \ 169 | callback(cpp_type, ::paddle::lite_api::PrecisionType::precision_type); 170 | 171 | #define _ForEachPrecisionType(callback) \ 172 | _ForEachPrecisionTypeHelper(callback, bool, kBool); \ 173 | _ForEachPrecisionTypeHelper(callback, float, kFloat); \ 174 | _ForEachPrecisionTypeHelper(callback, double, kFP64); \ 175 | _ForEachPrecisionTypeHelper(callback, uint8_t, kUInt8); \ 176 | _ForEachPrecisionTypeHelper(callback, int8_t, kInt8); \ 177 | _ForEachPrecisionTypeHelper(callback, int16_t, kInt16); \ 178 | _ForEachPrecisionTypeHelper(callback, int, kInt32); \ 179 | _ForEachPrecisionTypeHelper(callback, int64_t, kInt64); 180 | 181 | #define DefinePrecisionTypeTrait(cpp_type, precision_type) \ 182 | template <> \ 183 | struct PrecisionTypeTrait { \ 184 | constexpr static PrecisionType Type() { return precision_type; } \ 185 | } 186 | 187 | _ForEachPrecisionType(DefinePrecisionTypeTrait); 188 | 189 | #undef _ForEachPrecisionTypeHelper 190 | #undef _ForEachPrecisionType 191 | #undef DefinePrecisionTypeTrait 192 | 193 | #define TARGET(item__) paddle::lite_api::TargetType::item__ 194 | #define PRECISION(item__) paddle::lite_api::PrecisionType::item__ 195 | #define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__ 196 | 197 | const std::string& ActivationTypeToStr(ActivationType act); 198 | 199 | const std::string& TargetToStr(TargetType target); 200 | 201 | const std::string& PrecisionToStr(PrecisionType precision); 202 | 203 | const std::string& DataLayoutToStr(DataLayoutType layout); 204 | 205 | const std::string& TargetRepr(TargetType target); 206 | 207 | const std::string& PrecisionRepr(PrecisionType precision); 208 | 209 | const std::string& 
DataLayoutRepr(DataLayoutType layout); 210 | 211 | // Get a set of all the elements represented by the target. 212 | std::set ExpandValidTargets(TargetType target = TARGET(kAny)); 213 | 214 | // Get a set of all the elements represented by the precision. 215 | std::set ExpandValidPrecisions( 216 | PrecisionType precision = PRECISION(kAny)); 217 | 218 | // Get a set of all the elements represented by the layout. 219 | std::set ExpandValidLayouts( 220 | DataLayoutType layout = DATALAYOUT(kAny)); 221 | 222 | /* 223 | * Place specifies the execution context of a Kernel or input/output for a 224 | * kernel. It is used to make the analysis of the MIR more clear and accurate. 225 | */ 226 | struct LITE_API Place { 227 | TargetType target{TARGET(kUnk)}; 228 | PrecisionType precision{PRECISION(kUnk)}; 229 | DataLayoutType layout{DATALAYOUT(kUnk)}; 230 | int16_t device{0}; // device ID 231 | 232 | Place() = default; 233 | Place(TargetType target, 234 | PrecisionType precision = PRECISION(kFloat), 235 | DataLayoutType layout = DATALAYOUT(kNCHW), 236 | int16_t device = 0) 237 | : target(target), precision(precision), layout(layout), device(device) {} 238 | 239 | bool is_valid() const { 240 | return target != TARGET(kUnk) && precision != PRECISION(kUnk) && 241 | layout != DATALAYOUT(kUnk); 242 | } 243 | 244 | size_t hash() const; 245 | 246 | bool operator==(const Place& other) const { 247 | return target == other.target && precision == other.precision && 248 | layout == other.layout && device == other.device; 249 | } 250 | 251 | bool operator!=(const Place& other) const { return !(*this == other); } 252 | 253 | friend bool operator<(const Place& a, const Place& b); 254 | 255 | std::string DebugString() const; 256 | }; 257 | 258 | } // namespace lite_api 259 | } // namespace paddle 260 | -------------------------------------------------------------------------------- /src/dajtensor.cpp: -------------------------------------------------------------------------------- 1 | 2 | #include "dajtensor.h" 3 | #include "dajutil.h" 4 | 5 | namespace dajnn { 6 | 7 | Tensor::Tensor() { 8 | _val_ = nullptr; 9 | span = 0; 10 | releasable = true; 11 | 12 | #ifdef TRACE_MEMORY_LEAK 13 | push_tensor_trace(this); 14 | #endif 15 | } 16 | 17 | Tensor::~Tensor() { 18 | if (releasable && _val_) free(_val_); 19 | 20 | #ifdef TRACE_MEMORY_LEAK 21 | pop_tensor_trace(this); 22 | #endif 23 | } 24 | 25 | void Tensor::reshape(vector* shape) { 26 | exit_if(span != get_span(shape), "unable to reshape from %s to %s", 27 | get_shape_str(&this->shape).c_str(), get_shape_str(shape).c_str()); 28 | this->shape = *shape; 29 | } 30 | 31 | void Tensor::reshape(uint dim1, ...) { 32 | if (dim1 == END_DIM) return; 33 | 34 | vector running_shape; 35 | running_shape.push_back(dim1); 36 | 37 | va_list ap; 38 | va_start(ap, dim1); 39 | uint adim = va_arg(ap, uint); 40 | 41 | while (adim != END_DIM) { 42 | running_shape.push_back(adim); 43 | adim = va_arg(ap, uint); 44 | } 45 | va_end(ap); 46 | reshape(&running_shape); 47 | } 48 | 49 | bool Tensor::is_shape(vector* shape) { 50 | if (this->shape.size() != shape->size()) return false; 51 | 52 | for (uint i = 0; i < shape->size(); ++i) { 53 | if (this->shape[i] != shape->at(i)) return false; 54 | } 55 | return true; 56 | } 57 | 58 | bool Tensor::is_shape(uint dim1, ...) 
{ 59 | if (dim1 != END_DIM) { 60 | if (shape.empty()) return false; 61 | if (shape[0] != dim1) return false; 62 | } else if (!shape.empty()) return false; 63 | 64 | va_list ap; 65 | va_start(ap, dim1); 66 | uint adim = va_arg(ap, uint); 67 | 68 | for (uint i = 1; i < shape.size(); ++i) { 69 | if (shape[i] != adim) return false; 70 | adim = va_arg(ap, uint); 71 | } 72 | if (adim != END_DIM) return false; 73 | va_end(ap); 74 | return true; 75 | } 76 | 77 | void Tensor::set_releasable(bool releasable) { 78 | this->releasable = releasable; 79 | } 80 | 81 | void* Tensor::_init_(Tensor* tensor, bool copy_val) { 82 | exit_if(!tensor, "cannot clone tensor from empty tensor"); 83 | shape = tensor->shape; 84 | span = get_span(&shape); 85 | 86 | if (copy_val) { 87 | _val_ = malloc(4 * span); 88 | memcpy(_val_, tensor->_val_, 4 * span); 89 | } else { 90 | _val_ = tensor->_val_; 91 | } 92 | return _val_; 93 | } 94 | 95 | void* Tensor::_init_(vector* shape, void* val, bool copy_val) { 96 | this->shape = *shape; 97 | span = get_span(shape); 98 | 99 | if (val && copy_val) { 100 | _val_ = malloc(4 * span); 101 | memcpy(_val_, val, 4 * span); 102 | } else if (!val) { 103 | _val_ = malloc(4 * span); 104 | } else { 105 | _val_ = val; 106 | } 107 | return _val_; 108 | } 109 | 110 | void* Tensor::_init_(void* val, bool copy_val, uint dim1, va_list ap) { 111 | if (dim1 == END_DIM) return nullptr; 112 | vector running_shape; 113 | 114 | running_shape.push_back(dim1); 115 | uint adim = va_arg(ap, uint); 116 | 117 | while (adim != END_DIM) { 118 | running_shape.push_back(adim); 119 | exit_if(running_shape.size() == MAX_TENSOR_DIM, 120 | "tensor shape with too many dimensions (%s) : did you forget to end with END_DIM?", 121 | get_shape_str(&running_shape).c_str()); 122 | adim = va_arg(ap, uint); 123 | } 124 | return _init_(&running_shape, val, copy_val); 125 | } 126 | 127 | void* Tensor::_init_(ByteStream* stream) { 128 | _read_meta_(stream); 129 | _val_ = malloc(4 * span); 130 | stream->read(_val_, 4, span); 131 | return _val_; 132 | } 133 | 134 | void Tensor::_read_meta_(ByteStream* stream) { 135 | unsigned char len = 0; 136 | stream->read(&len, 1, 1); 137 | 138 | for (unsigned char i = 0; i < len; ++i) { 139 | unsigned int dim = 0; 140 | stream->read(&dim, 4, 1); 141 | shape.push_back(dim); 142 | } 143 | span = get_span(&shape); 144 | } 145 | 146 | void Tensor::_write_meta_(ByteStream* stream) { 147 | unsigned char len = (unsigned char) shape.size(); 148 | stream->write(&len, 1, 1); 149 | 150 | for (unsigned char i = 0; i < len; ++i) { 151 | stream->write(&shape[i], 4, 1); 152 | } 153 | } 154 | 155 | void Tensor::_write_val_(ByteStream* stream) { 156 | stream->write(_val_, 4, span); 157 | } 158 | 159 | void Tensor::_save_(ByteStream* stream) { 160 | _write_meta_(stream); 161 | _write_val_(stream); 162 | } 163 | 164 | ITensor::ITensor() : Tensor() { 165 | val = nullptr; 166 | } 167 | 168 | ITensor::ITensor(ITensor* tensor, bool copy_val) : ITensor() { 169 | this->val = (int*) _init_(tensor, copy_val); 170 | } 171 | 172 | ITensor::ITensor(vector* shape, int* val, bool copy_val) : ITensor() { 173 | this->val = (int*) _init_(shape, val, copy_val); 174 | } 175 | 176 | ITensor::ITensor(int* val, bool copy_val, uint dim1, ...) : ITensor() { 177 | va_list ap; 178 | va_start(ap, dim1); 179 | this->val = (int*) _init_(val, copy_val, dim1, ap); 180 | va_end(ap); 181 | } 182 | 183 | ITensor::ITensor(uint dim1, ...) 
: ITensor() { 184 | va_list ap; 185 | va_start(ap, dim1); 186 | this->val = (int*) _init_(nullptr, false, dim1, ap); 187 | va_end(ap); 188 | }; 189 | 190 | ITensor::ITensor(ByteStream* stream) : ITensor() { 191 | char compressed = 0; 192 | stream->read(&compressed, 1, 1); 193 | 194 | if (compressed) { 195 | _read_meta_(stream); 196 | short sh = 0; 197 | _val_ = val = (int*) malloc(span * 4); 198 | 199 | for (int* vp = val; vp < val + span; ++vp) { 200 | stream->read(&sh, 2, 1); 201 | *vp = sh; 202 | } 203 | } else { 204 | val = (int*) _init_(stream); 205 | } 206 | } 207 | 208 | void ITensor::save(ByteStream* stream, bool compressed) { 209 | char flag = compressed ? 1 : 0; 210 | stream->write(&flag, 1, 1); 211 | 212 | if (compressed) { 213 | _write_meta_(stream); 214 | short sh = 0; 215 | 216 | for (int* vp = val; vp < val + span; ++vp) { 217 | sh = (short) *vp; 218 | stream->write(&sh, 2, 1); 219 | } 220 | } else { 221 | _save_(stream); 222 | } 223 | } 224 | 225 | int ITensor::compare(ITensor* tensor) { 226 | return compare(tensor->val, tensor->span); 227 | } 228 | 229 | int ITensor::compare(int* val, uint len) { 230 | uint comp_len = MIN(span, len); 231 | int max_abs = 0; 232 | int* vp1 = this->val; 233 | int* vp2 = val; 234 | 235 | for (uint i = 0; i < comp_len; ++i, ++vp1, ++vp2) { 236 | int d = abs(*vp1 - *vp2); 237 | if (d > max_abs) max_abs = d; 238 | } 239 | return max_abs; 240 | } 241 | 242 | int ITensor::get_max() { 243 | return dajnn::get_max(val, span); 244 | } 245 | 246 | int ITensor::get_min() { 247 | return dajnn::get_min(val, span); 248 | } 249 | 250 | FTensor::FTensor() : Tensor() { 251 | val = nullptr; 252 | } 253 | 254 | FTensor::FTensor(ITensor* tensor) { 255 | shape = tensor->shape; 256 | span = get_span(&shape); 257 | 258 | _val_ = val = (float*) malloc(span * 4); 259 | int* tp = tensor->val; 260 | 261 | for (float* vp = val; vp < val + span; ++vp, ++tp) { 262 | *vp = (float) *tp; 263 | } 264 | } 265 | 266 | FTensor::FTensor(FTensor* tensor, bool copy_val) : FTensor() { 267 | this->val = (float*) _init_(tensor, copy_val); 268 | } 269 | 270 | FTensor::FTensor(vector* shape, float* val, bool copy_val) : FTensor() { 271 | this->val = (float*) _init_(shape, val, copy_val); 272 | } 273 | 274 | FTensor::FTensor(float* val, bool copy_val, uint dim1, ...) : FTensor() { 275 | va_list ap; 276 | va_start(ap, dim1); 277 | this->val = (float*) _init_(val, copy_val, dim1, ap); 278 | va_end(ap); 279 | } 280 | 281 | FTensor::FTensor(uint dim1, ...) : FTensor() { 282 | va_list ap; 283 | va_start(ap, dim1); 284 | this->val = (float*) _init_(nullptr, false, dim1, ap); 285 | va_end(ap); 286 | } 287 | 288 | FTensor::FTensor(ByteStream* stream) : FTensor() { 289 | char compressed = 0; 290 | stream->read(&compressed, 1, 1); 291 | 292 | if (compressed) { 293 | _read_meta_(stream); 294 | 295 | float min_v = 0, max_v = 0; 296 | short sh = 0; 297 | 298 | stream->read(&min_v, 4, 1); 299 | stream->read(&max_v, 4, 1); 300 | _val_ = val = (float*) malloc(span * 4); 301 | 302 | for (float* vp = val; vp < val + span; ++vp) { 303 | stream->read(&sh, 2, 1); 304 | *vp = min_v + (max_v - min_v) * (1 + (float) sh / SHRT_MAX) / 2; 305 | } 306 | } else { 307 | val = (float*) _init_(stream); 308 | } 309 | } 310 | 311 | void FTensor::save(ByteStream* stream, bool compressed) { 312 | char flag = compressed ? 
1 : 0; 313 | stream->write(&flag, 1, 1); 314 | 315 | if (compressed) { 316 | _write_meta_(stream); 317 | 318 | float min_v = get_min(); 319 | float max_v = get_max(); 320 | short sh = 0; 321 | 322 | stream->write(&min_v, 4, 1); 323 | stream->write(&max_v, 4, 1); 324 | 325 | for (float* vp = val; vp < val + span; ++vp) { 326 | sh = (short) ((2 * (*vp - min_v) / (max_v - min_v) - 1) * SHRT_MAX); 327 | stream->write(&sh, 2, 1); 328 | } 329 | } else { 330 | _save_(stream); 331 | } 332 | } 333 | 334 | void FTensor::print(uint start, uint end) { 335 | if (start == END_DIM) start = 0; 336 | if (end == END_DIM) end = span; 337 | 338 | for (uint i = start; i < end; ++i) { 339 | printf("%.8f,", val[i]); 340 | } 341 | } 342 | 343 | float FTensor::compare(FTensor* tensor) { 344 | return compare(tensor->val, tensor->span); 345 | } 346 | 347 | float FTensor::compare(float* val, uint len) { 348 | uint comp_len = MIN(span, len); 349 | float max_abs = 0; 350 | float* vp1 = this->val; 351 | float* vp2 = val; 352 | 353 | for (uint i = 0; i < comp_len; ++i, ++vp1, ++vp2) { 354 | float d = fabsf(*vp1 - *vp2); 355 | if (d > max_abs) max_abs = d; 356 | } 357 | return max_abs; 358 | } 359 | 360 | float FTensor::get_max() { 361 | return dajnn::get_max(val, span); 362 | } 363 | 364 | float FTensor::get_min() { 365 | return dajnn::get_min(val, span); 366 | } 367 | 368 | ByteStream::ByteStream() { 369 | buff = nullptr; 370 | fp = nullptr; 371 | pointer = 0; 372 | } 373 | 374 | ByteStream::ByteStream(const void* buff) : ByteStream() { 375 | this->buff = (const char*) buff; 376 | } 377 | 378 | ByteStream::ByteStream(FILE* fp) : ByteStream() { 379 | this->fp = fp; 380 | } 381 | 382 | string ByteStream::read_str() { 383 | string str; 384 | char t = 0; 385 | 386 | for (uint i = 0; i < MAX_MODEL_STR; ++i) { 387 | if (!read(&t, 1, 1)) break; 388 | if (!t) break; 389 | str += t; 390 | } 391 | return str; 392 | } 393 | 394 | uint ByteStream::read(void* dst, int ele_size, int ele_count) { 395 | if (buff) { 396 | int len = ele_size * ele_count; 397 | memcpy(dst, &buff[pointer], len); 398 | pointer += len; 399 | return ele_count; 400 | } else if (fp) { 401 | return (uint) fread(dst, ele_size, ele_count, fp); 402 | } else { 403 | return 0; 404 | } 405 | } 406 | 407 | void ByteStream::write(void* src, int ele_size, int ele_count) { 408 | if (buff) { 409 | int len = ele_size * ele_count; 410 | memcpy((char*) &buff[pointer], src, len); 411 | pointer += len; 412 | } else if (fp) { 413 | fwrite(src, ele_size, ele_count, fp); 414 | } 415 | } 416 | 417 | int ByteStream::seek() { 418 | return pointer; 419 | } 420 | 421 | } 422 | -------------------------------------------------------------------------------- /paddle/paddle_api_2.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
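// (Editorial sketch, not part of the original header.) The LITE_API functions
// declared below are thin raw-pointer wrappers over Paddle-Lite ARM math
// kernels: they take plain float buffers plus explicit sizes, and most take
// trailing `cls` and `ths` integers, which appear to select the CPU cluster
// and thread count. A minimal, hypothetical example of the calling style,
// using only the element-wise and activation wrappers whose semantics follow
// directly from their names and signatures:
//
//   float a[4]   = {1.f, -2.f, 3.f, -4.f};
//   float b[4]   = {0.5f, 0.5f, 0.5f, 0.5f};
//   float sum[4], act[4];
//   paddle_elementwise_add(a, b, sum, 4);          // sum[i] = a[i] + b[i]
//   paddle_act_relu(sum, act, 4, /*threads=*/1);   // act[i] = max(sum[i], 0)
//
// The dense and convolution wrappers (paddle_fccompute, paddle_conv2d, ...)
// follow the same raw-buffer style, with shapes passed as separate integers.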
14 | 15 | /* 16 | * This file defines PaddlePredictor, the api for lite. It supports multiple 17 | * hardware including ARM, X86, OpenCL, CUDA and so on. 18 | */ 19 | 20 | #ifndef PADDLE_LITE_API_2_H_ // NOLINT 21 | #define PADDLE_LITE_API_2_H_ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "paddle_place.h" // NOLINT 28 | #include "paddle_api.h" 29 | 30 | 31 | namespace paddle { 32 | namespace lite_api { 33 | 34 | #ifdef LITE_WITH_ARM 35 | 36 | LITE_API void paddle_DeviceInit(); 37 | LITE_API void paddle_clip_kernel_fp32(const float* input, int64_t num, float min, float max, float* output); 38 | LITE_API void paddle_elementwise_mul(const float* dinx, const float* diny, float* dout, int num); 39 | LITE_API void paddle_elementwise_div(const float* dinx, const float* diny, float* dout, int num); 40 | LITE_API void paddle_elementwise_add(const float* dinx, const float* diny, float* dout, int num); 41 | LITE_API void paddle_elementwise_sub(const float* dinx, const float* diny, float* dout, int num); 42 | LITE_API void paddle_elementwise_pow(const float* dinx, const float* diny, float* dout, int num); 43 | LITE_API void paddle_elementwise_max(const float* dinx, const float* diny, float* dout,int num); 44 | LITE_API void paddle_act_relu(const float* din, float* dout, int size, int threads); 45 | LITE_API void paddle_act_sigmoid(const float* din, float* dout, int size, int threads); 46 | LITE_API void act_tanh(const float* din, float* dout, int size, int threads); 47 | LITE_API void act_log(const float* din, float* dout, int size, int threads); 48 | LITE_API void act_exp(const float* din, float* dout, int size, int threads); 49 | 50 | // New added activate 51 | LITE_API void act_leakyrelu(const float* din, float* dout, int size, float alpha, int threads); 52 | LITE_API void act_sqrt(const float* din, float* dout, int size, int threads); 53 | LITE_API void act_softmax(const float* din, float* dout, int dims, int axis_num); 54 | 55 | 56 | LITE_API void scale(const float* din, float* dout, int num, float scale, float bias); 57 | LITE_API void scale(const int* din, int* dout, int num, int scale, int bias); 58 | LITE_API void scale(const float* din, 59 | float* dout, 60 | int outer_dim, 61 | int scale_dim, 62 | int inner_dim, 63 | const float* scale_data, 64 | const float* bias_data); 65 | LITE_API void scale(const float* din, 66 | float* dout, 67 | int outer_dim, 68 | int scale_dim, 69 | const float* scale_data, 70 | const float* bias_data); 71 | 72 | LITE_API void paddle_matmul(const int M, const int N, const int K, const float* X, const float* W, float* Y, int cls=0, int ths=1); 73 | LITE_API void paddle_fccompute(const int M, const int N, const int K, 74 | const float* X, const float* W, float* Y, 75 | const float* bias = nullptr, ActivationType activationtype = ActivationType::kIndentity, 76 | int cls=0, int ths=1); 77 | 78 | LITE_API void paddle_matmul_quantize(const int M, const int N, const int K, 79 | const int8_t* X, const float xscale, 80 | const int8_t* W, const float wscale, float* Y); 81 | 82 | LITE_API void paddle_conv1d( 83 | int batches, 84 | int channels, int xlen, float* indata, 85 | int filters, int kernelsize, float *kerneldata, 86 | float*outdata, float* bias=NULL, 87 | int padding = 0, int dilation = 1, int stride = 1, 88 | int flag_act = 0, float leaky_relu_scale = 0.1, 89 | int cls = 1, int ths = 2); 90 | 91 | LITE_API void paddle_conv2d( 92 | int batches, 93 | int x_h, int x_w, int channels, float* indata, 94 | int filters, int kernel_h, 
int kernel_w, float* kerneldata, 95 | float* outdata, float* bias=NULL, 96 | int padding_h = 0, int padding_w = 0, 97 | int dilation_h = 1, int dilation_w = 1, 98 | int stride_h = 1, int stride_w = 1, 99 | int flag_act = 0, float leaky_relu_scale = 0.1, 100 | int cls = 1, int ths = 2); 101 | 102 | LITE_API void paddle_conv( 103 | std::vectorindatashape, float* indata, 104 | std::vectorkernelshape, float *kerneldata, 105 | std::vectoroutdatashape, float*outdata, 106 | bool flag_bias, float*biasdata, 107 | std::vector pad, 108 | std::vector dilation, 109 | std::vector stride, 110 | int flag_act, float leaky_relu_scale, int cls, int ths); 111 | LITE_API void paddle_matrix_norm_row(const float* x_data, 112 | const float* scale_data, 113 | const float* bias_data, 114 | float* out_data, 115 | float* mean_out, 116 | float* var_out, 117 | float epsilon, 118 | int batch_size, 119 | int feature_size); 120 | LITE_API void paddle_mean_var(const float* x_data, 121 | float* mean_out, 122 | float* var_out, 123 | float epsilon, 124 | int batch_size, 125 | int feature_size); 126 | // LITE_API void paddle_conv1d_int( 127 | // int channels, int xlen, int8_t* indata, 128 | // int filters, int kernelsize, int8_t *kerneldata, 129 | // float *outdata, float* bias, float input_scale, float weight_scale, 130 | // int padding=0, int dilation=1, int stride=1, 131 | // int flag_act=0, float leaky_relu_scale=0.1, 132 | // int cls=0, int ths=1); 133 | 134 | // LITE_API void paddle_conv_int( 135 | // std::vectorindatashape, int8_t* indata, 136 | // std::vectorkernelshape, int8_t *kerneldata, 137 | // std::vectoroutdatashape, float*outdata, 138 | // bool flag_bias, float*biasdata,float input_scale, float weight_scale, 139 | // std::vector pad, 140 | // std::vector dilation, 141 | // std::vector stride, int flag_act, float leaky_relu_scale, int cls=0, int ths=1); 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | LITE_API void paddle_conv_transpose1d( 154 | int channels, int xlen, float* indata, 155 | int filters, int kernelsize, float* kerneldata, 156 | float* outdata, 157 | int padding = 0, int dilation = 1, int stride = 1, 158 | int flag_act = 0, float leaky_relu_scale = 0.1, 159 | int cls = 1, int ths = 2 160 | ); 161 | 162 | 163 | LITE_API void paddle_conv_transpose2d( 164 | int channels, int x_h, int x_w, float* indata, 165 | int filters, int kernel_h, int kernel_w, float* kerneldata, 166 | float* outdata, 167 | int padding_h=0, int padding_w=0, 168 | int dilation_h=1, int dilation_w=1, 169 | int stride_h=1, int stride_w=1, 170 | int flag_act=0, float leaky_relu_scale=0.1, 171 | int cls=1, int ths=2); 172 | LITE_API void paddle_conv_transpose( 173 | std::vectorindatashape, float* indata, 174 | std::vectorkernelshape, float* kerneldata, 175 | std::vectoroutdatashape, float* outdata, 176 | std::vector pad, 177 | std::vector dilation, 178 | std::vector stride, int flag_act, float leaky_relu_scale, int cls, int ths); 179 | 180 | LITE_API void paddle_layernorm1d(float* x, float* weight, float* bias, float* outdata, 181 | float* meandata, float* vardata, int batch_size, int features); 182 | 183 | LITE_API void paddle_batchnorm1d(float* x, float* outdata, 184 | float* scale, float* bias, float* mean_data, float* var_data, 185 | int channels, int xlen, int cls = 1, int ths = 2); 186 | 187 | LITE_API void paddle_batchnorm(std::vectorindatashape, float* indata, float* outdata, 188 | float* scaledata, float*biasdata, float* mean_data, float* var_data, 189 | int cls = 1, int ths = 2); 190 | 191 | LITE_API 
void paddle_fill_bias(float* x, float* bias, int channels, int xlen, bool flag_relu=false); 192 | 193 | LITE_API void paddle_transpose2d(float* x, float* out, int size1, int size2, int cls = 0, int ths = 1); 194 | 195 | LITE_API void paddle_transpose3d(float* x, float* out, int size1, int size2, int size3, int axis1, int axis2, int cls = 0, int ths = 1); 196 | 197 | LITE_API void paddle_transpose(float* x, float* out, std::vector axis_size, int axis1 = 1, int axis2 = 0, int cls = 0, int ths=1); 198 | 199 | LITE_API void paddle_transpose(std::vector input_shape, float* indata, 200 | std::vector output_shape, float* outdata, 201 | std::vector axis, int cls, int ths); 202 | 203 | LITE_API void paddle_reflect1d(float* din, float* dout, int channels, int x_len, int dilation); 204 | LITE_API void paddle_reflect2d(float* din, float* dout, int channels, int x_h, int x_w, int dilation_h, int dilation_w); 205 | 206 | LITE_API void paddle_matmul_int16_32(int m, int n, int k, int16_t* A, int16_t* B, int32_t* C, bool rettrans, int cls=1, int ths=2); 207 | 208 | // New added functions 209 | LITE_API void paddle_affine(const float* din, const float* weight, const float* bias, const int dim1, const int dim2, float* dout ); 210 | 211 | LITE_API float paddle_FindAbsMax(float* din, int size); 212 | 213 | LITE_API float paddle_GetScale(float threshold, int bit_length); 214 | 215 | // LITE_API float paddle_fp32_to_int8_1d(const float* din, int8_t* dout, int size); 216 | // LITE_API void paddle_int8_to_fp32_1d(const int8_t* din, float* out, const float scale, int size); 217 | // LITE_API void paddle_int32_to_fp32_1d(const int* din, float* dout, const float scale, int size); 218 | // LITE_API float paddle_int32_to_int8_1d(const int* din, int8_t* dout, const float scale, int size); 219 | // LITE_API float paddle_fp32_to_int16_1d(const float* din, int16_t* dout, int size); 220 | // LITE_API void paddle_int16_to_fp32_1d(const int16_t* din, float* dout, const float scale, int size); 221 | 222 | 223 | 224 | 225 | 226 | #endif//LITE_WITH_ARM 227 | 228 | 229 | 230 | } // namespace lite_api 231 | } // namespace paddle 232 | 233 | #endif // NOLINT 234 | -------------------------------------------------------------------------------- /paddle/paddle_api.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | /* 16 | * This file defines PaddlePredictor, the api for lite. It supports multiple 17 | * hardware including ARM, X86, OpenCL, CUDA and so on. 
18 | */ 19 | 20 | #ifndef PADDLE_LITE_API_H_ // NOLINT 21 | #define PADDLE_LITE_API_H_ 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include "paddle_place.h" // NOLINT 28 | 29 | namespace paddle { 30 | namespace lite_api { 31 | 32 | using shape_t = std::vector; 33 | using lod_t = std::vector>; 34 | 35 | enum class LiteModelType { kProtobuf = 0, kNaiveBuffer, UNK }; 36 | // Methods for allocating L3Cache on Arm platform 37 | enum class L3CacheSetMethod { 38 | kDeviceL3Cache = 0, // Use the system L3 Cache size, best performance. 39 | kDeviceL2Cache = 1, // Use the system L2 Cache size, trade off performance 40 | // with less memory consumption. 41 | kAbsolute = 2, // Use the external setting. 42 | // kAutoGrow = 3, // Not supported yet, least memory consumption. 43 | }; 44 | 45 | // return true if current device supports OpenCL model 46 | LITE_API bool IsOpenCLBackendValid(bool check_fp16_valid = false); 47 | 48 | struct LITE_API Tensor { 49 | explicit Tensor(void* raw); 50 | explicit Tensor(const void* raw); 51 | 52 | void Resize(const shape_t& shape); 53 | 54 | /// Readonly data. 55 | template 56 | const T* data() const; 57 | 58 | template 59 | T* mutable_data(TargetType type = TargetType::kHost) const; 60 | 61 | // Share external memory. Note: ensure that the data pointer is in a valid 62 | // state 63 | // during the prediction process. 64 | void ShareExternalMemory(void* data, size_t memory_size, TargetType target); 65 | 66 | template 67 | void CopyFromCpu(const T* data); 68 | 69 | template 70 | void CopyToCpu(T* data) const; 71 | /// Shape of the tensor. 72 | shape_t shape() const; 73 | TargetType target() const; 74 | PrecisionType precision() const; 75 | void SetPrecision(PrecisionType precision); 76 | 77 | // LoD of the tensor 78 | lod_t lod() const; 79 | 80 | // Set LoD of the tensor 81 | void SetLoD(const lod_t& lod); 82 | bool IsInitialized() const; 83 | 84 | private: 85 | void* raw_tensor_; 86 | }; 87 | 88 | /// The PaddlePredictor defines the basic interfaces for different kinds of 89 | /// predictors. 90 | class LITE_API PaddlePredictor { 91 | public: 92 | PaddlePredictor() = default; 93 | 94 | /// Get i-th input. 95 | virtual std::unique_ptr GetInput(int i) = 0; 96 | 97 | /// Get i-th output. 98 | virtual std::unique_ptr GetOutput(int i) const = 0; 99 | 100 | virtual void Run() = 0; 101 | virtual std::shared_ptr Clone() = 0; 102 | virtual std::shared_ptr Clone( 103 | const std::vector& var_names) = 0; 104 | 105 | virtual std::string GetVersion() const = 0; 106 | 107 | // Get input names 108 | virtual std::vector GetInputNames() = 0; 109 | // Get output names 110 | virtual std::vector GetOutputNames() = 0; 111 | // Get output names 112 | virtual std::vector GetParamNames(); 113 | 114 | // Get Input by name 115 | virtual std::unique_ptr GetInputByName(const std::string& name) = 0; 116 | 117 | /// Get a readonly tensor, return null if no one called `name` exists. 118 | virtual std::unique_ptr GetTensor( 119 | const std::string& name) const = 0; 120 | /// Get a mutable tensor, return null if on one called `name` exists 121 | /// internal infereces API, not recommanded. 122 | virtual std::unique_ptr GetMutableTensor(const std::string& name); 123 | 124 | /// Persist the optimized model to disk. This API is only supported by 125 | /// CxxConfig, and the persisted model can be reused for MobileConfig. 
126 | virtual void SaveOptimizedModel( 127 | const std::string& model_dir, 128 | LiteModelType model_type = LiteModelType::kProtobuf, 129 | bool record_info = false); 130 | 131 | virtual ~PaddlePredictor() = default; 132 | 133 | protected: 134 | int threads_{1}; 135 | lite_api::PowerMode mode_{lite_api::LITE_POWER_NO_BIND}; 136 | }; 137 | 138 | /// Base class for all the configs. 139 | class LITE_API ConfigBase { 140 | std::string model_dir_; 141 | int threads_{1}; 142 | PowerMode mode_{LITE_POWER_NO_BIND}; 143 | // gpu opencl 144 | CLTuneMode opencl_tune_mode_{CL_TUNE_NONE}; 145 | CLPrecisionType opencl_precision_{CL_PRECISION_AUTO}; 146 | // Where to cache the npu/xpu/rknpu/apu offline model to the binary files 147 | std::string subgraph_model_cache_dir_{""}; 148 | // Set the cached npu/xpu/rknpu/apu offline model from the buffers 149 | std::map, std::vector>> 150 | subgraph_model_cache_buffers_{}; 151 | int device_id_{0}; 152 | int x86_math_num_threads_ = 1; 153 | 154 | public: 155 | explicit ConfigBase(PowerMode mode = LITE_POWER_NO_BIND, int threads = 1); 156 | // set Model_dir 157 | void set_model_dir(const std::string& x) { model_dir_ = x; } 158 | const std::string& model_dir() const { return model_dir_; } 159 | // set Thread 160 | void set_threads(int threads); 161 | int threads() const { return threads_; } 162 | // set Power_mode 163 | void set_power_mode(PowerMode mode); 164 | PowerMode power_mode() const { return mode_; } 165 | // set GPU opencl tune 166 | void set_opencl_tune(CLTuneMode tune_mode = CL_TUNE_NONE, 167 | size_t lws_repeats = 4); 168 | // set GPU opencl precision 169 | void set_opencl_precision(CLPrecisionType p = CL_PRECISION_AUTO); 170 | // set subgraph_model_dir 171 | void set_subgraph_model_cache_dir(std::string subgraph_model_cache_dir) { 172 | subgraph_model_cache_dir_ = subgraph_model_cache_dir; 173 | } 174 | const std::string& subgraph_model_cache_dir() const { 175 | return subgraph_model_cache_dir_; 176 | } 177 | void set_subgraph_model_cache_buffers(const std::string& key, 178 | const std::vector& cfg, 179 | const std::vector& bin); 180 | const std::map, std::vector>>& 181 | subgraph_model_cache_buffers() const { 182 | return subgraph_model_cache_buffers_; 183 | } 184 | // set Device ID 185 | void set_device_id(int device_id) { device_id_ = device_id; } 186 | int get_device_id() const { return device_id_; } 187 | // set x86_math_num_threads 188 | void set_x86_math_num_threads(int threads); 189 | int x86_math_num_threads() const; 190 | }; 191 | 192 | class LITE_API CxxModelBuffer { 193 | public: 194 | CxxModelBuffer(const char* program_buffer, 195 | size_t program_buffer_size, 196 | const char* params_buffer, 197 | size_t params_buffer_size); 198 | CxxModelBuffer(std::string&& program_buffer, std::string&& params_buffer); 199 | const std::string& get_program() const; 200 | const std::string& get_params() const; 201 | bool is_empty() const; 202 | 203 | CxxModelBuffer() = default; 204 | CxxModelBuffer(const CxxModelBuffer&) = delete; 205 | 206 | private: 207 | std::string program_; 208 | std::string params_; 209 | }; 210 | 211 | /// CxxConfig is the config for the Full feature predictor. 
212 | class LITE_API CxxConfig : public ConfigBase { 213 | std::vector valid_places_; 214 | std::string model_file_; 215 | std::string param_file_; 216 | std::shared_ptr model_buffer_{nullptr}; 217 | std::vector passes_internal_{}; 218 | bool quant_model_{false}; // Enable post_quant_dynamic in opt 219 | QuantType quant_type_{QuantType::QUANT_INT16}; 220 | std::map>> 221 | preferred_inputs_for_warmup_; 222 | #ifdef LITE_WITH_CUDA 223 | bool multi_stream_{false}; 224 | #endif 225 | #ifdef LITE_WITH_MLU 226 | lite_api::MLUCoreVersion mlu_core_version_{lite_api::MLUCoreVersion::MLU_270}; 227 | int mlu_core_number_{1}; 228 | DataLayoutType mlu_input_layout_{DATALAYOUT(kNCHW)}; 229 | std::vector mlu_first_conv_mean_{}; 230 | std::vector mlu_first_conv_std_{}; 231 | #endif 232 | 233 | public: 234 | void set_valid_places(const std::vector& x) { valid_places_ = x; } 235 | void set_model_file(const std::string& path) { model_file_ = path; } 236 | void set_param_file(const std::string& path) { param_file_ = path; } 237 | void set_model_buffer(const char* model_buffer, 238 | size_t model_buffer_size, 239 | const char* param_buffer, 240 | size_t param_buffer_size) { 241 | model_buffer_.reset(new CxxModelBuffer( 242 | model_buffer, model_buffer_size, param_buffer, param_buffer_size)); 243 | } 244 | void set_model_buffer(std::shared_ptr model_buffer) { 245 | model_buffer_ = model_buffer; 246 | } 247 | const CxxModelBuffer& get_model_buffer() const; 248 | // internal inference to choose passes for model optimizing, 249 | // it's designed for internal developer and not recommanded 250 | // for comman users. 251 | void set_passes_internal( 252 | const std::vector& passes_internal = {}) { 253 | passes_internal_ = passes_internal; 254 | } 255 | const std::vector& get_passes_internal() const { 256 | return passes_internal_; 257 | } 258 | const std::vector& valid_places() const { return valid_places_; } 259 | std::string model_file() const { return model_file_; } 260 | std::string param_file() const { return param_file_; } 261 | bool is_model_from_memory() const { return static_cast(model_buffer_); } 262 | // note: `model_from_memory` has the same effect as `is_model_from_memory`, 263 | // but is_model_from_memory is recommended and `model_from_memory` will be 264 | // abandoned in v3.0. 265 | bool model_from_memory() const { return static_cast(model_buffer_); } 266 | 267 | #ifdef LITE_WITH_CUDA 268 | void set_multi_stream(bool multi_stream) { multi_stream_ = multi_stream; } 269 | bool multi_stream() const { return multi_stream_; } 270 | #endif 271 | 272 | #ifdef LITE_WITH_MLU 273 | // set MLU core version, which is used when compiling MLU kernels 274 | void set_mlu_core_version(lite_api::MLUCoreVersion core_version); 275 | // set MLU core number, which is used when compiling MLU kernels 276 | void set_mlu_core_number(int core_number); 277 | // whether use MLU's first conv kernel. First conv is a special kernel 278 | // provided by MLU, its input is uint8, and also needs two 3-dimentional 279 | // vectors which save all inputs' mean and std values 280 | // set the 3-dimentional mean vector and 3-dimentional std vector used by 281 | // MLU's first conv 282 | void set_mlu_firstconv_param(const std::vector& mean, 283 | const std::vector& std); 284 | // set MLU input layout. 
User can specify layout of input data to be NHWC, 285 | // default is NCHW 286 | void set_mlu_input_layout(DataLayoutType layout); 287 | 288 | lite_api::MLUCoreVersion mlu_core_version() const; 289 | int mlu_core_number() const; 290 | DataLayoutType mlu_input_layout() const; 291 | // std::pair 292 | std::pair, std::vector> mlu_firstconv_param() const; 293 | #endif 294 | 295 | // XPU only, set the size of the workspace memory from L3 cache for the 296 | // current thread. 297 | void set_xpu_workspace_l3_size_per_thread(int l3_size = 0xfffc00); 298 | // XPU only, specify the target device ID for the current thread. 299 | // **DEPRECATED**, use xpu_set_device() at the very beginning of each worker 300 | // thread 301 | void set_xpu_dev_per_thread(int dev_no = 0); 302 | void set_xpu_multi_encoder_precision(const std::string& precision = "int16"); 303 | 304 | // set input tensor for warmup. 305 | // It is optional. If you set prefered_inputs, model wil run immediately when 306 | // predictor is created 307 | template 308 | void set_preferred_inputs_for_warmup(const int group_idx, 309 | const int tensor_idx, 310 | const shape_t& shape, 311 | const lod_t& lod = {}, 312 | const T fill_value = 0, 313 | const void* data = nullptr); 314 | const std::map>>& 315 | preferred_inputs_for_warmup() const { 316 | return preferred_inputs_for_warmup_; 317 | } 318 | 319 | void set_quant_model(bool quant_model) { quant_model_ = quant_model; } 320 | bool quant_model() const { return quant_model_; } 321 | void set_quant_type(QuantType quant_type) { quant_type_ = quant_type; } 322 | QuantType quant_type() const { return quant_type_; } 323 | }; 324 | 325 | /// MobileConfig is the config for the light weight predictor, it will skip 326 | /// IR optimization or other unnecessary stages. 327 | class LITE_API MobileConfig : public ConfigBase { 328 | // whether to load data from memory. Model data will be loaded from memory 329 | // buffer if model_from_memory_ is true. 330 | bool model_from_memory_{false}; 331 | 332 | // model data readed from file or memory buffer in combined format. 333 | std::string lite_model_file_; 334 | 335 | // NOTE: This is a deprecated variable and will be removed in latter release. 336 | std::string model_buffer_; 337 | std::string param_buffer_; 338 | 339 | public: 340 | // set model data in combined format, `set_model_from_file` refers to loading 341 | // model from file, set_model_from_buffer refers to loading model from memory 342 | // buffer 343 | void set_model_from_file(const std::string& x); 344 | void set_model_from_buffer(const std::string& x); 345 | // return model data in lite_model_file_, which is in combined format. 346 | const std::string& lite_model_file() const { return lite_model_file_; } 347 | 348 | // return model_from_memory_, which indicates whether to load model from 349 | // memory buffer. 350 | bool is_model_from_memory() const { return model_from_memory_; } 351 | // note: `model_from_memory` has the same effect as `is_model_from_memory`, 352 | // but is_model_from_memory is recommended and `model_from_memory` will be 353 | // abandoned in v3.0. 354 | bool model_from_memory() const { return model_from_memory_; } 355 | 356 | // NOTE: This is a deprecated API and will be removed in latter release. 357 | void set_model_buffer(const char* model_buffer, 358 | size_t model_buffer_size, 359 | const char* param_buffer, 360 | size_t param_buffer_size); 361 | 362 | // NOTE: This is a deprecated API and will be removed in latter release. 
363 | const std::string& model_buffer() const { return model_buffer_; } 364 | 365 | // NOTE: This is a deprecated API and will be removed in latter release. 366 | const std::string& param_buffer() const { return param_buffer_; } 367 | 368 | // This is the method for allocating workspace_size according to L3Cache size 369 | void SetArmL3CacheSize( 370 | L3CacheSetMethod method = L3CacheSetMethod::kDeviceL3Cache, 371 | int absolute_val = -1); 372 | }; 373 | 374 | template <typename ConfigT> 375 | LITE_API std::shared_ptr<PaddlePredictor> CreatePaddlePredictor(const ConfigT&); 376 | 377 | } // namespace lite_api 378 | } // namespace paddle 379 | 380 | #endif // NOLINT 381 | --------------------------------------------------------------------------------
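
Usage note (added): the sketch below is one plausible way to drive the predictor API declared in paddle_api.h above, end to end. It is illustrative only: the model path "model.nb", the 1x3x224x224 input shape, and the thread count are placeholders, not values taken from this repository, and shape_t is assumed to be a std::vector of 64-bit dims as in upstream Paddle-Lite. Per the comments in the header, MobileConfig is the lightweight config that skips IR optimization, so it expects a combined model that has already been optimized (for example, persisted via SaveOptimizedModel from the full CxxConfig predictor).

#include <cstdio>
#include <memory>
#include "paddle_api.h"

using namespace paddle::lite_api;

int main() {
  // Lightweight predictor config; expects an already-optimized combined model.
  MobileConfig config;
  config.set_model_from_file("model.nb");   // placeholder path
  config.set_threads(2);                    // placeholder thread count
  config.set_power_mode(LITE_POWER_NO_BIND);

  std::shared_ptr<PaddlePredictor> predictor =
      CreatePaddlePredictor<MobileConfig>(config);

  // Fill input 0 with a dummy float blob; the shape is a placeholder.
  std::unique_ptr<Tensor> input = predictor->GetInput(0);
  input->Resize({1, 3, 224, 224});
  float* in = input->mutable_data<float>();
  for (int i = 0; i < 1 * 3 * 224 * 224; ++i) in[i] = 0.f;

  predictor->Run();

  // Read back output 0 as a read-only float buffer.
  std::unique_ptr<Tensor> output = predictor->GetOutput(0);
  const float* out = output->data<float>();
  shape_t out_shape = output->shape();
  std::printf("first output value: %f\n", out_shape.empty() ? 0.f : out[0]);
  return 0;
}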