├── README.md
├── src
│   ├── dajnorm.h
│   ├── dajdef.h
│   ├── dajfunc.h
│   ├── dajmodel.h
│   ├── dajnorm.cpp
│   ├── dajdense.h
│   ├── dajgemm.h
│   ├── dajutil.h
│   ├── dajconv.h
│   ├── dajmodel.cpp
│   ├── dajnn.cpp
│   ├── dajnn.h
│   ├── dajdense.cpp
│   ├── dajtensor.h
│   ├── dajutil.cpp
│   ├── dajfunc.cpp
│   ├── dajconv.cpp
│   ├── dajgemm.cpp
│   └── dajtensor.cpp
├── vsproj
│   ├── main.cpp
│   ├── dajnn.vcxproj.user
│   ├── dajnn.sln
│   ├── dajnn.vcxproj.filters
│   └── dajnn.vcxproj
└── paddle
    ├── paddle_lite_factory_helper.h
    ├── paddle_use_ops.h
    ├── paddle_use_passes.h
    ├── paddle_use_kernels.h
    ├── paddle_image_preprocess.h
    ├── paddle_place.h
    ├── paddle_api_2.h
    └── paddle_api.h
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # dajnn
3 | Customized C++ Deep Learning Framework (Multiplatform, Inference Only)
4 |
5 | A fast, optimized, portable, and easy-to-use C++ inference framework for deep learning.
6 |
--------------------------------------------------------------------------------
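
A minimal usage sketch of the API declared in the headers below, assuming a model exported in the .daj format parsed by dajmodel.cpp; the file name, weight order, and shapes are hypothetical:

    #include "dajmodel.h"
    #include "dajdense.h"
    #include "dajfunc.h"

    using namespace dajnn;

    int main() {
        init_dajnn();
        FILE* fp = fopen("model.daj", "rb");          // hypothetical model file
        ByteStream stream(fp);
        Model model(&stream);
        fclose(fp);

        FTensor* kernel = model.get_f(0);             // weight order depends on the exporter
        FTensor* bias = model.get_f(1);
        FTensor* x = new FTensor(1, 64, END_DIM);     // (n, m) input, fill x->val before use
        FTensor* y = dense::dense(x, kernel, bias);   // (n, p) output
        func::relu(y);

        delete x;
        delete y;
        finish_dajnn();                               // model weights are freed by ~Model
        return 0;
    }
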
/src/dajnorm.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include "dajnn.h"
5 |
6 | namespace dajnn {
7 | namespace norm {
8 |
9 | void batch_norm_with_precomputed(FTensor* tensor, FTensor* pc_gamma, FTensor* pc_beta);
10 |
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
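
batch_norm_with_precomputed applies inference-time batch normalization as a plain per-channel scale, so the gamma/beta pair it receives must already have the running statistics folded in. A sketch of the usual offline folding, assuming trained parameters gamma, beta and running mean, var (these names are illustrative, not part of this repository):

    #include <cmath>

    // Fold batch-norm statistics so that y = pc_gamma * x + pc_beta per channel
    // reproduces (x - mean) / sqrt(var + eps) * gamma + beta.
    void fold_batch_norm(const float* gamma, const float* beta, const float* mean,
                         const float* var, float eps, unsigned int channels,
                         float* pc_gamma, float* pc_beta) {
        for (unsigned int c = 0; c < channels; ++c) {
            pc_gamma[c] = gamma[c] / sqrtf(var[c] + eps);
            pc_beta[c] = beta[c] - mean[c] * pc_gamma[c];
        }
    }
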
/vsproj/main.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajmodel.h"
3 |
4 | using namespace dajnn;
5 |
6 | int main(int argc, const char** argv) {
7 | Model* model = new Model("../../pymaster/duel/export/koni_p2_d5.daj");
8 | delete model;
9 | return 0;
10 | }
11 |
--------------------------------------------------------------------------------
/src/dajdef.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #ifdef _WIN32
5 |
6 | #ifdef _DEBUG
7 | #define TRACE_MEMORY_LEAK
8 | #endif
9 |
10 | #else // _WIN32
11 |
12 | #define LITE_WITH_ARM
13 | #define PADDLE
14 | #define PADDLE_THREADS 2
15 | #define PADDLE_CLS 1
16 |
17 | #endif
18 |
--------------------------------------------------------------------------------
/src/dajfunc.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include "dajnn.h"
5 |
6 | namespace dajnn {
7 | namespace func {
8 |
9 | void relu(FTensor* tensor);
10 | void tanh(FTensor* tensor);
11 |
12 | void scale(FTensor* tensor, FTensor* weight, FTensor* bias, bool is_first_batch_dim);
13 | void add(FTensor* dst, FTensor* oprd);
14 |
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/src/dajmodel.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include "dajtensor.h"
5 |
6 | namespace dajnn {
7 |
8 | class Model {
9 | public:
10 | Model();
11 | Model(ByteStream* stream);
12 | virtual ~Model();
13 |
14 | public:
15 | uint length();
16 |
17 | FTensor* get_f(uint idx);
18 | ITensor* get_i(uint idx);
19 |
20 | protected:
21 | Tensor** weights;
22 | uint weights_len;
23 | };
24 |
25 | }
26 |
--------------------------------------------------------------------------------
/src/dajnorm.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajnorm.h"
3 | #include "dajtensor.h"
4 | #include "dajfunc.h"
5 |
6 | #ifdef PADDLE
7 | #include "paddle_api_2.h"
8 | using namespace paddle::lite_api;
9 | #endif
10 |
11 | namespace dajnn {
12 | namespace norm {
13 |
14 | void batch_norm_with_precomputed(FTensor* tensor, FTensor* pc_gamma, FTensor* pc_beta) {
15 | func::scale(tensor, pc_gamma, pc_beta, true);
16 | }
17 |
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
/src/dajdense.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include "dajnn.h"
5 |
6 | namespace dajnn {
7 | namespace dense {
8 |
9 | /*
10 | fully-connected layer
11 | @param input: 2-d tensor with shape (n, m)
12 | @param kernel: 2-d tensor with shape (m, p)
13 | @param bias: null or 1-d tensor with shape (p)
14 | @return: 2-d tensor with shape (n, p)
15 | */
16 | FTensor* dense(FTensor* input, FTensor* kernel, FTensor* bias);
17 |
18 | }
19 | }
20 |
--------------------------------------------------------------------------------
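
A shape-level usage sketch for dense(); the numbers are illustrative and the tensors are left unfilled. Note that the implementation in dajdense.cpp reads the kernel as (m, p) on the Paddle build but as (p, m) on the win32 build, so the exporter has to lay the weight out accordingly:

    #include "dajdense.h"
    #include "dajtensor.h"

    using namespace dajnn;

    void dense_shape_example() {
        FTensor* input = new FTensor(2, 8, END_DIM);          // (n, m) = (2, 8), fill input->val before use
        FTensor* kernel = new FTensor(8, 4, END_DIM);         // (m, p) = (8, 4) on the Paddle build
        FTensor* bias = new FTensor(4, END_DIM);              // (p)
        FTensor* output = dense::dense(input, kernel, bias);  // (n, p) = (2, 4)
        delete input; delete kernel; delete bias; delete output;
    }
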
/src/dajgemm.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | namespace dajnn {
5 |
6 | void gemm_bin(int M, int N, int K, float ALPHA, char* A, int lda, float* B, int ldb, float* C, int ldc);
7 | void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc);
8 | void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc);
9 |
10 | }
11 |
--------------------------------------------------------------------------------
/vsproj/dajnn.vcxproj.user:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | $(OutDir)
5 | WindowsLocalDebugger
6 |
7 |
8 | $(OutDir)
9 | WindowsLocalDebugger
10 |
11 |
--------------------------------------------------------------------------------
/src/dajutil.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include "dajtensor.h"
5 |
6 | namespace dajnn {
7 |
8 | void log_i(const char* format, ...);
9 | void log_w(const char* format, ...);
10 | void log_d(const char* format, ...);
11 | void log_e(const char* format, ...);
12 | void log_x(const char* type_str, const char* format, va_list ap);
13 |
14 | void exit_if(bool condition, const char* format = nullptr, ...);
15 |
16 | float get_max(float* arr, uint len);
17 | float get_min(float* arr, uint len);
18 |
19 | int get_max(int* arr, uint len);
20 | int get_min(int* arr, uint len);
21 |
22 | uint get_span(vector<uint>* shape);
23 | string get_shape_str(vector<uint>* shape);
24 | string format_str(const char* format, ...);
25 |
26 | }
27 |
--------------------------------------------------------------------------------
/src/dajconv.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include "dajnn.h"
5 |
6 | namespace dajnn {
7 | namespace conv {
8 |
9 | /*
10 | 2-d convolutional layer
11 | @param input: 4-d tensor with shape (n, c, h, w)
12 | @param kernel: 4-d tensor with shape (f, c, k_h, k_w)
13 | @param bias: null or 1-d tensor with shape (f)
14 | @param padding_x: padding sizes (-1 for auto, 0 for no padding)
15 | @param stride_x: strides
16 | @param dilation_x: dilations
17 | @return: 4-d tensor with shape (n, f, _h_, _w_)
18 |
19 | CAUTION:
20 | f (# of filters) must be > 1 for mobile forward (a paddle-lite bug)
21 | dilation_x must be = 1 for win32 forward (a darknet limitation)
22 | */
23 | FTensor* conv2d(FTensor* input, FTensor* kernel, FTensor* bias = nullptr,
24 | int padding_h = -1, int padding_w = -1, int stride_h = 1, int stride_w = 1,
25 | int dilation_h = 1, int dilation_w = 1);
26 |
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
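
A sketch of how the output size follows from the parameters above, using the same formula as dajconv.cpp; the shapes are illustrative and the tensors are left unfilled:

    #include "dajconv.h"
    #include "dajtensor.h"

    using namespace dajnn;

    void conv_shape_example() {
        // input (n, c, h, w) = (1, 3, 32, 32), kernel (f, c, k_h, k_w) = (16, 3, 3, 3)
        FTensor* input = new FTensor(1, 3, 32, 32, END_DIM);
        FTensor* kernel = new FTensor(16, 3, 3, 3, END_DIM);

        // auto padding gives 1, so _h_ = (32 + 2 - 2 - 1) / 1 + 1 = 32: size is preserved
        FTensor* same = conv::conv2d(input, kernel);

        // stride 2 halves it: _h_ = (32 + 2 - 2 - 1) / 2 + 1 = 16
        FTensor* down = conv::conv2d(input, kernel, nullptr, -1, -1, 2, 2);

        delete input; delete kernel; delete same; delete down;
    }
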
/vsproj/dajnn.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 2013
4 | VisualStudioVersion = 12.0.21005.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "dajnn", "dajnn.vcxproj", "{A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Win32 = Debug|Win32
11 | Release|Win32 = Release|Win32
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Debug|Win32.ActiveCfg = Debug|Win32
15 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Debug|Win32.Build.0 = Debug|Win32
16 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Release|Win32.ActiveCfg = Release|Win32
17 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}.Release|Win32.Build.0 = Release|Win32
18 | EndGlobalSection
19 | GlobalSection(SolutionProperties) = preSolution
20 | HideSolutionNode = FALSE
21 | EndGlobalSection
22 | EndGlobal
23 |
--------------------------------------------------------------------------------
/src/dajmodel.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajmodel.h"
3 | #include "dajutil.h"
4 |
5 | namespace dajnn {
6 |
7 | Model::Model() {
8 | weights = nullptr;
9 | weights_len = 0;
10 | }
11 |
12 | Model::~Model() {
13 | if (weights) {
14 | for (uint i = 0; i < weights_len; ++i) {
15 | delete weights[i];
16 | }
17 | delete[] weights;
18 | }
19 | }
20 |
21 | Model::Model(ByteStream* stream) : Model() {
22 | string header = stream->read_str();
23 |
24 | if (header.compare(MODEL_HEADER)) {
25 | log_w("invalid model header : %s", header);
26 | return;
27 | }
28 | vector<Tensor*> tensors;
29 |
30 | while (true) {
31 | string mode = stream->read_str();
32 |
33 | if (!mode.compare("f")) {
34 | tensors.push_back(new FTensor(stream));
35 | } else if (!mode.compare("i")) {
36 | tensors.push_back(new ITensor(stream));
37 | } else if (!mode.compare(MODEL_FOOTER)) {
38 | break;
39 | } else {
40 | log_w("invalid tensor mode (%s) from model", mode.c_str());
41 | }
42 | }
43 | weights_len = tensors.size();
44 | weights = new Tensor*[weights_len];
45 |
46 | for (uint i = 0; i < weights_len; ++i) {
47 | weights[i] = tensors[i];
48 | }
49 | }
50 |
51 | uint Model::length() {
52 | return weights_len;
53 | }
54 |
55 | FTensor* Model::get_f(uint idx) {
56 | return (FTensor*) weights[idx];
57 | }
58 |
59 | ITensor* Model::get_i(uint idx) {
60 | return (ITensor*) weights[idx];
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
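
From the parsing loop above, an exported .daj stream is laid out as a header string, a sequence of tagged tensors, and a footer string; the per-tensor encoding itself lives in dajtensor.cpp, which is not included in this section:

    MRB_NN_DAJ_MODEL_V1_BEGIN        <- MODEL_HEADER string
    "f" + FTensor meta + values      <- one entry per weight, in export order
    "i" + ITensor meta + values
    ...
    MRB_NN_DAJ_MODEL_END             <- MODEL_FOOTER string
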
/src/dajnn.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajnn.h"
3 | #include "dajutil.h"
4 |
5 | #ifdef PADDLE
6 | #include "paddle_api_2.h"
7 | using namespace paddle::lite_api;
8 | #endif
9 |
10 | namespace dajnn {
11 |
12 | #ifdef TRACE_MEMORY_LEAK
13 | vector<Tensor*> _tensor_trace_pool_;
14 | vector<uint> _tensor_unique_indice_;
15 | uint _tensor_trace_len_ = 0;
16 |
17 | void push_tensor_trace(Tensor* tensor) {
18 | if (_tensor_trace_pool_.empty()) _tensor_trace_len_ = 0;
19 |
20 | _tensor_trace_pool_.push_back(tensor);
21 | _tensor_unique_indice_.push_back(_tensor_trace_len_++);
22 | }
23 |
24 | uint pop_tensor_trace(Tensor* tensor) {
25 | uint i = 0;
26 |
27 | for (vector<Tensor*>::iterator ti = _tensor_trace_pool_.begin();
28 | ti != _tensor_trace_pool_.end(); ++ti, ++i) {
29 | if (tensor == *ti) break;
30 | }
31 | exit_if(i == _tensor_trace_pool_.size(), "cannot find tensor to pop from trace");
32 |
33 | uint idx = _tensor_unique_indice_[i];
34 | _tensor_unique_indice_.erase(_tensor_unique_indice_.begin() + i);
35 | _tensor_trace_pool_.erase(_tensor_trace_pool_.begin() + i);
36 | return idx;
37 | }
38 |
39 | vector<uint> get_leaked_tensor_indice() {
40 | return _tensor_unique_indice_;
41 | }
42 | #endif
43 |
44 | void init_dajnn() {
45 | #ifdef PADDLE
46 | paddle_DeviceInit();
47 | #endif
48 | }
49 |
50 | void finish_dajnn() {
51 | #ifdef TRACE_MEMORY_LEAK
52 | if (!_tensor_unique_indice_.empty()) {
53 | string msg = "leaked tensors : ";
54 |
55 | for (vector<uint>::iterator idx = _tensor_unique_indice_.begin(); idx != _tensor_unique_indice_.end(); ++idx) {
56 | msg += format_str("%d ", *idx);
57 | }
58 | log_e("%s", msg.c_str());
59 | }
60 | #endif
61 | }
62 |
63 | }
64 |
--------------------------------------------------------------------------------
/src/dajnn.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include <stdio.h>
5 | #include <stdlib.h>
6 | #include <string.h>
7 | #include <stdarg.h>
8 | #include <stddef.h>
9 | #include <math.h>
10 | #include <float.h>
11 | #include <limits.h>
12 | #include <time.h>
13 | #include <vector>
14 | #include <string>
15 | #include "dajdef.h"
16 |
17 | #ifdef PADDLE
18 | #include <android/log.h>
19 | #include <arm_neon.h>
20 | #endif
21 |
22 | using namespace std;
23 |
24 | namespace dajnn {
25 |
26 | #ifndef uchar
27 | #define uchar unsigned char
28 | #endif
29 |
30 | #ifndef ushort
31 | #define ushort unsigned short
32 | #endif
33 |
34 | #ifndef uint
35 | #define uint unsigned int
36 | #endif
37 |
38 | #if defined(_MSC_VER) && _MSC_VER < 1900
39 | #define inline __inline
40 | #endif
41 |
42 | #ifndef INT_MIN
43 | #define INT_MIN -2147483648
44 | #define INT_MAX 2147483647
45 | #endif
46 |
47 | #ifndef FLOAT_MIN
48 | #define FLOAT_MIN -1e10f
49 | #define FLOAT_MAX 1e10f
50 | #endif
51 |
52 | #ifndef SHRT_MIN
53 | #define SHRT_MIN -32768
54 | #define SHRT_MAX 32767
55 | #endif
56 |
57 | #ifndef MIN
58 | #define MIN(x, y) (((x) < (y)) ? (x) : (y))
59 | #endif
60 |
61 | #ifndef MAX
62 | #define MAX(x, y) (((x) > (y)) ? (x) : (y))
63 | #endif
64 |
65 | #define END_DIM 0xFFFFFFFF
66 | #define MAX_TENSOR_DIM 16
67 | #define MAX_MODEL_STR 256
68 |
69 | #define MODEL_HEADER "MRB_NN_DAJ_MODEL_V1_BEGIN"
70 | #define MODEL_FOOTER "MRB_NN_DAJ_MODEL_END"
71 |
72 | class Tensor;
73 | class ITensor;
74 | class FTensor;
75 |
76 | void init_dajnn();
77 | void finish_dajnn();
78 |
79 | #ifdef TRACE_MEMORY_LEAK
80 | void push_tensor_trace(Tensor* tensor);
81 | uint pop_tensor_trace(Tensor* tensor);
82 |
83 | vector<uint> get_leaked_tensor_indice();
84 | #endif
85 |
86 | }
87 |
--------------------------------------------------------------------------------
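
The leak tracer above gives every live Tensor a unique index, and finish_dajnn() reports any indices still registered at shutdown. A sketch of how a debug build would surface a leak, assuming the Tensor constructor and destructor call push_tensor_trace / pop_tensor_trace (that hookup is in dajtensor.cpp, which is not part of this section):

    #include "dajtensor.h"

    using namespace dajnn;

    int main() {
        init_dajnn();
        FTensor* t = new FTensor(2, 2, END_DIM);
        // t is never deleted; with TRACE_MEMORY_LEAK defined (win32 debug builds, per dajdef.h),
        // finish_dajnn() logs "leaked tensors : " followed by the unique index of t
        finish_dajnn();
        return 0;
    }
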
/paddle/paddle_lite_factory_helper.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | /*
16 | * This file defines some MACROS that explicitly determine the op, kernel, mir
17 | * passes used in the inference lib.
18 | */
19 | #pragma once
20 |
21 | // some platform-independent definitions
22 |
23 | #if defined(_WIN32)
24 | #define UNUSED
25 | #define __builtin_expect(EXP, C) (EXP)
26 | #else
27 | #define UNUSED __attribute__((unused))
28 | #endif
29 |
30 | #define USE_LITE_OP(op_type__) \
31 | extern int touch_op_##op_type__(); \
32 | int LITE_OP_REGISTER_FAKE(op_type__) UNUSED = touch_op_##op_type__();
33 |
34 | #define USE_LITE_KERNEL(op_type__, target__, precision__, layout__, alias__) \
35 | extern int touch_##op_type__##target__##precision__##layout__##alias__(); \
36 | int op_type__##target__##precision__##layout__##alias__##__use_lite_kernel \
37 | UNUSED = touch_##op_type__##target__##precision__##layout__##alias__();
38 |
39 | #define USE_MIR_PASS(name__) \
40 | extern bool mir_pass_registry##name__##_fake(); \
41 | static bool mir_pass_usage##name__ UNUSED = \
42 | mir_pass_registry##name__##_fake();
43 |
44 | #define LITE_OP_REGISTER_FAKE(op_type__) op_type__##__registry__
45 |
--------------------------------------------------------------------------------
/src/dajdense.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajdense.h"
3 | #include "dajtensor.h"
4 | #include "dajutil.h"
5 | #include "dajgemm.h"
6 |
7 | #ifdef PADDLE
8 | #include "paddle_api_2.h"
9 | using namespace paddle::lite_api;
10 | #endif
11 |
12 | namespace dajnn {
13 | namespace dense {
14 |
15 | FTensor* dense(FTensor* input, FTensor* kernel, FTensor* bias) {
16 | exit_if(input->shape.size() != 2, "input dim of dense expects to be 2, but got %d", input->shape.size());
17 | exit_if(kernel->shape.size() != 2, "kernel dim of dense expects to be 2, but got %d", kernel->shape.size());
18 | exit_if(bias && (bias->shape.size() != 1), "bias dim of dense expects to be null or 1, but got %d", bias->shape.size());
19 |
20 | uint n = input->shape[0];
21 | uint m = input->shape[1];
22 | #ifdef PADDLE
23 | uint p = kernel->shape[1];
24 | bool shape_ok = kernel->shape[0] == m;
25 | #else
26 | uint p = kernel->shape[0];
27 | bool shape_ok = kernel->shape[1] == m;
28 | #endif
29 |
30 | exit_if(!shape_ok, "dense input and kernel shapes mismatch : %s and %s",
31 | get_shape_str(&input->shape).c_str(),
32 | get_shape_str(&kernel->shape).c_str());
33 | exit_if(bias && (bias->span != p), "dense kernel and bias shapes mismatch : %s and %s",
34 | get_shape_str(&kernel->shape).c_str(),
35 | get_shape_str(&bias->shape).c_str());
36 |
37 | FTensor* output = new FTensor(n, p, END_DIM);
38 | #ifdef PADDLE
39 | paddle_matmul(n, p, m, input->val, kernel->val, output->val);
40 |
41 | if (bias) {
42 | float* op = output->val;
43 |
44 | for (uint i = 0; i < n; ++i, op += p) {
45 | paddle_elementwise_add(op, bias->val, op, p);
46 | }
47 | }
48 | #else
49 | memset(output->val, 0, 4 * output->span);
50 | gemm(0, 1, n, p, m, 1, input->val, m, kernel->val, m, 1, output->val, p);
51 |
52 | if (bias) {
53 | float* op = output->val;
54 |
55 | for (uint i = 0; i < n; ++i) {
56 | for (float* bp = bias->val; bp < bias->val + p; ++bp, ++op) {
57 | *op += *bp;
58 | }
59 | }
60 | }
61 | #endif
62 | return output;
63 | }
64 |
65 | }
66 | }
--------------------------------------------------------------------------------
/src/dajtensor.h:
--------------------------------------------------------------------------------
1 |
2 | #pragma once
3 |
4 | #include "dajnn.h"
5 |
6 | namespace dajnn {
7 |
8 | class ByteStream {
9 | public:
10 | ByteStream();
11 | ByteStream(const void* buff);
12 | ByteStream(FILE* fp);
13 |
14 | string read_str();
15 | uint read(void* dst, int ele_size, int ele_count);
16 | void write(void* src, int ele_size, int ele_count);
17 | int seek();
18 |
19 | private:
20 | const char* buff;
21 | FILE* fp;
22 | int pointer;
23 | };
24 |
25 | class Tensor {
26 | public:
27 | Tensor();
28 | virtual ~Tensor();
29 |
30 | void reshape(vector<uint>* shape);
31 | void reshape(uint dim1, ...);
32 | bool is_shape(vector<uint>* shape);
33 | bool is_shape(uint dim1, ...);
34 | void set_releasable(bool releasable);
35 |
36 | public:
37 | vector<uint> shape;
38 | uint span;
39 | bool releasable;
40 |
41 | protected:
42 | void* _init_(Tensor* tensor, bool copy_val = true);
43 | void* _init_(vector<uint>* shape, void* val = nullptr, bool copy_val = true);
44 | void* _init_(void* val, bool copy_val, uint dim1, va_list ap);
45 | void* _init_(ByteStream* stream);
46 |
47 | void _read_meta_(ByteStream* stream);
48 | void _write_meta_(ByteStream* stream);
49 | void _write_val_(ByteStream* stream);
50 | void _save_(ByteStream* stream);
51 |
52 | protected:
53 | void* _val_;
54 | };
55 |
56 | class ITensor : public Tensor {
57 | public:
58 | ITensor();
59 | ITensor(ITensor* tensor, bool copy_val = true);
60 | ITensor(vector<uint>* shape, int* val = nullptr, bool copy_val = true);
61 | ITensor(int* val, bool copy_val, uint dim1, ...);
62 | ITensor(uint dim1, ...);
63 | ITensor(ByteStream* stream);
64 |
65 | void save(ByteStream* stream, bool compressed = false);
66 | int compare(ITensor* tensor);
67 | int compare(int* val, uint len);
68 |
69 | int get_max();
70 | int get_min();
71 |
72 | public:
73 | int* val;
74 | };
75 |
76 | class FTensor : public Tensor {
77 | public:
78 | FTensor();
79 | FTensor(FTensor* tensor, bool copy_val = true);
80 | FTensor(ITensor* tensor);
81 | FTensor(vector<uint>* shape, float* val = nullptr, bool copy_val = true);
82 | FTensor(float* val, bool copy_val, uint dim1, ...);
83 | FTensor(uint dim1, ...);
84 | FTensor(ByteStream* stream);
85 |
86 | void save(ByteStream* stream, bool compressed = false);
87 | void print(uint start = END_DIM, uint end = END_DIM);
88 |
89 | float compare(FTensor* tensor);
90 | float compare(float* val, uint len);
91 |
92 | float get_max();
93 | float get_min();
94 |
95 | public:
96 | float* val;
97 | };
98 |
99 | }
100 |
--------------------------------------------------------------------------------
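
The variadic constructors and the reshape/is_shape overloads read dimensions until the END_DIM sentinel from dajnn.h, which is how dajdense.cpp builds its output with new FTensor(n, p, END_DIM). A usage sketch with illustrative values (whether an unfilled tensor is zeroed or left uninitialized depends on dajtensor.cpp, not shown in this section):

    #include "dajtensor.h"

    using namespace dajnn;

    void tensor_example() {
        float data[6] = {1, 2, 3, 4, 5, 6};
        FTensor* a = new FTensor(data, true, 2, 3, END_DIM);   // copies data into a (2, 3) tensor
        FTensor* b = new FTensor(2, 3, END_DIM);                // (2, 3) tensor, values unfilled
        a->reshape(3, 2, END_DIM);                              // same span, shape becomes (3, 2)
        a->print();
        delete a;
        delete b;
    }
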
/paddle/paddle_use_ops.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "paddle_lite_factory_helper.h"
3 |
4 | USE_LITE_OP(conv2d);
5 | USE_LITE_OP(depthwise_conv2d);
6 | USE_LITE_OP(unsqueeze);
7 | USE_LITE_OP(unsqueeze2);
8 | USE_LITE_OP(pool2d);
9 | USE_LITE_OP(fc);
10 | USE_LITE_OP(nearest_interp);
11 | USE_LITE_OP(bilinear_interp);
12 | USE_LITE_OP(batch_norm);
13 | USE_LITE_OP(sync_batch_norm);
14 | USE_LITE_OP(reduce_mean);
15 | USE_LITE_OP(layout);
16 | USE_LITE_OP(assign_value);
17 | USE_LITE_OP(scale);
18 | USE_LITE_OP(fusion_elementwise_sub_activation);
19 | USE_LITE_OP(fusion_elementwise_add_activation);
20 | USE_LITE_OP(fusion_elementwise_mul_activation);
21 | USE_LITE_OP(fusion_elementwise_max_activation);
22 | USE_LITE_OP(fusion_elementwise_min_activation);
23 | USE_LITE_OP(fusion_elementwise_div_activation);
24 | USE_LITE_OP(io_copy_once);
25 | USE_LITE_OP(concat);
26 | USE_LITE_OP(layout_once);
27 | USE_LITE_OP(multiclass_nms);
28 | USE_LITE_OP(multiclass_nms2);
29 | USE_LITE_OP(multiclass_nms3);
30 | USE_LITE_OP(density_prior_box);
31 | USE_LITE_OP(io_copy);
32 | USE_LITE_OP(shuffle_channel);
33 | USE_LITE_OP(elementwise_sub);
34 | USE_LITE_OP(elementwise_add);
35 | USE_LITE_OP(elementwise_mul);
36 | USE_LITE_OP(elementwise_max);
37 | USE_LITE_OP(elementwise_min);
38 | USE_LITE_OP(elementwise_div);
39 | USE_LITE_OP(elementwise_mod);
40 | USE_LITE_OP(elementwise_pow);
41 | USE_LITE_OP(grid_sampler);
42 | USE_LITE_OP(expand_as);
43 | USE_LITE_OP(instance_norm);
44 | USE_LITE_OP(pad2d);
45 | USE_LITE_OP(box_coder);
46 | USE_LITE_OP(sigmoid);
47 | USE_LITE_OP(tanh);
48 | USE_LITE_OP(relu);
49 | USE_LITE_OP(leaky_relu);
50 | USE_LITE_OP(relu6);
51 | USE_LITE_OP(prelu);
52 | USE_LITE_OP(thresholded_relu);
53 | USE_LITE_OP(elu);
54 | USE_LITE_OP(bilinear_interp_v2);
55 | USE_LITE_OP(nearest_interp_v2);
56 | USE_LITE_OP(fill_constant);
57 | USE_LITE_OP(softmax);
58 | USE_LITE_OP(split);
59 | USE_LITE_OP(subgraph);
60 | USE_LITE_OP(slice);
61 | USE_LITE_OP(cast);
62 | USE_LITE_OP(search_fc);
63 | USE_LITE_OP(prior_box);
64 | USE_LITE_OP(conv2d_transpose);
65 | USE_LITE_OP(depthwise_conv2d_transpose);
66 | USE_LITE_OP(squeeze);
67 | USE_LITE_OP(squeeze2);
68 | USE_LITE_OP(arg_max);
69 | USE_LITE_OP(affine_channel);
70 | USE_LITE_OP(fill_constant_batch_size_like);
71 | USE_LITE_OP(affine_grid);
72 | USE_LITE_OP(expand);
73 | USE_LITE_OP(feed);
74 | USE_LITE_OP(yolo_box);
75 | USE_LITE_OP(sequence_topk_avg_pooling);
76 | USE_LITE_OP(mul);
77 | USE_LITE_OP(reshape);
78 | USE_LITE_OP(reshape2);
79 | USE_LITE_OP(fetch);
80 | USE_LITE_OP(matmul);
81 | USE_LITE_OP(calib);
82 | USE_LITE_OP(transpose);
83 | USE_LITE_OP(transpose2);
84 | USE_LITE_OP(range);
85 | USE_LITE_OP(dropout);
86 | USE_LITE_OP(flatten);
87 | USE_LITE_OP(flatten2);
88 | USE_LITE_OP(flatten_contiguous_range);
89 | USE_LITE_OP(stack);
90 | USE_LITE_OP(lod_array_length);
--------------------------------------------------------------------------------
/src/dajutil.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajutil.h"
3 |
4 | namespace dajnn {
5 |
6 | void log_i(const char* format, ...) {
7 | va_list ap;
8 | va_start(ap, format);
9 | log_x("[info]", format, ap);
10 | va_end(ap);
11 | }
12 |
13 | void log_w(const char* format, ...) {
14 | va_list ap;
15 | va_start(ap, format);
16 | log_x("[warn]", format, ap);
17 | va_end(ap);
18 | }
19 |
20 | void log_d(const char* format, ...) {
21 | va_list ap;
22 | va_start(ap, format);
23 | log_x("[debug]", format, ap);
24 | va_end(ap);
25 | }
26 |
27 | void log_e(const char* format, ...) {
28 | va_list ap;
29 | va_start(ap, format);
30 | log_x("[error]", format, ap);
31 | va_end(ap);
32 | }
33 |
34 | void log_x(const char* type_str, const char* format, va_list ap) {
35 | char msg[1024];
36 | #ifdef _WIN32
37 | vsprintf_s(msg, 1024, format, ap);
38 | printf_s("%s %s\n", type_str, msg);
39 | #else
40 | vsprintf(msg, format, ap);
41 | __android_log_print(ANDROID_LOG_ERROR, type_str, "%s", msg);
42 | #endif
43 | }
44 |
45 | void exit_if(bool condition, const char* format, ...) {
46 | if (condition) {
47 | if (format) {
48 | va_list ap;
49 | va_start(ap, format);
50 | log_x("[error]", format, ap);
51 | va_end(ap);
52 | }
53 | exit(-1);
54 | }
55 | }
56 |
57 | float get_max(float* arr, uint len) {
58 | float m = FLOAT_MIN;
59 |
60 | for (float* ap = arr; ap < arr + len; ++ap) {
61 | if (m < *ap) m = *ap;
62 | }
63 | return m;
64 | }
65 |
66 | float get_min(float* arr, uint len) {
67 | float m = FLOAT_MAX;
68 |
69 | for (float* ap = arr; ap < arr + len; ++ap) {
70 | if (m > *ap) m = *ap;
71 | }
72 | return m;
73 | }
74 |
75 | int get_max(int* arr, uint len) {
76 | int m = INT_MIN;
77 |
78 | for (int* ap = arr; ap < arr + len; ++ap) {
79 | if (m < *ap) m = *ap;
80 | }
81 | return m;
82 | }
83 |
84 | int get_min(int* arr, uint len) {
85 | int m = INT_MAX;
86 |
87 | for (int* ap = arr; ap < arr + len; ++ap) {
88 | if (m > *ap) m = *ap;
89 | }
90 | return m;
91 | }
92 |
93 | uint get_span(vector<uint>* shape) {
94 | uint span = 1;
95 |
96 | for (vector<uint>::iterator dim = shape->begin(); dim != shape->end(); ++dim) {
97 | span *= *dim;
98 | }
99 | return span;
100 | }
101 |
102 | string get_shape_str(vector<uint>* shape) {
103 | string str = "(";
104 |
105 | for (uint i = 0; i < shape->size(); ++i) {
106 | if (i > 0) str += ",";
107 | str += format_str("%d", shape->at(i));
108 | }
109 | str += ")";
110 | return str;
111 | }
112 |
113 | string format_str(const char* format, ...) {
114 | char str[1024];
115 | va_list ap;
116 | va_start(ap, format);
117 | #ifdef _WIN32
118 | vsprintf_s(str, 1024, format, ap);
119 | #else
120 | vsprintf(str, format, ap);
121 | #endif
122 | va_end(ap);
123 | return string(str);
124 | }
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/src/dajfunc.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajfunc.h"
3 | #include "dajtensor.h"
4 | #include "dajutil.h"
5 |
6 | #ifdef PADDLE
7 | #include "paddle_api_2.h"
8 | using namespace paddle::lite_api;
9 | #endif
10 |
11 | namespace dajnn {
12 | namespace func {
13 |
14 | void relu(FTensor* tensor) {
15 | #ifdef PADDLE
16 | paddle_act_relu(tensor->val, tensor->val, tensor->span, PADDLE_THREADS);
17 | #else
18 | for (float* v = tensor->val; v < tensor->val + tensor->span; ++v) {
19 | if (*v < 0) *v = 0;
20 | }
21 | #endif
22 | }
23 |
24 | void tanh(FTensor* tensor) {
25 | #ifdef PADDLE
26 | act_tanh(tensor->val, tensor->val, tensor->span, PADDLE_THREADS);
27 | #else
28 | for (float* v = tensor->val; v < tensor->val + tensor->span; ++v) {
29 | *v = 2.f / (1.f + expf(-2.f * *v)) - 1;
30 | }
31 | #endif
32 | }
33 |
34 | void scale(FTensor* tensor, FTensor* weight, FTensor* bias, bool is_first_batch_dim) {
35 | uint span = tensor->span;
36 | int num_batches = is_first_batch_dim ? tensor->shape[0] : 1;
37 | int num_channels = tensor->shape[is_first_batch_dim ? 1 : 0];
38 | int num_features = span / num_batches / num_channels;
39 |
40 | exit_if((weight->shape.size() != 1) || (weight->span != num_channels),
41 | "invalid scale weight shape with tensor shape : %s and %s",
42 | get_shape_str(&weight->shape).c_str(),
43 | get_shape_str(&tensor->shape).c_str());
44 | exit_if(bias && ((bias->shape.size() != 1) || (bias->span != num_channels)),
45 | "invalid scale bias shape with weight shape : %s and %s",
46 | get_shape_str(&bias->shape).c_str(),
47 | get_shape_str(&weight->shape).c_str());
48 |
49 | #ifdef PADDLE
50 | paddle::lite_api::scale(tensor->val, tensor->val, num_batches, num_channels, num_features, weight->val, bias ? bias->val : nullptr);
51 | #else
52 | float* v = tensor->val;
53 |
54 | if (bias) {
55 | for (int i = 0; i < num_batches; ++i) {
56 | float* w = weight->val;
57 | float* b = bias->val;
58 |
59 | for (int j = 0; j < num_channels; ++j, ++w, ++b) {
60 | for (int k = 0; k < num_features; ++k, ++v) {
61 | *v = *v * *w + *b;
62 | }
63 | }
64 | }
65 | } else {
66 | for (int i = 0; i < num_batches; ++i) {
67 | float* w = weight->val;
68 |
69 | for (int j = 0; j < num_channels; ++j, ++w) {
70 | for (int k = 0; k < num_features; ++k, ++v) {
71 | *v = *v * *w;
72 | }
73 | }
74 | }
75 | }
76 | #endif
77 | }
78 |
79 | void add(FTensor* dst, FTensor* oprd) {
80 | int span = dst->span;
81 |
82 | exit_if(!dst->is_shape(&oprd->shape), "shapes mismatch for add operation : %s and %s",
83 | get_shape_str(&dst->shape).c_str(),
84 | get_shape_str(&oprd->shape).c_str());
85 |
86 | #ifdef PADDLE
87 | paddle_elementwise_add(dst->val, oprd->val, dst->val, span);
88 | #else
89 | float* v = dst->val;
90 | float* o = oprd->val;
91 |
92 | for (int i = 0; i < span; ++i, ++v, ++o) {
93 | *v += *o;
94 | }
95 | #endif
96 | }
97 |
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
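
scale() walks the buffer in row-major (batch, channel, feature) order, so each channel's contiguous block of features is multiplied by one weight entry and offset by one bias entry. A small worked example on the win32 path (the Paddle path is expected to produce the same result):

    #include "dajfunc.h"
    #include "dajtensor.h"

    using namespace dajnn;

    void scale_example() {
        // (1, 2, 3) tensor: channel 0 holds {1, 2, 3}, channel 1 holds {4, 5, 6}
        float v[6] = {1, 2, 3, 4, 5, 6};
        float w[2] = {10, 100};
        float b[2] = {1, 2};

        FTensor* t = new FTensor(v, true, 1, 2, 3, END_DIM);
        FTensor* weight = new FTensor(w, true, 2, END_DIM);
        FTensor* bias = new FTensor(b, true, 2, END_DIM);

        func::scale(t, weight, bias, true);
        // t->val is now {11, 21, 31, 402, 502, 602}
        delete t; delete weight; delete bias;
    }
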
/vsproj/dajnn.vcxproj.filters:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {fe6241e9-2e92-445d-abaa-fd89a812ded6}
6 |
7 |
8 | {625277b8-cb1e-43da-874e-7c76e1f7fef9}
9 |
10 |
11 |
12 |
13 | paddle
14 |
15 |
16 | paddle
17 |
18 |
19 | paddle
20 |
21 |
22 | paddle
23 |
24 |
25 | paddle
26 |
27 |
28 | paddle
29 |
30 |
31 | paddle
32 |
33 |
34 | paddle
35 |
36 |
37 | dajnn
38 |
39 |
40 | dajnn
41 |
42 |
43 | dajnn
44 |
45 |
46 | dajnn
47 |
48 |
49 | dajnn
50 |
51 |
52 | dajnn
53 |
54 |
55 | dajnn
56 |
57 |
58 | dajnn
59 |
60 |
61 | dajnn
62 |
63 |
64 | dajnn
65 |
66 |
67 |
68 |
69 |
70 | dajnn
71 |
72 |
73 | dajnn
74 |
75 |
76 | dajnn
77 |
78 |
79 | dajnn
80 |
81 |
82 | dajnn
83 |
84 |
85 | dajnn
86 |
87 |
88 | dajnn
89 |
90 |
91 | dajnn
92 |
93 |
94 | dajnn
95 |
96 |
97 |
--------------------------------------------------------------------------------
/paddle/paddle_use_passes.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #pragma once
16 | #include "paddle_lite_factory_helper.h" // NOLINT
17 |
18 | USE_MIR_PASS(demo);
19 | USE_MIR_PASS(static_kernel_pick_pass);
20 | USE_MIR_PASS(variable_place_inference_pass);
21 | USE_MIR_PASS(type_target_cast_pass);
22 | USE_MIR_PASS(generate_program_pass);
23 |
24 | USE_MIR_PASS(io_copy_kernel_pick_pass);
25 | USE_MIR_PASS(argument_type_display_pass);
26 | USE_MIR_PASS(runtime_context_assign_pass);
27 | USE_MIR_PASS(graph_visualize_pass);
28 |
29 | USE_MIR_PASS(adaptive_1x1_pool2d_convert_global_pass);
30 | USE_MIR_PASS(remove_tf_redundant_ops_pass);
31 | USE_MIR_PASS(lite_conv_bn_fuse_pass);
32 | USE_MIR_PASS(lite_conv_conv_fuse_pass);
33 | USE_MIR_PASS(lite_squeeze2_matmul_fuse_pass);
34 | USE_MIR_PASS(lite_reshape2_matmul_fuse_pass);
35 | USE_MIR_PASS(lite_matmul_fuse_pass);
36 | USE_MIR_PASS(lite_fc_fuse_pass);
37 | USE_MIR_PASS(lite_shuffle_channel_fuse_pass);
38 | USE_MIR_PASS(lite_transpose_softmax_transpose_fuse_pass);
39 | USE_MIR_PASS(lite_interpolate_fuse_pass);
40 | USE_MIR_PASS(lite_sequence_pool_concat_fuse_pass);
41 | USE_MIR_PASS(identity_scale_eliminate_pass);
42 | USE_MIR_PASS(identity_dropout_eliminate_pass);
43 | USE_MIR_PASS(lite_conv_elementwise_fuse_pass);
44 | USE_MIR_PASS(lite_conv_activation_fuse_pass);
45 | USE_MIR_PASS(lite_var_conv_2d_activation_fuse_pass);
46 | USE_MIR_PASS(lite_match_matrix_activation_fuse_pass);
47 | USE_MIR_PASS(lite_scales_fuse_pass);
48 | USE_MIR_PASS(lite_sequence_reverse_embedding_fuse_pass);
49 | USE_MIR_PASS(lite_elementwise_activation_fuse_pass);
50 | USE_MIR_PASS(lite_quant_dequant_fuse_pass);
51 | USE_MIR_PASS(type_precision_cast_pass);
52 | USE_MIR_PASS(type_layout_cast_pass);
53 | USE_MIR_PASS(type_layout_cast_preprocess_pass);
54 | USE_MIR_PASS(memory_optimize_pass);
55 | USE_MIR_PASS(lite_reshape_fuse_pass);
56 | USE_MIR_PASS(multi_stream_analysis_pass);
57 | USE_MIR_PASS(elementwise_mul_constant_eliminate_pass)
58 | USE_MIR_PASS(npu_subgraph_pass);
59 | USE_MIR_PASS(huawei_ascend_npu_subgraph_pass);
60 | USE_MIR_PASS(imagination_nna_subgraph_pass);
61 | USE_MIR_PASS(xpu_subgraph_pass);
62 | USE_MIR_PASS(mlu_subgraph_pass);
63 | USE_MIR_PASS(mlu_postprocess_pass);
64 | USE_MIR_PASS(weight_quantization_preprocess_pass);
65 | USE_MIR_PASS(post_quant_dynamic_pass);
66 | USE_MIR_PASS(apu_subgraph_pass);
67 | USE_MIR_PASS(quantized_op_attributes_inference_pass);
68 | USE_MIR_PASS(restrict_quantized_op_with_same_input_output_scale_pass);
69 | USE_MIR_PASS(control_flow_op_unused_inputs_and_outputs_eliminate_pass)
70 | USE_MIR_PASS(lite_scale_activation_fuse_pass);
71 | USE_MIR_PASS(lite_instance_norm_activation_fuse_pass);
72 | USE_MIR_PASS(__xpu__resnet_fuse_pass);
73 | USE_MIR_PASS(__xpu__resnet_d_fuse_pass);
74 | USE_MIR_PASS(__xpu__resnet_cbam_fuse_pass);
75 | USE_MIR_PASS(__xpu__multi_encoder_fuse_pass);
76 | USE_MIR_PASS(__xpu__embedding_with_eltwise_add_fuse_pass);
77 | USE_MIR_PASS(__xpu__fc_fuse_pass);
78 | USE_MIR_PASS(__xpu__mmdnn_fuse_pass);
79 | USE_MIR_PASS(__xpu__conv2d_fuse_pass);
80 | USE_MIR_PASS(__xpu__resblock_reduction_fuse_pass);
81 | USE_MIR_PASS(__xpu__resblock_normal_fuse_pass);
82 | USE_MIR_PASS(__xpu__conv2d_link_previous_out_max_pass);
83 | USE_MIR_PASS(__xpu__sfa_head_meanstd_fuse_pass);
84 | USE_MIR_PASS(__xpu__sfa_head_moment_fuse_pass);
85 | USE_MIR_PASS(__xpu__softmax_topk_fuse_pass);
86 | USE_MIR_PASS(__xpu__multi_encoder_slice_link_fuse_pass);
87 |
--------------------------------------------------------------------------------
/src/dajconv.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajdense.h"
3 | #include "dajtensor.h"
4 | #include "dajutil.h"
5 | #include "dajgemm.h"
6 |
7 | #ifdef PADDLE
8 | #include "paddle_api_2.h"
9 | using namespace paddle::lite_api;
10 | #endif
11 |
12 | namespace dajnn {
13 | namespace conv {
14 |
15 | #ifndef PADDLE
16 | float im2col_get_pixel(float* im, int height, int width, int channels,
17 | int row, int col, int channel, int pad_h, int pad_w) {
18 |
19 | row -= pad_h;
20 | col -= pad_w;
21 |
22 | if (row < 0 || col < 0 || row >= height || col >= width) return 0;
23 | return im[col + width * (row + height * channel)];
24 | }
25 |
26 | void im2col_cpu(float* data_im, int channels, int height, int width,
27 | int kernel_h, int kernel_w, int stride_h, int stride_w,
28 | int pad_h, int pad_w, float* data_col) {
29 |
30 | int c, h, w;
31 | int height_col = (height + 2 * pad_h - kernel_h) / stride_h + 1;
32 | int width_col = (width + 2 * pad_w - kernel_w) / stride_w + 1;
33 |
34 | int ksize = kernel_h * kernel_w;
35 | int channels_col = channels * ksize;
36 |
37 | for (c = 0; c < channels_col; ++c) {
38 | int w_offset = c % kernel_w;
39 | int h_offset = (c / kernel_w) % kernel_h;
40 | int c_im = c / ksize;
41 |
42 | for (h = 0; h < height_col; ++h) {
43 | for (w = 0; w < width_col; ++w) {
44 | int im_row = h_offset + h * stride_h;
45 | int im_col = w_offset + w * stride_w;
46 | int col_index = (c * height_col + h) * width_col + w;
47 |
48 | data_col[col_index] = im2col_get_pixel(
49 | data_im, height, width, channels,
50 | im_row, im_col, c_im, pad_h, pad_w);
51 | }
52 | }
53 | }
54 | }
55 | #endif
56 |
57 | FTensor* conv2d(FTensor* input, FTensor* kernel, FTensor* bias,
58 | int padding_h, int padding_w, int stride_h, int stride_w,
59 | int dilation_h, int dilation_w) {
60 |
61 | exit_if(input->shape.size() != 4, "input dim of conv2d expects to be 4, but got %d", input->shape.size());
62 | exit_if(kernel->shape.size() != 4, "kernel dim of conv2d expects to be 4, but got %d", kernel->shape.size());
63 | exit_if(bias && (bias->shape.size() != 1), "bias dim of conv2d expects to be null or 1, but got %d", bias->shape.size());
64 |
65 | int num_batches = input->shape[0];
66 | int num_channels = input->shape[1];
67 | int h = input->shape[2];
68 | int w = input->shape[3];
69 | int num_filters = kernel->shape[0];
70 | int kernel_h = kernel->shape[2];
71 | int kernel_w = kernel->shape[3];
72 |
73 | exit_if(kernel->shape[1] != num_channels, "second dim of conv2d kernel (# of channels) expects to be %d, but got %d", num_channels, kernel->shape[1]);
74 | exit_if(bias && (bias->span != num_filters), "span of conv2d bias (# of filters) expects to be %d, but got %d", num_filters, bias->span);
75 |
76 | if (padding_h < 0) padding_h = (kernel_h - 1) * dilation_h / 2;
77 | if (padding_w < 0) padding_w = (kernel_w - 1) * dilation_w / 2;
78 |
79 | int _h_ = (h + 2 * padding_h - dilation_h * (kernel_h - 1) - 1) / stride_h + 1;
80 | int _w_ = (w + 2 * padding_w - dilation_w * (kernel_w - 1) - 1) / stride_w + 1;
81 |
82 | FTensor* output = new FTensor(num_batches, num_filters, _h_, _w_, END_DIM);
83 |
84 | int chw = input->span / num_batches;
85 | int _chw_ = output->span / num_batches;
86 |
87 | float* ip = input->val;
88 | float* op = output->val;
89 |
90 | #ifdef PADDLE
91 | exit_if(num_filters == 1, "single conv2d filter is not available for mobile forward");
92 |
93 | paddle_conv2d(num_batches, h, w, num_channels, ip, num_filters, kernel_h, kernel_w, kernel->val,
94 | op, bias ? bias->val : nullptr, padding_h, padding_w, dilation_h, dilation_w,
95 | stride_h, stride_w, 0, 0, PADDLE_CLS, PADDLE_THREADS);
96 | #else
97 | exit_if((dilation_h != 1) || (dilation_w != 1), "only single dilation is available for win32 forward");
98 | memset(output->val, 0, 4 * output->span);
99 |
100 | int _hw_ = _h_ * _w_;
101 | int _kkc_ = kernel_h * kernel_w * num_channels;
102 | float* workspace = (float*) malloc(4 * _kkc_ * _hw_);
103 |
104 | for (int i = 0; i < num_batches; ++i, ip += chw, op += _chw_) {
105 | im2col_cpu(ip, num_channels, h, w, kernel_h, kernel_w, stride_h, stride_w,
106 | padding_h, padding_w, workspace);
107 | gemm(0, 0, num_filters, _hw_, _kkc_, 1, kernel->val, _kkc_, workspace, _hw_, 1, op, _hw_);
108 | }
109 | if (bias) {
110 | op = output->val;
111 |
112 | for (int i = 0; i < num_batches; ++i) {
113 | for (int j = 0; j < num_filters; ++j) {
114 | float b = bias->val[j];
115 |
116 | for (int k = 0; k < _hw_; ++k) {
117 | op[i * _chw_ + j * _hw_ + k] += b;
118 | }
119 | }
120 | }
121 | }
122 | free(workspace);
123 | #endif
124 | return output;
125 | }
126 |
127 | }
128 | }
129 |
--------------------------------------------------------------------------------
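
On the win32 path each image is lowered with im2col_cpu to a (_kkc_, _hw_) matrix and multiplied by the (num_filters, _kkc_) kernel in a single gemm. Working the sizes for the example shapes from dajconv.h above (illustrative only):

    input (1, 3, 32, 32), kernel (16, 3, 3, 3), stride 1, auto padding 1
    _h_ = (32 + 2*1 - 1*(3 - 1) - 1) / 1 + 1 = 32      (spatial size preserved)
    _hw_ = 32 * 32 = 1024 output positions per image
    _kkc_ = 3 * 3 * 3 = 27 values per receptive field
    workspace = 4 * _kkc_ * _hw_ = 110592 bytes (about 108 KB) per image
    gemm(0, 0, 16, 1024, 27, ...) then fills the (16, 32, 32) feature map
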
/vsproj/dajnn.vcxproj:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Debug
6 | Win32
7 |
8 |
9 | Release
10 | Win32
11 |
12 |
13 |
14 | {A4E8B25A-10B8-4252-8C8A-EE5FDBF7891F}
15 | Win32Proj
16 | dajnn
17 |
18 |
19 |
20 | Application
21 | true
22 | v120_xp
23 | Unicode
24 |
25 |
26 | Application
27 | false
28 | v120_xp
29 | true
30 | Unicode
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | true
44 | $(SolutionDir)_bin\
45 | $(SolutionDir)_obj\
46 |
47 |
48 | false
49 | $(SolutionDir)_bin\
50 | $(SolutionDir)_obj\
51 |
52 |
53 |
54 |
55 |
56 | Level3
57 | Disabled
58 | WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)
59 | $(SolutionDir)..\src\dajnn;$(SolutionDir)paddle;
60 | /wd4996 %(AdditionalOptions)
61 |
62 |
63 | Console
64 | true
65 |
66 |
67 |
68 |
69 | Level3
70 |
71 |
72 | MaxSpeed
73 | true
74 | true
75 | WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)
76 | $(SolutionDir)..\src\dajnn;$(SolutionDir)paddle;
77 | /wd4996 %(AdditionalOptions)
78 |
79 |
80 | Console
81 | true
82 | true
83 | true
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
--------------------------------------------------------------------------------
/src/dajgemm.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajgemm.h"
3 | #include "dajutil.h"
4 |
5 | namespace dajnn {
6 |
7 | void gemm_bin(int M, int N, int K, float ALPHA, char* A, int lda, float* B, int ldb, float* C, int ldc) {
8 | int i, j, k;
9 |
10 | for (i = 0; i < M; ++i) {
11 | for (k = 0; k < K; ++k) {
12 | char A_PART = A[i * lda + k];
13 |
14 | if (A_PART) {
15 | for (j = 0; j < N; ++j) {
16 | C[i * ldc + j] += B[k * ldb + j];
17 | }
18 | } else {
19 | for (j = 0; j < N; ++j) {
20 | C[i * ldc + j] -= B[k * ldb + j];
21 | }
22 | }
23 | }
24 | }
25 | }
26 |
27 | void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc) {
28 | gemm_cpu(TA, TB, M, N, K, ALPHA, A, lda, B, ldb, BETA, C, ldc);
29 | }
30 |
31 | //#if (defined(__AVX__) && defined(__x86_64__)) || defined(_WIN64)
32 | #if defined(__AVX__) || defined(_WIN64)
33 | #define OSXSAVEFlag (1UL << 27)
34 | #define AVXFlag ((1UL << 28) | OSXSAVEFlag)
35 | #define FMAFlag ((1UL << 12) | AVXFlag | OSXSAVEFlag)
36 | #define CLMULFlag ((1UL << 1) | AVXFlag | OSXSAVEFlag)
37 | #define VAESFlag ((1UL << 25) | AVXFlag | OSXSAVEFlag)
38 |
39 | #include <stdint.h>
40 |
41 | //#ifdef _WIN64
42 | #ifdef _WIN32
43 | #include <intrin.h>
44 | #include <ammintrin.h>
45 | #include <smmintrin.h>
46 | #include <immintrin.h>
47 | #else // Linux GCC/Clang
48 | #include <x86intrin.h>
49 | #include <ammintrin.h>
50 | #include <smmintrin.h>
51 | #include <immintrin.h>
52 | #include <cpuid.h>
53 |
54 | void asm_cpuid(uint32_t* abcd, uint32_t eax) {
55 | uint32_t ebx = 0, edx = 0, ecx = 0;
56 |
57 | // EBX is saved to EDI and later restored
58 | __asm__("movl %%ebx, %%edi;"
59 | "cpuid;"
60 | "xchgl %%ebx, %%edi;"
61 | : "=D"(ebx),
62 | "+a"(eax), "+c"(ecx), "=d"(edx));
63 |
64 | abcd[0] = eax;
65 | abcd[1] = ebx;
66 | abcd[2] = ecx;
67 | abcd[3] = edx;
68 | }
69 | #endif
70 |
71 | int simd_detect_x86(unsigned int idFeature) {
72 | uint32_t regs[4]; // EAX, EBX, ECX, EDX;
73 | #ifdef _WIN32
74 | __cpuid((int*) regs, 0);
75 | if (regs[0] > 1U) __cpuid((int*) regs, 1);
76 | #else
77 | __get_cpuid(0, &regs[0], &regs[1], &regs[2], &regs[3]);
78 | if (regs[0] > 1U) __get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3]);
79 | #endif
80 | if ((regs[2] & idFeature) != idFeature) return 0;
81 | return 1;
82 | }
83 |
84 | int is_fma_avx() {
85 | static int result = -1;
86 |
87 | if (result == -1) {
88 | result = simd_detect_x86(AVXFlag);
89 |
90 | if (result == 1) {
91 | log_i(" used AVX");
92 | } else {
93 | log_i(" not used AVX");
94 | }
95 | }
96 | return result;
97 | }
98 |
99 | void gemm_nn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) {
100 | int i, j, k;
101 |
102 | if (is_fma_avx() == 1) { // AVX
103 | for (i = 0; i < M; ++i) {
104 | for (k = 0; k < K; ++k) {
105 | float A_PART = ALPHA * A[i * lda + k];
106 | __m256 a256, b256, c256, result256; // AVX
107 | a256 = _mm256_set1_ps(A_PART);
108 |
109 | for (j = 0; j < N - 8; j += 8) {
110 | b256 = _mm256_loadu_ps(&B[k * ldb + j]);
111 | c256 = _mm256_loadu_ps(&C[i * ldc + j]);
112 |
113 | // FMA - Intel Haswell (2013), AMD Piledriver (2012)
114 | result256 = _mm256_fmadd_ps(a256, b256, c256);
115 | //result256 = _mm256_mul_ps(a256, b256);
116 | //result256 = _mm256_add_ps(result256, c256);
117 |
118 | _mm256_storeu_ps(&C[i * ldc + j], result256);
119 | }
120 | int prev_end = (N % 8 == 0) ? (N - 8) : (N / 8) * 8;
121 |
122 | for (j = prev_end; j < N; ++j)
123 | C[i * ldc + j] += A_PART * B[k * ldb + j];
124 | }
125 | }
126 | } else {
127 | for (i = 0; i < M; ++i) {
128 | for (k = 0; k < K; ++k) {
129 | register float A_PART = ALPHA * A[i * lda + k];
130 |
131 | for (j = 0; j < N; ++j) {
132 | C[i * ldc + j] += A_PART * B[k * ldb + j];
133 | }
134 | /* // SSE
135 | __m128 a128, b128, c128, result128; // SSE
136 | a128 = _mm_set1_ps(A_PART);
137 | for (j = 0; j < N - 4; j += 4) {
138 | b128 = _mm_loadu_ps(&B[k*ldb + j]);
139 | c128 = _mm_loadu_ps(&C[i*ldc + j]);
140 | //result128 = _mm_fmadd_ps(a128, b128, c128);
141 | result128 = _mm_mul_ps(a128, b128);
142 | result128 = _mm_add_ps(result128, c128);
143 | _mm_storeu_ps(&C[i*ldc + j], result128);
144 | }
145 |
146 | int prev_end = (N % 4 == 0) ? (N - 4) : (N / 4) * 4;
147 | for (j = prev_end; j < N; ++j){
148 | C[i*ldc + j] += A_PART*B[k*ldb + j];
149 | }
150 | */
151 | }
152 | }
153 | }
154 | }
155 | #else
156 |
157 | void gemm_nn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) {
158 | int i, j, k;
159 |
160 | for (i = 0; i < M; ++i) {
161 | for (k = 0; k < K; ++k) {
162 | register float A_PART = ALPHA * A[i * lda + k];
163 |
164 | for (j = 0; j < N; ++j) {
165 | C[i * ldc + j] += A_PART * B[k * ldb + j];
166 | }
167 | }
168 | }
169 | }
170 |
171 | #endif // defined(__AVX__) || defined(_WIN64)
172 |
173 | void gemm_nt(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) {
174 | int i, j, k;
175 |
176 | for (i = 0; i < M; ++i) {
177 | for (j = 0; j < N; ++j) {
178 | register float sum = 0;
179 |
180 | for (k = 0; k < K; ++k) {
181 | sum += ALPHA * A[i * lda + k] * B[j * ldb + k];
182 | }
183 | C[i * ldc + j] += sum;
184 | }
185 | }
186 | }
187 |
188 | void gemm_tn(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) {
189 | int i, j, k;
190 |
191 | for (i = 0; i < M; ++i) {
192 | for (k = 0; k < K; ++k) {
193 | register float A_PART = ALPHA * A[k * lda + i];
194 |
195 | for (j = 0; j < N; ++j) {
196 | C[i * ldc + j] += A_PART * B[k * ldb + j];
197 | }
198 | }
199 | }
200 | }
201 |
202 | void gemm_tt(int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float* C, int ldc) {
203 | int i, j, k;
204 |
205 | for (i = 0; i < M; ++i) {
206 | for (j = 0; j < N; ++j) {
207 | register float sum = 0;
208 |
209 | for (k = 0; k < K; ++k) {
210 | sum += ALPHA * A[i + k * lda] * B[k + j * ldb];
211 | }
212 | C[i * ldc + j] += sum;
213 | }
214 | }
215 | }
216 |
217 | void gemm_cpu(int TA, int TB, int M, int N, int K, float ALPHA, float* A, int lda, float* B, int ldb, float BETA, float* C, int ldc) {
218 | int i, j;
219 |
220 | /*for (i = 0; i < M; ++i) {
221 | for (j = 0; j < N; ++j) {
222 | C[i * ldc + j] *= BETA;
223 | }
224 | }*/
225 | int t;
226 |
227 | #pragma omp parallel for
228 | for (t = 0; t < M; ++t) {
229 | if (!TA && !TB) {
230 | gemm_nn(1, N, K, ALPHA, A + t * lda, lda, B, ldb, C + t * ldc, ldc);
231 | } else if (TA && !TB) {
232 | gemm_tn(1, N, K, ALPHA, A + t, lda, B, ldb, C + t * ldc, ldc);
233 | } else if (!TA && TB) {
234 | gemm_nt(1, N, K, ALPHA, A + t * lda, lda, B, ldb, C + t * ldc, ldc);
235 | } else {
236 | gemm_tt(1, N, K, ALPHA, A + t, lda, B, ldb, C + t * ldc, ldc);
237 | }
238 | }
239 | }
240 |
241 | }
242 |
--------------------------------------------------------------------------------
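
gemm() follows the darknet convention: C is accumulated into, and because the BETA scaling loop in gemm_cpu is commented out, the caller must pre-initialize C itself (dajdense.cpp and dajconv.cpp memset it to zero). A small sketch of the TB = 1 call that dense() uses, with illustrative values:

    #include <cstring>
    #include "dajgemm.h"

    using namespace dajnn;

    void gemm_example() {
        // C (2 x 2) = A (2 x 3) * B^T, with B stored row-major as (2 x 3)
        float A[6] = {1, 2, 3, 4, 5, 6};
        float B[6] = {1, 0, 0, 0, 1, 0};   // selects the first two columns of A
        float C[4];

        memset(C, 0, sizeof(C));           // required: the BETA scaling is effectively ignored
        gemm(0, 1, 2, 2, 3, 1.f, A, 3, B, 3, 1.f, C, 2);
        // C is now {1, 2, 4, 5}
    }
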
/paddle/paddle_use_kernels.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | #include "paddle_lite_factory_helper.h"
3 |
4 | USE_LITE_KERNEL(reshape, kHost, kAny, kAny, def);
5 | USE_LITE_KERNEL(reshape2, kHost, kAny, kAny, def);
6 | USE_LITE_KERNEL(flatten, kHost, kAny, kAny, def);
7 | USE_LITE_KERNEL(flatten2, kHost, kAny, kAny, def);
8 | USE_LITE_KERNEL(unsqueeze, kHost, kAny, kAny, def);
9 | USE_LITE_KERNEL(unsqueeze2, kHost, kAny, kAny, def);
10 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def);
11 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, bool_slice);
12 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, int32_slice);
13 | USE_LITE_KERNEL(slice, kARM, kFloat, kNCHW, def_int64);
14 | USE_LITE_KERNEL(affine_channel, kARM, kFloat, kNCHW, def);
15 | USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
16 | USE_LITE_KERNEL(fill_constant, kHost, kAny, kNCHW, def);
17 | USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
18 | USE_LITE_KERNEL(pad2d, kARM, kFloat, kNCHW, def);
19 | USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
20 | USE_LITE_KERNEL(leaky_relu, kARM, kFloat, kNCHW, def);
21 | USE_LITE_KERNEL(prelu, kARM, kFloat, kNCHW, def);
22 | USE_LITE_KERNEL(sigmoid, kARM, kFloat, kNCHW, def);
23 | USE_LITE_KERNEL(tanh, kARM, kFloat, kNCHW, def);
24 | USE_LITE_KERNEL(relu6, kARM, kFloat, kNCHW, def);
25 | USE_LITE_KERNEL(thresholded_relu, kARM, kFloat, kNCHW, def);
26 | USE_LITE_KERNEL(elu, kARM, kFloat, kNCHW, def);
27 | USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nchw2nhwc);
28 | USE_LITE_KERNEL(layout, kARM, kFloat, kNCHW, nhwc2nchw);
29 | USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nchw2nhwc);
30 | USE_LITE_KERNEL(layout, kARM, kInt8, kNCHW, int8_nhwc2nchw);
31 | USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nchw2nhwc);
32 | USE_LITE_KERNEL(layout_once, kARM, kFloat, kNCHW, nhwc2nchw);
33 | USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nchw2nhwc);
34 | USE_LITE_KERNEL(layout_once, kARM, kInt8, kNCHW, int8_nhwc2nchw);
35 | USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
36 | USE_LITE_KERNEL(split, kARM, kInt64, kNCHW, def);
37 | USE_LITE_KERNEL(concat, kARM, kAny, kNCHW, def);
38 | USE_LITE_KERNEL(expand, kHost, kFloat, kAny, def);
39 | USE_LITE_KERNEL(expand, kHost, kInt32, kAny, def);
40 | USE_LITE_KERNEL(yolo_box, kARM, kFloat, kNCHW, def);
41 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int32int32);
42 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int64int64);
43 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int64int32);
44 | USE_LITE_KERNEL(gather, kHost, kFloat, kNCHW, int32int64);
45 | USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, fp32_to_int8);
46 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, int32_to_fp32);
47 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, int32_to_int64);
48 | USE_LITE_KERNEL(calib, kARM, kInt32, kNCHW, fp32_to_int32);
49 | USE_LITE_KERNEL(calib, kARM, kInt64, kNCHW, int64_to_fp32);
50 | USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, int8_to_fp32);
51 | USE_LITE_KERNEL(calib, kARM, kInt8, kNHWC, fp32_to_int8);
52 | USE_LITE_KERNEL(calib, kARM, kInt8, kNHWC, int8_to_fp32);
53 | USE_LITE_KERNEL(calib, kARM, kInt64, kNCHW, int64_to_int32);
54 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, fp32_to_int8);
55 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNCHW, int8_to_fp32);
56 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNHWC, fp32_to_int8);
57 | USE_LITE_KERNEL(calib_once, kARM, kInt8, kNHWC, int8_to_fp32);
58 | USE_LITE_KERNEL(calib_once, kARM, kInt64, kNCHW, int64_to_int32);
59 | USE_LITE_KERNEL(arg_max, kHost, kAny, kNCHW, fp32);
60 | USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
61 | USE_LITE_KERNEL(multiclass_nms, kHost, kFloat, kNCHW, def);
62 | USE_LITE_KERNEL(multiclass_nms2, kHost, kFloat, kNCHW, def);
63 | USE_LITE_KERNEL(multiclass_nms3, kHost, kFloat, kNCHW, def);
64 | USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
65 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
66 | USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, int8_out);
67 | USE_LITE_KERNEL(conv2d, kARM, kInt8, kNCHW, fp32_out);
68 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, int8_out);
69 | USE_LITE_KERNEL(depthwise_conv2d, kARM, kInt8, kNCHW, fp32_out);
70 | USE_LITE_KERNEL(box_coder, kHost, kFloat, kNCHW, def);
71 | USE_LITE_KERNEL(assign_value, kARM, kAny, kNCHW, def);
72 | USE_LITE_KERNEL(squeeze, kHost, kAny, kAny, def);
73 | USE_LITE_KERNEL(squeeze2, kHost, kAny, kAny, def);
74 | USE_LITE_KERNEL(relu_clipped, kARM, kFloat, kNCHW, def);
75 | USE_LITE_KERNEL(swish, kARM, kFloat, kNCHW, def);
76 | USE_LITE_KERNEL(log, kARM, kFloat, kNCHW, def);
77 | USE_LITE_KERNEL(exp, kARM, kFloat, kNCHW, def);
78 | USE_LITE_KERNEL(floor, kARM, kFloat, kNCHW, def);
79 | USE_LITE_KERNEL(hard_sigmoid, kARM, kFloat, kNCHW, def);
80 | USE_LITE_KERNEL(sqrt, kARM, kFloat, kNCHW, def);
81 | USE_LITE_KERNEL(rsqrt, kARM, kFloat, kNCHW, def);
82 | USE_LITE_KERNEL(square, kARM, kFloat, kNCHW, def);
83 | USE_LITE_KERNEL(hard_swish, kARM, kFloat, kNCHW, def);
84 | USE_LITE_KERNEL(reciprocal, kARM, kFloat, kNCHW, def);
85 | USE_LITE_KERNEL(abs, kARM, kFloat, kNCHW, def);
86 | USE_LITE_KERNEL(range, kARM, kFloat, kNCHW, def);
87 | USE_LITE_KERNEL(range, kARM, kInt32, kNCHW, def);
88 | USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
89 | USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, int8out);
90 | USE_LITE_KERNEL(fc, kARM, kInt8, kNCHW, fp32out);
91 | USE_LITE_KERNEL(grid_sampler, kARM, kFloat, kNCHW, def);
92 | USE_LITE_KERNEL(instance_norm, kARM, kFloat, kNCHW, def);
93 | USE_LITE_KERNEL(stack, kHost, kFloat, kAny, def);
94 | USE_LITE_KERNEL(stack, kHost, kInt32, kAny, def);
95 | USE_LITE_KERNEL(lod_array_length, kHost, kAny, kAny, def);
96 | USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
97 | USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
98 | USE_LITE_KERNEL(prior_box, kARM, kFloat, kNCHW, def);
99 | USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
100 | USE_LITE_KERNEL(scale, kARM, kInt32, kNCHW, def);
101 | USE_LITE_KERNEL(scale, kARM, kInt64, kNCHW, def);
102 | USE_LITE_KERNEL(arg_max, kARM, kAny, kNCHW, fp32);
103 | USE_LITE_KERNEL(shuffle_channel, kARM, kFloat, kNCHW, def);
104 | USE_LITE_KERNEL(range, kHost, kFloat, kAny, def);
105 | USE_LITE_KERNEL(range, kHost, kInt32, kAny, def);
106 | USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
107 | USE_LITE_KERNEL(deformable_conv, kHost, kFloat, kNCHW, def);
108 | USE_LITE_KERNEL(affine_grid, kARM, kFloat, kNCHW, def);
109 | USE_LITE_KERNEL(conv2d_transpose, kARM, kFloat, kNCHW, def);
110 | USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
111 | USE_LITE_KERNEL(elementwise_add, kARM, kInt32, kNCHW, def);
112 | USE_LITE_KERNEL(elementwise_add, kARM, kInt64, kNCHW, def);
113 | USE_LITE_KERNEL(fusion_elementwise_add_activation, kARM, kFloat, kNCHW, def);
114 | USE_LITE_KERNEL(elementwise_sub, kARM, kFloat, kNCHW, def);
115 | USE_LITE_KERNEL(elementwise_sub, kARM, kInt32, kNCHW, def);
116 | USE_LITE_KERNEL(fusion_elementwise_sub_activation, kARM, kFloat, kNCHW, def);
117 | USE_LITE_KERNEL(elementwise_mul, kARM, kInt64, kNCHW, def);
118 | USE_LITE_KERNEL(elementwise_mul, kARM, kFloat, kNCHW, def);
119 | USE_LITE_KERNEL(elementwise_mul, kARM, kInt32, kNCHW, def);
120 | USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kFloat, kNCHW, def);
121 | USE_LITE_KERNEL(fusion_elementwise_mul_activation, kARM, kInt64, kNCHW, def);
122 | USE_LITE_KERNEL(elementwise_max, kARM, kFloat, kNCHW, def);
123 | USE_LITE_KERNEL(fusion_elementwise_max_activation, kARM, kFloat, kNCHW, def);
124 | USE_LITE_KERNEL(elementwise_min, kARM, kFloat, kNCHW, def);
125 | USE_LITE_KERNEL(fusion_elementwise_min_activation, kARM, kFloat, kNCHW, def);
126 | USE_LITE_KERNEL(elementwise_div, kARM, kFloat, kNCHW, def);
127 | USE_LITE_KERNEL(elementwise_div, kARM, kInt32, kNCHW, def);
128 | USE_LITE_KERNEL(elementwise_div, kARM, kInt64, kNCHW, def);
129 | USE_LITE_KERNEL(fusion_elementwise_div_activation, kARM, kFloat, kNCHW, def);
130 | USE_LITE_KERNEL(elementwise_mod, kARM, kInt64, kNCHW, def);
131 | USE_LITE_KERNEL(elementwise_pow, kARM, kFloat, kNCHW, def);
132 | USE_LITE_KERNEL(elementwise_pow, kARM, kInt32, kNCHW, def);
133 | USE_LITE_KERNEL(bilinear_interp, kARM, kFloat, kNCHW, def);
134 | USE_LITE_KERNEL(nearest_interp, kARM, kFloat, kNCHW, def);
135 | USE_LITE_KERNEL(bilinear_interp_v2, kARM, kFloat, kNCHW, def);
136 | USE_LITE_KERNEL(nearest_interp_v2, kARM, kFloat, kNCHW, def);
137 | USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
138 | USE_LITE_KERNEL(box_coder, kARM, kFloat, kNCHW, def);
139 | USE_LITE_KERNEL(expand_as, kHost, kFloat, kAny, def);
140 | USE_LITE_KERNEL(matmul, kARM, kFloat, kNCHW, def);
141 | USE_LITE_KERNEL(reduce_mean, kARM, kFloat, kNCHW, def);
142 | USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
143 | USE_LITE_KERNEL(sync_batch_norm, kARM, kFloat, kNCHW, def);
144 | USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
145 | USE_LITE_KERNEL(density_prior_box, kARM, kFloat, kNCHW, def);
146 | USE_LITE_KERNEL(cast, kARM, kAny, kNCHW, def);
147 | USE_LITE_KERNEL(fill_constant_batch_size_like, kHost, kAny, kNCHW, def);
148 | USE_LITE_KERNEL(stack, kARM, kFloat, kNCHW, def);
149 | USE_LITE_KERNEL(stack, kARM, kInt32, kNCHW, def);
--------------------------------------------------------------------------------
/paddle/paddle_image_preprocess.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #pragma once
16 |
17 | #include <stdint.h>
18 | #include <stdio.h>
19 | #include <vector>
20 | #include "lite/api/paddle_api.h"
21 | #include "lite/api/paddle_place.h"
22 |
23 | namespace paddle {
24 | namespace lite {
25 | namespace utils {
26 | namespace cv {
27 | typedef paddle::lite_api::Tensor Tensor;
28 | typedef paddle::lite_api::DataLayoutType LayoutType;
29 | // color enum
30 | enum ImageFormat {
31 | RGBA = 0,
32 | BGRA,
33 | RGB,
34 | BGR,
35 | GRAY,
36 | NV21 = 11,
37 | NV12,
38 | };
39 | // flip enum
40 | enum FlipParam {
41 | XY = -1, // flip along the XY axis
42 | X = 0, // flip along the X axis
43 | Y // flip along the Y axis
44 | };
45 | // transform param
46 | typedef struct {
47 | int ih; // input height
48 | int iw; // input width
49 | int oh; // output height
50 | int ow; // output width
51 | FlipParam flip_param; // flip, support x, y, xy
52 | float rotate_param; // rotate, support 90, 180, 270
53 | } TransParam;
54 |
55 | class ImagePreprocess {
56 | public:
57 | /*
58 | * init
59 | * param srcFormat: input image color
60 | * param dstFormat: output image color
61 | * param param: input image parameter, egs: input size
62 | */
63 | ImagePreprocess(ImageFormat srcFormat,
64 | ImageFormat dstFormat,
65 | TransParam param);
66 |
67 | /*
68 | * image color convert
69 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
70 | * BGR(RGB)and BGRA(RGBA) transform,
71 | * BGR(RGB)and RGB(BGR) transform,
72 | * BGR(RGB)and RGBA(BGRA) transform,
73 | * BGR(RGB) and GRAY transform,
74 | * BGRA(RGBA) and GRAY transform,
75 | * param src: input image data
76 | * param dst: output image data
77 | */
78 | void image_convert(const uint8_t* src, uint8_t* dst);
79 |
80 | /*
81 | * image color convert
82 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
83 | * BGR(RGB)and BGRA(RGBA) transform,
84 | * BGR(RGB)and RGB(BGR) transform,
85 | * BGR(RGB)and RGBA(BGRA) transform,
86 | * BGR(RGB)and GRAY transform,
87 | * BGRA(RGBA) and GRAY transform,
88 | * param src: input image data
89 | * param dst: output image data
90 | * param srcFormat: input image format, support: GRAY, NV12(NV21),
91 | * BGR(RGB) and BGRA(RGBA)
92 | * param dstFormat: output image format, support GRAY, BGR(RGB) and
93 | * BGRA(RGBA)
94 | */
95 | void image_convert(const uint8_t* src,
96 | uint8_t* dst,
97 | ImageFormat srcFormat,
98 | ImageFormat dstFormat);
99 |
100 | /*
101 | * image color convert
102 | * support NV12/NV21_to_BGR(RGB), NV12/NV21_to_BGRA(RGBA),
103 | * BGR(RGB)and BGRA(RGBA) transform,
104 | * BGR(RGB)and RGB(BGR) transform,
105 | * BGR(RGB)and RGBA(BGRA) transform,
106 | * BGR(RGB)and GRAY transform,
107 | * BGRA(RGBA) and GRAY transform,
108 | * param src: input image data
109 | * param dst: output image data
110 | * param srcFormat: input image format, support: GRAY, NV12(NV21),
111 | * BGR(RGB) and BGRA(RGBA)
112 | * param dstFormat: output image format, support GRAY, BGR(RGB) and
113 | * BGRA(RGBA)
114 | * param srcw: input image width
115 | * param srch: input image height
116 | */
117 | void image_convert(const uint8_t* src,
118 | uint8_t* dst,
119 | ImageFormat srcFormat,
120 | ImageFormat dstFormat,
121 | int srcw,
122 | int srch);
123 |
124 | /*
125 | * image resize, use bilinear method
126 | * support image format: 1-channel image (egs: GRAY), 2-channel image (egs:
127 | * NV12, NV21), 3-channel (egs: BGR), 4-channel (egs: BGRA)
128 | * param src: input image data
129 | * param dst: output image data
130 | */
131 | void image_resize(const uint8_t* src, uint8_t* dst);
132 |
133 | /*
134 | * image resize, use bilinear method
135 | * support image format: 1-channel image (egs: GRAY), 2-channel image (egs:
136 | * NV12, NV21), 3-channel image (egs: BGR), 4-channel image (egs: BGRA)
137 | * param src: input image data
138 | * param dst: output image data
139 | * param srcw: input image width
140 | * param srch: input image height
141 | * param dstw: output image width
142 | * param dsth: output image height
143 | */
144 | void image_resize(const uint8_t* src,
145 | uint8_t* dst,
146 | ImageFormat srcFormat,
147 | int srcw,
148 | int srch,
149 | int dstw,
150 | int dsth);
151 |
152 | /*
153 | * image Rotate
154 | * support 90, 180 and 270 Rotate process
155 | * color format support 1-channel image, 3-channel image and 4-channel image
156 | * param src: input image data
157 | * param dst: output image data
158 | */
159 | void image_rotate(const uint8_t* src, uint8_t* dst);
160 |
161 | /*
162 | * image Rotate
163 | * support 90, 180 and 270 Rotate process
164 | * color format support 1-channel image, 3-channel image and 4-channel image
165 | * param src: input image data
166 | * param dst: output image data
167 | * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA)
168 | * param srcw: input image width
169 | * param srch: input image height
170 | * param degree: Rotate degree, support 90, 180 and 270
171 | */
172 | void image_rotate(const uint8_t* src,
173 | uint8_t* dst,
174 | ImageFormat srcFormat,
175 | int srcw,
176 | int srch,
177 | float degree);
178 |
179 | /*
180 | * image Flip
181 | * support X, Y and XY flip process
182 | * color format support 1-channel image, 3-channel image and 4-channel image
183 | * param src: input image data
184 | * param dst: output image data
185 | */
186 | void image_flip(const uint8_t* src, uint8_t* dst);
187 |
188 | /*
189 | * image Flip
190 | * support X, Y and XY flip process
191 | * color format support 1-channel image, 3-channel image and 4-channel image
192 | * param src: input image data
193 | * param dst: output image data
194 | * param srcFormat: input image format, support GRAY, BGR(RGB) and BGRA(RGBA)
195 | * param srcw: input image width
196 | * param srch: input image height
197 | * param flip_param: flip parameter, support X, Y and XY
198 | */
199 | void image_flip(const uint8_t* src,
200 | uint8_t* dst,
201 | ImageFormat srcFormat,
202 | int srcw,
203 | int srch,
204 | FlipParam flip_param);
205 |
206 | /*
207 | * change image data to tensor data
208 | * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
209 | * and
210 | * NCHW
211 | * param src: input image data
212 | * param dstTensor: output tensor data
213 | * param layout: output tensor layout,support NHWC and NCHW
214 | * param means: means of image
215 | * param scales: scales of image
216 | */
217 | void image_to_tensor(const uint8_t* src,
218 | Tensor* dstTensor,
219 | LayoutType layout,
220 | float* means,
221 | float* scales);
222 |
223 | /*
224 | * change image data to tensor data
225 | * support image format is GRAY, BGR(RGB) and BGRA(RGBA), Data layout is NHWC
226 | * and
227 | * NCHW
228 | * param src: input image data
229 | * param dstTensor: output tensor data
230 | * param srcFormat: input image format, support BGR(RGB) and BGRA(RGBA)
231 | * param srcw: input image width
232 | * param srch: input image height
233 | * param layout: output tensor layout,support NHWC and NCHW
234 | * param means: means of image
235 | * param scales: scales of image
236 | */
237 | void image_to_tensor(const uint8_t* src,
238 | Tensor* dstTensor,
239 | ImageFormat srcFormat,
240 | int srcw,
241 | int srch,
242 | LayoutType layout,
243 | float* means,
244 | float* scales);
245 |
246 | /*
247 | * image crop process
248 | * color format support 1-channel image, 3-channel image and 4-channel image
249 | * param src: input image data
250 | * param dst: output image data
251 | */
252 | void image_crop(const uint8_t* src,
253 | uint8_t* dst,
254 | ImageFormat srcFormat,
255 | int srcw,
256 | int srch,
257 | int left_x,
258 | int left_y,
259 | int dstw,
260 | int dsth);
261 |
262 | private:
263 | ImageFormat srcFormat_;
264 | ImageFormat dstFormat_;
265 | TransParam transParam_;
266 | };
267 | } // namespace cv
268 | } // namespace utils
269 | } // namespace lite
270 | } // namespace paddle
271 |
--------------------------------------------------------------------------------
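
A minimal usage sketch of the ImagePreprocess API declared above, assuming caller-provided buffers of the right sizes; the frame dimensions and normalization constants below are illustrative, not values taken from this repository:

    #include "paddle_image_preprocess.h"

    using namespace paddle::lite::utils::cv;

    // Convert an NV21 camera frame to RGB, resize it to the network input size,
    // and write it into a Paddle Lite tensor. All buffers are owned by the caller.
    void preprocess_frame(const uint8_t* nv21, uint8_t* rgb, uint8_t* resized,
                          paddle::lite_api::Tensor* dst) {
      TransParam tp;
      tp.iw = 1280; tp.ih = 720;   // input frame size (illustrative)
      tp.ow = 224;  tp.oh = 224;   // network input size (illustrative)
      tp.flip_param = X;
      tp.rotate_param = 0.f;

      ImagePreprocess prep(NV21, RGB, tp);

      prep.image_convert(nv21, rgb);      // NV21 -> RGB, using the sizes in tp
      prep.image_resize(rgb, resized);    // bilinear resize to ow x oh

      float means[3]  = {0.485f, 0.456f, 0.406f};           // illustrative values
      float scales[3] = {1/0.229f, 1/0.224f, 1/0.225f};
      prep.image_to_tensor(resized, dst, LayoutType::kNCHW, means, scales);
    }
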
/paddle/paddle_place.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #pragma once
16 | #include <set>
17 | #include <string>
18 |
19 | // Generic helper definitions for shared library support
20 | #if defined _WIN32 || defined __CYGWIN__
21 | #define PADDLE_LITE_HELPER_DLL_IMPORT __declspec(dllimport)
22 | #define PADDLE_LITE_HELPER_DLL_EXPORT __declspec(dllexport)
23 | #define PADDLE_LITE_HELPER_DLL_LOCAL
24 | #else
25 | #if __GNUC__ >= 4
26 | #define PADDLE_LITE_HELPER_DLL_IMPORT __attribute__((visibility("default")))
27 | #define PADDLE_LITE_HELPER_DLL_EXPORT __attribute__((visibility("default")))
28 | #else
29 | #define PADDLE_LITE_HELPER_DLL_IMPORT
30 | #define PADDLE_LITE_HELPER_DLL_EXPORT
31 | #endif
32 | #endif
33 |
34 | #ifdef LITE_ON_TINY_PUBLISH
35 | #define LITE_API PADDLE_LITE_HELPER_DLL_EXPORT
36 | #define LITE_API_IMPORT PADDLE_LITE_HELPER_DLL_IMPORT
37 | #else
38 | #define LITE_API
39 | #define LITE_API_IMPORT
40 | #endif
41 |
42 | namespace paddle {
43 | namespace lite_api {
44 |
45 | enum class TargetType : int {
46 | kUnk = 0,
47 | kHost = 1,
48 | kX86 = 2,
49 | kCUDA = 3,
50 | kARM = 4,
51 | kOpenCL = 5,
52 | kAny = 6, // any target
53 | kFPGA = 7,
54 | kNPU = 8,
55 | kXPU = 9,
56 | kBM = 10,
57 | kMLU = 11,
58 | kRKNPU = 12,
59 | kAPU = 13,
60 | kHuaweiAscendNPU = 14,
61 | kImaginationNNA = 15,
62 | NUM = 16, // number of fields.
63 | };
64 | enum class PrecisionType : int {
65 | kUnk = 0,
66 | kFloat = 1,
67 | kInt8 = 2,
68 | kInt32 = 3,
69 | kAny = 4, // any precision
70 | kFP16 = 5,
71 | kBool = 6,
72 | kInt64 = 7,
73 | kInt16 = 8,
74 | kUInt8 = 9,
75 | kFP64 = 10,
76 | NUM = 11, // number of fields.
77 | };
78 | enum class DataLayoutType : int {
79 | kUnk = 0,
80 | kNCHW = 1,
81 | kNHWC = 3,
82 | kImageDefault = 4, // for opencl image2d
83 | kImageFolder = 5, // for opencl image2d
84 | kImageNW = 6, // for opencl image2d
85 | kAny = 2, // any data layout
86 | NUM = 7, // number of fields.
87 | };
88 |
89 | typedef enum {
90 | LITE_POWER_HIGH = 0,
91 | LITE_POWER_LOW = 1,
92 | LITE_POWER_FULL = 2,
93 | LITE_POWER_NO_BIND = 3,
94 | LITE_POWER_RAND_HIGH = 4,
95 | LITE_POWER_RAND_LOW = 5
96 | } PowerMode;
97 |
98 | typedef enum {
99 | CL_TUNE_NONE = 0,
100 | CL_TUNE_RAPID = 1,
101 | CL_TUNE_NORMAL = 2,
102 | CL_TUNE_EXHAUSTIVE = 3
103 | } CLTuneMode;
104 |
105 | typedef enum {
106 | CL_PRECISION_AUTO = 0,
107 | CL_PRECISION_FP32 = 1,
108 | CL_PRECISION_FP16 = 2
109 | } CLPrecisionType;
110 |
111 | typedef enum { MLU_220 = 0, MLU_270 = 1 } MLUCoreVersion;
112 |
113 | enum class ActivationType : int {
114 | kIndentity = 0,
115 | kRelu = 1,
116 | kRelu6 = 2,
117 | kPRelu = 3,
118 | kLeakyRelu = 4,
119 | kSigmoid = 5,
120 | kTanh = 6,
121 | kSwish = 7,
122 | kExp = 8,
123 | kAbs = 9,
124 | kHardSwish = 10,
125 | kReciprocal = 11,
126 | kThresholdedRelu = 12,
127 | kElu = 13,
128 | kHardSigmoid = 14,
129 | kLog = 15,
130 | kSigmoid_v2 = 16,
131 | kTanh_v2 = 17,
132 | NUM = 18,
133 | };
134 |
135 | static size_t PrecisionTypeLength(PrecisionType type) {
136 | switch (type) {
137 | case PrecisionType::kFloat:
138 | return 4;
139 | case PrecisionType::kFP64:
140 | return 8;
141 | case PrecisionType::kUInt8:
142 | return 1;
143 | case PrecisionType::kInt8:
144 | return 1;
145 | case PrecisionType::kInt32:
146 | return 4;
147 | case PrecisionType::kInt64:
148 | return 8;
149 | case PrecisionType::kFP16:
150 | return 2;
151 | case PrecisionType::kInt16:
152 | return 2;
153 | default:
154 | return 0;
155 | }
156 | }
157 |
158 | enum class QuantType : int {
159 | QUANT_INT8,
160 | QUANT_INT16,
161 | };
162 |
163 | template <typename T>
164 | struct PrecisionTypeTrait {
165 | constexpr static PrecisionType Type() { return PrecisionType::kUnk; }
166 | };
167 |
168 | #define _ForEachPrecisionTypeHelper(callback, cpp_type, precision_type) \
169 | callback(cpp_type, ::paddle::lite_api::PrecisionType::precision_type);
170 |
171 | #define _ForEachPrecisionType(callback) \
172 | _ForEachPrecisionTypeHelper(callback, bool, kBool); \
173 | _ForEachPrecisionTypeHelper(callback, float, kFloat); \
174 | _ForEachPrecisionTypeHelper(callback, double, kFP64); \
175 | _ForEachPrecisionTypeHelper(callback, uint8_t, kUInt8); \
176 | _ForEachPrecisionTypeHelper(callback, int8_t, kInt8); \
177 | _ForEachPrecisionTypeHelper(callback, int16_t, kInt16); \
178 | _ForEachPrecisionTypeHelper(callback, int, kInt32); \
179 | _ForEachPrecisionTypeHelper(callback, int64_t, kInt64);
180 |
181 | #define DefinePrecisionTypeTrait(cpp_type, precision_type) \
182 | template <> \
183 | struct PrecisionTypeTrait<cpp_type> { \
184 | constexpr static PrecisionType Type() { return precision_type; } \
185 | }
186 |
187 | _ForEachPrecisionType(DefinePrecisionTypeTrait);
188 |
189 | #undef _ForEachPrecisionTypeHelper
190 | #undef _ForEachPrecisionType
191 | #undef DefinePrecisionTypeTrait
192 |
193 | #define TARGET(item__) paddle::lite_api::TargetType::item__
194 | #define PRECISION(item__) paddle::lite_api::PrecisionType::item__
195 | #define DATALAYOUT(item__) paddle::lite_api::DataLayoutType::item__
196 |
197 | const std::string& ActivationTypeToStr(ActivationType act);
198 |
199 | const std::string& TargetToStr(TargetType target);
200 |
201 | const std::string& PrecisionToStr(PrecisionType precision);
202 |
203 | const std::string& DataLayoutToStr(DataLayoutType layout);
204 |
205 | const std::string& TargetRepr(TargetType target);
206 |
207 | const std::string& PrecisionRepr(PrecisionType precision);
208 |
209 | const std::string& DataLayoutRepr(DataLayoutType layout);
210 |
211 | // Get a set of all the elements represented by the target.
212 | std::set<TargetType> ExpandValidTargets(TargetType target = TARGET(kAny));
213 |
214 | // Get a set of all the elements represented by the precision.
215 | std::set<PrecisionType> ExpandValidPrecisions(
216 | PrecisionType precision = PRECISION(kAny));
217 |
218 | // Get a set of all the elements represented by the layout.
219 | std::set<DataLayoutType> ExpandValidLayouts(
220 | DataLayoutType layout = DATALAYOUT(kAny));
221 |
222 | /*
223 | * Place specifies the execution context of a Kernel or input/output for a
224 | * kernel. It is used to make the analysis of the MIR more clear and accurate.
225 | */
226 | struct LITE_API Place {
227 | TargetType target{TARGET(kUnk)};
228 | PrecisionType precision{PRECISION(kUnk)};
229 | DataLayoutType layout{DATALAYOUT(kUnk)};
230 | int16_t device{0}; // device ID
231 |
232 | Place() = default;
233 | Place(TargetType target,
234 | PrecisionType precision = PRECISION(kFloat),
235 | DataLayoutType layout = DATALAYOUT(kNCHW),
236 | int16_t device = 0)
237 | : target(target), precision(precision), layout(layout), device(device) {}
238 |
239 | bool is_valid() const {
240 | return target != TARGET(kUnk) && precision != PRECISION(kUnk) &&
241 | layout != DATALAYOUT(kUnk);
242 | }
243 |
244 | size_t hash() const;
245 |
246 | bool operator==(const Place& other) const {
247 | return target == other.target && precision == other.precision &&
248 | layout == other.layout && device == other.device;
249 | }
250 |
251 | bool operator!=(const Place& other) const { return !(*this == other); }
252 |
253 | friend bool operator<(const Place& a, const Place& b);
254 |
255 | std::string DebugString() const;
256 | };
257 |
258 | } // namespace lite_api
259 | } // namespace paddle
260 |
--------------------------------------------------------------------------------
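
A small sketch of how the types in paddle_place.h compose, based only on the declarations above; the static_assert exercises the PrecisionTypeTrait specializations generated by the macro block at the end of the header:

    #include "paddle_place.h"

    using namespace paddle::lite_api;

    int place_demo() {
      // kFloat / kNCHW are the constructor defaults, so these two describe the same place.
      Place arm_fp32(TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW));
      Place arm_default(TARGET(kARM));
      bool same = (arm_fp32 == arm_default);                   // true

      size_t bytes = PrecisionTypeLength(arm_fp32.precision);  // 4 for kFloat

      // PrecisionTypeTrait maps C++ types to PrecisionType values at compile time.
      static_assert(PrecisionTypeTrait<float>::Type() == PrecisionType::kFloat,
                    "float must map to kFloat");
      return same ? (int) bytes : 0;
    }
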
/src/dajtensor.cpp:
--------------------------------------------------------------------------------
1 |
2 | #include "dajtensor.h"
3 | #include "dajutil.h"
4 |
5 | namespace dajnn {
6 |
7 | Tensor::Tensor() {
8 | _val_ = nullptr;
9 | span = 0;
10 | releasable = true;
11 |
12 | #ifdef TRACE_MEMORY_LEAK
13 | push_tensor_trace(this);
14 | #endif
15 | }
16 |
17 | Tensor::~Tensor() {
18 | if (releasable && _val_) free(_val_);
19 |
20 | #ifdef TRACE_MEMORY_LEAK
21 | pop_tensor_trace(this);
22 | #endif
23 | }
24 |
25 | void Tensor::reshape(vector<uint>* shape) {
26 | exit_if(span != get_span(shape), "unable to reshape from %s to %s",
27 | get_shape_str(&this->shape).c_str(), get_shape_str(shape).c_str());
28 | this->shape = *shape;
29 | }
30 |
31 | void Tensor::reshape(uint dim1, ...) {
32 | if (dim1 == END_DIM) return;
33 |
34 | vector<uint> running_shape;
35 | running_shape.push_back(dim1);
36 |
37 | va_list ap;
38 | va_start(ap, dim1);
39 | uint adim = va_arg(ap, uint);
40 |
41 | while (adim != END_DIM) {
42 | running_shape.push_back(adim);
43 | adim = va_arg(ap, uint);
44 | }
45 | va_end(ap);
46 | reshape(&running_shape);
47 | }
48 |
49 | bool Tensor::is_shape(vector<uint>* shape) {
50 | if (this->shape.size() != shape->size()) return false;
51 |
52 | for (uint i = 0; i < shape->size(); ++i) {
53 | if (this->shape[i] != shape->at(i)) return false;
54 | }
55 | return true;
56 | }
57 |
58 | bool Tensor::is_shape(uint dim1, ...) {
59 | if (dim1 == END_DIM) return shape.empty();
60 | if (shape.empty()) return false;
61 | if (shape[0] != dim1) return false;
62 |
63 |
64 | va_list ap;
65 | va_start(ap, dim1);
66 | uint adim = va_arg(ap, uint);
67 |
68 | for (uint i = 1; i < shape.size(); ++i) {
69 | if (shape[i] != adim) return false;
70 | adim = va_arg(ap, uint);
71 | }
72 | if (adim != END_DIM) return false;
73 | va_end(ap);
74 | return true;
75 | }
76 |
77 | void Tensor::set_releasable(bool releasable) {
78 | this->releasable = releasable;
79 | }
80 |
81 | void* Tensor::_init_(Tensor* tensor, bool copy_val) {
82 | exit_if(!tensor, "cannot clone tensor from empty tensor");
83 | shape = tensor->shape;
84 | span = get_span(&shape);
85 |
86 | if (copy_val) {
87 | _val_ = malloc(4 * span);
88 | memcpy(_val_, tensor->_val_, 4 * span);
89 | } else {
90 | _val_ = tensor->_val_;
91 | }
92 | return _val_;
93 | }
94 |
95 | void* Tensor::_init_(vector<uint>* shape, void* val, bool copy_val) {
96 | this->shape = *shape;
97 | span = get_span(shape);
98 |
99 | if (val && copy_val) {
100 | _val_ = malloc(4 * span);
101 | memcpy(_val_, val, 4 * span);
102 | } else if (!val) {
103 | _val_ = malloc(4 * span);
104 | } else {
105 | _val_ = val;
106 | }
107 | return _val_;
108 | }
109 |
110 | void* Tensor::_init_(void* val, bool copy_val, uint dim1, va_list ap) {
111 | if (dim1 == END_DIM) return nullptr;
112 | vector<uint> running_shape;
113 |
114 | running_shape.push_back(dim1);
115 | uint adim = va_arg(ap, uint);
116 |
117 | while (adim != END_DIM) {
118 | running_shape.push_back(adim);
119 | exit_if(running_shape.size() == MAX_TENSOR_DIM,
120 | "tensor shape with too many dimensions (%s) : did you forget to end with END_DIM?",
121 | get_shape_str(&running_shape).c_str());
122 | adim = va_arg(ap, uint);
123 | }
124 | return _init_(&running_shape, val, copy_val);
125 | }
126 |
127 | void* Tensor::_init_(ByteStream* stream) {
128 | _read_meta_(stream);
129 | _val_ = malloc(4 * span);
130 | stream->read(_val_, 4, span);
131 | return _val_;
132 | }
133 |
134 | void Tensor::_read_meta_(ByteStream* stream) {
135 | unsigned char len = 0;
136 | stream->read(&len, 1, 1);
137 |
138 | for (unsigned char i = 0; i < len; ++i) {
139 | unsigned int dim = 0;
140 | stream->read(&dim, 4, 1);
141 | shape.push_back(dim);
142 | }
143 | span = get_span(&shape);
144 | }
145 |
146 | void Tensor::_write_meta_(ByteStream* stream) {
147 | unsigned char len = (unsigned char) shape.size();
148 | stream->write(&len, 1, 1);
149 |
150 | for (unsigned char i = 0; i < len; ++i) {
151 | stream->write(&shape[i], 4, 1);
152 | }
153 | }
154 |
155 | void Tensor::_write_val_(ByteStream* stream) {
156 | stream->write(_val_, 4, span);
157 | }
158 |
159 | void Tensor::_save_(ByteStream* stream) {
160 | _write_meta_(stream);
161 | _write_val_(stream);
162 | }
163 |
164 | ITensor::ITensor() : Tensor() {
165 | val = nullptr;
166 | }
167 |
168 | ITensor::ITensor(ITensor* tensor, bool copy_val) : ITensor() {
169 | this->val = (int*) _init_(tensor, copy_val);
170 | }
171 |
172 | ITensor::ITensor(vector<uint>* shape, int* val, bool copy_val) : ITensor() {
173 | this->val = (int*) _init_(shape, val, copy_val);
174 | }
175 |
176 | ITensor::ITensor(int* val, bool copy_val, uint dim1, ...) : ITensor() {
177 | va_list ap;
178 | va_start(ap, dim1);
179 | this->val = (int*) _init_(val, copy_val, dim1, ap);
180 | va_end(ap);
181 | }
182 |
183 | ITensor::ITensor(uint dim1, ...) : ITensor() {
184 | va_list ap;
185 | va_start(ap, dim1);
186 | this->val = (int*) _init_(nullptr, false, dim1, ap);
187 | va_end(ap);
188 | }
189 |
190 | ITensor::ITensor(ByteStream* stream) : ITensor() {
191 | char compressed = 0;
192 | stream->read(&compressed, 1, 1);
193 |
194 | if (compressed) {
195 | _read_meta_(stream);
196 | short sh = 0;
197 | _val_ = val = (int*) malloc(span * 4);
198 |
199 | for (int* vp = val; vp < val + span; ++vp) {
200 | stream->read(&sh, 2, 1);
201 | *vp = sh;
202 | }
203 | } else {
204 | val = (int*) _init_(stream);
205 | }
206 | }
207 |
208 | void ITensor::save(ByteStream* stream, bool compressed) {
209 | char flag = compressed ? 1 : 0;
210 | stream->write(&flag, 1, 1);
211 |
212 | if (compressed) {
213 | _write_meta_(stream);
214 | short sh = 0;
215 |
216 | for (int* vp = val; vp < val + span; ++vp) {
217 | sh = (short) *vp;
218 | stream->write(&sh, 2, 1);
219 | }
220 | } else {
221 | _save_(stream);
222 | }
223 | }
224 |
225 | int ITensor::compare(ITensor* tensor) {
226 | return compare(tensor->val, tensor->span);
227 | }
228 |
229 | int ITensor::compare(int* val, uint len) {
230 | uint comp_len = MIN(span, len);
231 | int max_abs = 0;
232 | int* vp1 = this->val;
233 | int* vp2 = val;
234 |
235 | for (uint i = 0; i < comp_len; ++i, ++vp1, ++vp2) {
236 | int d = abs(*vp1 - *vp2);
237 | if (d > max_abs) max_abs = d;
238 | }
239 | return max_abs;
240 | }
241 |
242 | int ITensor::get_max() {
243 | return dajnn::get_max(val, span);
244 | }
245 |
246 | int ITensor::get_min() {
247 | return dajnn::get_min(val, span);
248 | }
249 |
250 | FTensor::FTensor() : Tensor() {
251 | val = nullptr;
252 | }
253 |
254 | FTensor::FTensor(ITensor* tensor) {
255 | shape = tensor->shape;
256 | span = get_span(&shape);
257 |
258 | _val_ = val = (float*) malloc(span * 4);
259 | int* tp = tensor->val;
260 |
261 | for (float* vp = val; vp < val + span; ++vp, ++tp) {
262 | *vp = (float) *tp;
263 | }
264 | }
265 |
266 | FTensor::FTensor(FTensor* tensor, bool copy_val) : FTensor() {
267 | this->val = (float*) _init_(tensor, copy_val);
268 | }
269 |
270 | FTensor::FTensor(vector<uint>* shape, float* val, bool copy_val) : FTensor() {
271 | this->val = (float*) _init_(shape, val, copy_val);
272 | }
273 |
274 | FTensor::FTensor(float* val, bool copy_val, uint dim1, ...) : FTensor() {
275 | va_list ap;
276 | va_start(ap, dim1);
277 | this->val = (float*) _init_(val, copy_val, dim1, ap);
278 | va_end(ap);
279 | }
280 |
281 | FTensor::FTensor(uint dim1, ...) : FTensor() {
282 | va_list ap;
283 | va_start(ap, dim1);
284 | this->val = (float*) _init_(nullptr, false, dim1, ap);
285 | va_end(ap);
286 | }
287 |
288 | FTensor::FTensor(ByteStream* stream) : FTensor() {
289 | char compressed = 0;
290 | stream->read(&compressed, 1, 1);
291 |
292 | if (compressed) {
293 | _read_meta_(stream);
294 |
295 | float min_v = 0, max_v = 0;
296 | short sh = 0;
297 |
298 | stream->read(&min_v, 4, 1);
299 | stream->read(&max_v, 4, 1);
300 | _val_ = val = (float*) malloc(span * 4);
301 |
302 | for (float* vp = val; vp < val + span; ++vp) {
303 | stream->read(&sh, 2, 1);
304 | *vp = min_v + (max_v - min_v) * (1 + (float) sh / SHRT_MAX) / 2;
305 | }
306 | } else {
307 | val = (float*) _init_(stream);
308 | }
309 | }
310 |
311 | void FTensor::save(ByteStream* stream, bool compressed) {
312 | char flag = compressed ? 1 : 0;
313 | stream->write(&flag, 1, 1);
314 |
315 | if (compressed) {
316 | _write_meta_(stream);
317 |
318 | float min_v = get_min();
319 | float max_v = get_max();
320 | short sh = 0;
321 |
322 | stream->write(&min_v, 4, 1);
323 | stream->write(&max_v, 4, 1);
324 |
325 | for (float* vp = val; vp < val + span; ++vp) {
326 | sh = (short) ((2 * (*vp - min_v) / (max_v - min_v) - 1) * SHRT_MAX);
327 | stream->write(&sh, 2, 1);
328 | }
329 | } else {
330 | _save_(stream);
331 | }
332 | }
333 |
334 | void FTensor::print(uint start, uint end) {
335 | if (start == END_DIM) start = 0;
336 | if (end == END_DIM) end = span;
337 |
338 | for (uint i = start; i < end; ++i) {
339 | printf("%.8f,", val[i]);
340 | }
341 | }
342 |
343 | float FTensor::compare(FTensor* tensor) {
344 | return compare(tensor->val, tensor->span);
345 | }
346 |
347 | float FTensor::compare(float* val, uint len) {
348 | uint comp_len = MIN(span, len);
349 | float max_abs = 0;
350 | float* vp1 = this->val;
351 | float* vp2 = val;
352 |
353 | for (uint i = 0; i < comp_len; ++i, ++vp1, ++vp2) {
354 | float d = fabsf(*vp1 - *vp2);
355 | if (d > max_abs) max_abs = d;
356 | }
357 | return max_abs;
358 | }
359 |
360 | float FTensor::get_max() {
361 | return dajnn::get_max(val, span);
362 | }
363 |
364 | float FTensor::get_min() {
365 | return dajnn::get_min(val, span);
366 | }
367 |
368 | ByteStream::ByteStream() {
369 | buff = nullptr;
370 | fp = nullptr;
371 | pointer = 0;
372 | }
373 |
374 | ByteStream::ByteStream(const void* buff) : ByteStream() {
375 | this->buff = (const char*) buff;
376 | }
377 |
378 | ByteStream::ByteStream(FILE* fp) : ByteStream() {
379 | this->fp = fp;
380 | }
381 |
382 | string ByteStream::read_str() {
383 | string str;
384 | char t = 0;
385 |
386 | for (uint i = 0; i < MAX_MODEL_STR; ++i) {
387 | if (!read(&t, 1, 1)) break;
388 | if (!t) break;
389 | str += t;
390 | }
391 | return str;
392 | }
393 |
394 | uint ByteStream::read(void* dst, int ele_size, int ele_count) {
395 | if (buff) {
396 | int len = ele_size * ele_count;
397 | memcpy(dst, &buff[pointer], len);
398 | pointer += len;
399 | return ele_count;
400 | } else if (fp) {
401 | return (uint) fread(dst, ele_size, ele_count, fp);
402 | } else {
403 | return 0;
404 | }
405 | }
406 |
407 | void ByteStream::write(void* src, int ele_size, int ele_count) {
408 | if (buff) {
409 | int len = ele_size * ele_count;
410 | memcpy((char*) &buff[pointer], src, len);
411 | pointer += len;
412 | } else if (fp) {
413 | fwrite(src, ele_size, ele_count, fp);
414 | }
415 | }
416 |
417 | int ByteStream::seek() {
418 | return pointer;
419 | }
420 |
421 | }
422 |
--------------------------------------------------------------------------------
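
The compressed branch of FTensor::save / FTensor(ByteStream*) above stores each float as a signed 16-bit code, linear between the tensor's min and max. A standalone sketch of that round trip (the helper names here are illustrative, not part of the library):

    #include <climits>
    #include <cstdio>

    // Mirrors the compressed branch of FTensor::save above.
    short encode_f16(float v, float min_v, float max_v) {
      return (short) ((2 * (v - min_v) / (max_v - min_v) - 1) * SHRT_MAX);
    }

    // Mirrors the compressed branch of FTensor::FTensor(ByteStream*) above.
    float decode_f16(short sh, float min_v, float max_v) {
      return min_v + (max_v - min_v) * (1 + (float) sh / SHRT_MAX) / 2;
    }

    int main() {
      // With min = -1 and max = 1, 0.25f encodes to 8191 and decodes to ~0.249977;
      // the worst-case error is about one code step, (max - min) / (2 * SHRT_MAX).
      short code = encode_f16(0.25f, -1.f, 1.f);
      std::printf("%d -> %.6f\n", code, decode_f16(code, -1.f, 1.f));
      return 0;
    }

Note also that every variadic shape list in dajtensor.cpp is terminated with END_DIM: for example, FTensor t(1, 64, 7, 7, END_DIM); followed by t.reshape(1, 64 * 49, END_DIM); keeps the same span of 3136 floats, which is exactly what Tensor::reshape checks.
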
/paddle/paddle_api_2.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | /*
16 | * This file defines PaddlePredictor, the api for lite. It supports multiple
17 | * hardware including ARM, X86, OpenCL, CUDA and so on.
18 | */
19 |
20 | #ifndef PADDLE_LITE_API_2_H_ // NOLINT
21 | #define PADDLE_LITE_API_2_H_
22 | #include